├── .DS_Store ├── .gitattributes ├── .gitignore ├── LICENSE ├── Layer_Drop.svg ├── README.md ├── SECURITY.md ├── requirements.txt ├── scripts ├── .DS_Store ├── benchmark │ ├── benchmark_lm_eval.sh │ └── benchmark_speed.sh ├── dropping │ ├── block_drop.sh │ ├── layer_drop.sh │ ├── layer_drop_iterative.sh │ └── layer_drop_joint.sh └── quantization │ ├── awq.sh │ └── gptq.sh ├── setup.py └── src ├── .DS_Store ├── __init__.py ├── benchmark_speed.py ├── compress.py └── llmtuner ├── .DS_Store ├── __init__.py ├── compression ├── .DS_Store ├── __init__.py ├── prune │ ├── __init__.py │ ├── block_drop.py │ ├── io.py │ ├── layer_drop.py │ ├── models │ │ ├── __init__.py │ │ ├── configuration_deepseek.py │ │ ├── configuration_dropped_baichuan.py │ │ ├── configuration_dropped_gemma2.py │ │ ├── configuration_dropped_llama.py │ │ ├── configuration_dropped_mistral.py │ │ ├── modeling_dropped_baichuan.py │ │ ├── modeling_dropped_deepseek.py │ │ ├── modeling_dropped_gemma2.py │ │ ├── modeling_dropped_llama.py │ │ └── modeling_dropped_mistral.py │ ├── utils.py │ ├── workflow.py │ └── wrapper.py ├── quantization │ ├── AutoAWQ │ │ ├── AutoAWQ_kernels │ │ │ ├── LICENSE │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── awq_ext │ │ │ │ ├── attention │ │ │ │ │ ├── cuda_bf16_fallbacks.cuh │ │ │ │ │ ├── cuda_bf16_wrapper.h │ │ │ │ │ ├── decoder_masked_multihead_attention.cu │ │ │ │ │ ├── decoder_masked_multihead_attention.h │ │ │ │ │ ├── decoder_masked_multihead_attention_template.hpp │ │ │ │ │ ├── decoder_masked_multihead_attention_utils.h │ │ │ │ │ ├── ft_attention.cpp │ │ │ │ │ └── ft_attention.h │ │ │ │ ├── exllama │ │ │ │ │ ├── cu_compat.cuh │ │ │ │ │ ├── cuda_buffers.cu │ │ │ │ │ ├── cuda_buffers.cuh │ │ │ │ │ ├── cuda_func │ │ │ │ │ │ ├── column_remap.cu │ │ │ │ │ │ ├── column_remap.cuh │ │ │ │ │ │ ├── q4_matmul.cu │ │ │ │ │ │ ├── q4_matmul.cuh │ │ │ │ │ │ ├── q4_matrix.cu │ │ │ │ │ │ └── q4_matrix.cuh │ │ │ │ │ ├── exllama_ext.cpp │ │ │ │ │ ├── hip_compat.cuh │ │ │ │ │ ├── matrix.cuh │ │ │ │ │ ├── tuning.h │ │ │ │ │ └── util.cuh │ │ │ │ ├── exllamav2 │ │ │ │ │ ├── config.h │ │ │ │ │ ├── cpp │ │ │ │ │ │ └── util.h │ │ │ │ │ ├── cuda │ │ │ │ │ │ ├── compat.cuh │ │ │ │ │ │ ├── compat_gemm.cuh │ │ │ │ │ │ ├── matrix_view.cuh │ │ │ │ │ │ ├── q_gemm.cu │ │ │ │ │ │ ├── q_gemm.cuh │ │ │ │ │ │ ├── q_gemm_kernel.cuh │ │ │ │ │ │ ├── q_gemm_kernel_gptq.cuh │ │ │ │ │ │ ├── q_matrix.cu │ │ │ │ │ │ ├── q_matrix.cuh │ │ │ │ │ │ ├── quant │ │ │ │ │ │ │ ├── qdq_2.cuh │ │ │ │ │ │ │ ├── qdq_3.cuh │ │ │ │ │ │ │ ├── qdq_4.cuh │ │ │ │ │ │ │ ├── qdq_5.cuh │ │ │ │ │ │ │ ├── qdq_6.cuh │ │ │ │ │ │ │ ├── qdq_8.cuh │ │ │ │ │ │ │ └── qdq_util.cuh │ │ │ │ │ │ └── util.cuh │ │ │ │ │ └── ext.cpp │ │ │ │ ├── layernorm │ │ │ │ │ ├── layernorm.cu │ │ │ │ │ ├── layernorm.h │ │ │ │ │ └── reduction.cuh │ │ │ │ ├── position_embedding │ │ │ │ │ ├── pos_encoding.h │ │ │ │ │ └── pos_encoding_kernels.cu │ │ │ │ ├── pybind_awq.cpp │ │ │ │ ├── pybind_awq_ft.cpp │ │ │ │ ├── pybind_awq_v2.cpp │ │ │ │ ├── quantization │ │ │ │ │ ├── dequantize.cuh │ │ │ │ │ ├── gemm_cuda.h │ │ │ │ │ ├── gemm_cuda_gen.cu │ │ │ │ │ ├── gemv_cuda.cu │ │ │ │ │ └── gemv_cuda.h │ │ │ │ ├── quantization_new │ │ │ │ │ ├── dequantize.cuh │ │ │ │ │ ├── gemm │ │ │ │ │ │ ├── gemm_cuda.cu │ │ │ │ │ │ ├── gemm_cuda.h │ │ │ │ │ │ └── semaphore.h │ │ │ │ │ └── gemv │ │ │ │ │ │ ├── gemv_cuda.cu │ │ │ │ │ │ └── gemv_cuda.h │ │ │ │ └── vllm │ │ │ │ │ ├── activation.cu │ │ │ │ │ ├── activation.h │ │ │ │ │ ├── moe_alig_block.cu │ │ │ │ │ ├── moe_alig_block.h │ │ │ │ │ ├── 
topk_softmax_kernels.cu │ │ │ │ │ └── topk_softmax_kernels.h │ │ │ ├── scripts │ │ │ │ └── download_wheels.sh │ │ │ └── setup.py │ │ ├── LICENSE │ │ ├── README.md │ │ ├── __init__.py │ │ ├── awq │ │ │ ├── __init__.py │ │ │ ├── evaluation │ │ │ │ ├── __init__.py │ │ │ │ ├── eval_utils.py │ │ │ │ ├── humaneval_utils.py │ │ │ │ └── kl_divergence.py │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ ├── _config.py │ │ │ │ ├── aquila.py │ │ │ │ ├── auto.py │ │ │ │ ├── baichuan.py │ │ │ │ ├── base.py │ │ │ │ ├── bloom.py │ │ │ │ ├── deepseek.py │ │ │ │ ├── deepseek_moe │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── configuration_deepseek.py │ │ │ │ │ └── modeling_deepseek.py │ │ │ │ ├── falcon.py │ │ │ │ ├── gemma.py │ │ │ │ ├── gpt_bigcode.py │ │ │ │ ├── gpt_neox.py │ │ │ │ ├── gptj.py │ │ │ │ ├── llama.py │ │ │ │ ├── llava.py │ │ │ │ ├── mistral.py │ │ │ │ ├── mixtral.py │ │ │ │ ├── mpt.py │ │ │ │ ├── opt.py │ │ │ │ ├── qwen.py │ │ │ │ ├── qwen2.py │ │ │ │ ├── stablelm.py │ │ │ │ ├── starcoder2.py │ │ │ │ └── yi.py │ │ │ ├── modules │ │ │ │ ├── __init__.py │ │ │ │ ├── act.py │ │ │ │ ├── fused │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── attn.py │ │ │ │ │ ├── block.py │ │ │ │ │ ├── cache.py │ │ │ │ │ ├── mlp.py │ │ │ │ │ ├── model.py │ │ │ │ │ ├── moe.py │ │ │ │ │ └── norm.py │ │ │ │ └── linear │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── exllama.py │ │ │ │ │ ├── exllamav2.py │ │ │ │ │ ├── gemm.py │ │ │ │ │ ├── gemv.py │ │ │ │ │ ├── gemv_fast.py │ │ │ │ │ └── marlin.py │ │ │ ├── quantize │ │ │ │ ├── __init__.py │ │ │ │ ├── quantizer.py │ │ │ │ └── scale.py │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── calib_data.py │ │ │ │ ├── fused_utils.py │ │ │ │ ├── module.py │ │ │ │ ├── packing_utils.py │ │ │ │ ├── parallel.py │ │ │ │ ├── quant_utils.py │ │ │ │ └── utils.py │ │ ├── quantize.py │ │ └── setup.py │ ├── AutoGPTQ │ │ ├── LICENSE │ │ ├── README.md │ │ ├── __init__.py │ │ ├── auto_gptq │ │ │ ├── __init__.py │ │ │ ├── eval_tasks │ │ │ │ ├── __init__.py │ │ │ │ ├── _base.py │ │ │ │ ├── _utils │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── classification_utils.py │ │ │ │ │ └── generation_utils.py │ │ │ │ ├── language_modeling_task.py │ │ │ │ ├── sequence_classification_task.py │ │ │ │ └── text_summarization_task.py │ │ │ ├── modeling │ │ │ │ ├── __init__.py │ │ │ │ ├── _base.py │ │ │ │ ├── _const.py │ │ │ │ ├── _utils.py │ │ │ │ ├── auto.py │ │ │ │ ├── baichuan.py │ │ │ │ ├── bloom.py │ │ │ │ ├── codegen.py │ │ │ │ ├── decilm.py │ │ │ │ ├── deepseek.py │ │ │ │ ├── gemma.py │ │ │ │ ├── gpt2.py │ │ │ │ ├── gpt_bigcode.py │ │ │ │ ├── gpt_neox.py │ │ │ │ ├── gptj.py │ │ │ │ ├── internlm.py │ │ │ │ ├── llama.py │ │ │ │ ├── longllama.py │ │ │ │ ├── mistral.py │ │ │ │ ├── mixtral.py │ │ │ │ ├── moss.py │ │ │ │ ├── opt.py │ │ │ │ ├── qwen.py │ │ │ │ ├── qwen2.py │ │ │ │ ├── rw.py │ │ │ │ ├── stablelmepoch.py │ │ │ │ ├── xverse.py │ │ │ │ └── yi.py │ │ │ ├── nn_modules │ │ │ │ ├── __init__.py │ │ │ │ ├── _fused_base.py │ │ │ │ ├── fused_gptj_attn.py │ │ │ │ ├── fused_llama_attn.py │ │ │ │ ├── fused_llama_mlp.py │ │ │ │ ├── qlinear │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── qlinear_cuda.py │ │ │ │ │ ├── qlinear_cuda_old.py │ │ │ │ │ ├── qlinear_exllama.py │ │ │ │ │ ├── qlinear_exllamav2.py │ │ │ │ │ ├── qlinear_marlin.py │ │ │ │ │ ├── qlinear_qigen.py │ │ │ │ │ └── qlinear_triton.py │ │ │ │ └── triton_utils │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── custom_autotune.py │ │ │ │ │ ├── kernels.py │ │ │ │ │ └── mixin.py │ │ │ ├── quantization │ │ │ │ ├── __init__.py │ │ │ │ ├── gptq.py │ │ │ │ └── quantizer.py │ │ │ └── utils │ │ │ │ ├── 
__init__.py │ │ │ │ ├── data_utils.py │ │ │ │ ├── exllama_utils.py │ │ │ │ ├── import_utils.py │ │ │ │ ├── marlin_utils.py │ │ │ │ ├── modeling_utils.py │ │ │ │ ├── peft_utils.py │ │ │ │ └── perplexity_utils.py │ │ ├── datautils.py │ │ ├── gptq.py │ │ ├── modelutils.py │ │ ├── quant.py │ │ ├── quantize.py │ │ ├── setup_cuda.py │ │ └── test_kernel.py │ └── __init__.py ├── tuner.py └── utils.py ├── data ├── __init__.py ├── aligner.py ├── c4_demo.json ├── c4_train.json ├── c4_val.json ├── dataset_info.json ├── formatter.py ├── loader.py ├── parser.py ├── preprocess.py ├── template.py ├── test_data.py └── utils.py ├── extras ├── __init__.py ├── callbacks.py ├── constants.py ├── logging.py ├── misc.py ├── packages.py ├── patches │ ├── __init__.py │ ├── llama_patch.py │ └── mixtral_patch.py └── ploting.py ├── hparams ├── __init__.py ├── data_args.py ├── evaluation_args.py ├── finetuning_args.py ├── generating_args.py ├── model_args.py ├── parser.py └── pruning_args.py └── model ├── __init__.py ├── adapter.py ├── loader.py ├── patcher.py └── utils.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/.DS_Store -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 5 | Use this section to tell people about which versions of your project are 6 | currently being supported with security updates. 7 | 8 | | Version | Supported | 9 | | ------- | ------------------ | 10 | | 5.1.x | :white_check_mark: | 11 | | 5.0.x | :x: | 12 | | 4.0.x | :white_check_mark: | 13 | | < 4.0 | :x: | 14 | 15 | ## Reporting a Vulnerability 16 | 17 | Use this section to tell people how to report a vulnerability. 18 | 19 | Tell them where to go, how often they can expect to get an update on a 20 | reported vulnerability, what to expect if the vulnerability is accepted or 21 | declined, etc. 
22 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.13.1 2 | transformers>=4.38.1 3 | datasets>=2.14.3 4 | accelerate>=0.21.0 5 | peft>=0.8.2 6 | trl>=0.7.6 7 | gradio>=3.38.0,<4.0.0 8 | scipy 9 | einops 10 | sentencepiece 11 | protobuf 12 | jieba 13 | rouge-chinese 14 | nltk 15 | uvicorn 16 | pydantic 17 | fastapi 18 | sse-starlette 19 | matplotlib 20 | -------------------------------------------------------------------------------- /scripts/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/scripts/.DS_Store -------------------------------------------------------------------------------- /scripts/benchmark/benchmark_lm_eval.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | port="21804" 4 | GPUs="0,1,2,3,4,5,6,7" 5 | 6 | # Taking mistralai/Mistral-7B-v0.1 as an example. 7 | model_names=("mistral") # The model to be compressed. 8 | drop_modules=("mlp" "attn" "block") # The modules to be dropped. 9 | drop_nums=("4" "8") # The number of dropped modules. 10 | 11 | tasks=("boolq" "rte" "openbookqa" "piqa" "mmlu" "winogrande" "gsm8k" "hellaswag" "arc_challenge") 12 | num_fewshots=("0" "0" "0" "0" "5" "5" "5" "10" "25") 13 | 14 | for model_name in "${model_names[@]}" 15 | do 16 | # Download the model to a local directory. 17 | git lfs install 18 | git clone https://huggingface.co/mistralai/Mistral-7B-v0.1 19 | mv Mistral-7B-v0.1 ./"$model_name"_model 20 | 21 | for drop_module in "${drop_modules[@]}" 22 | do 23 | for drop_num in "${drop_nums[@]}" 24 | do 25 | cfg_path=./"$model_name"_drop"$drop_num"_"$drop_module"/config.json # PATH to the corresponding config.json file. 26 | cp -f "$cfg_path" ./"$model_name"_model/config.json # Replace the original config.json file. 27 | cp ./"$model_name"_drop"$drop_num"_"$drop_module"/*.py ./"$model_name"_model/ # Build the configuration and modeling files for remote code. 
28 | echo "Eval the config of:" 29 | echo $cfg_path 30 | 31 | num_tasks=${#tasks[@]} 32 | for ((i=0; i<$num_tasks; i++)); do 33 | CUDA_VISIBLE_DEVICES=$GPUs accelerate launch --main_process_port $port -m lm_eval \ 34 | --model hf \ 35 | --model_args pretrained=./${model_name}_model,trust_remote_code=True,dtype="bfloat16" \ 36 | --tasks ${tasks[$i]} \ 37 | --num_fewshot ${num_fewshots[$i]} \ 38 | --batch_size 1 \ 39 | --output_path ./${num_fewshots[$i]}shot_${tasks[$i]}_"$model_name"_drop"$drop_num"_"$drop_module".json >> output_"$model_name"_drop"$drop_num"_"$drop_module".out 40 | done 41 | done 42 | done 43 | done -------------------------------------------------------------------------------- /scripts/benchmark/benchmark_speed.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | model_path="########PATH_TO_HUGGING_FACE_CHECKPOINT########" 4 | save_file="########PATH_TO_SAVE_THE_RESULTS########/speed.csv" 5 | model_type="normal" # normal or quantized 6 | 7 | python src/benchmark_speed.py \ 8 | --model_path $model_path \ 9 | --model_type ${model_type} \ 10 | --save_file ${save_file} \ 11 | --pretrained -------------------------------------------------------------------------------- /scripts/dropping/block_drop.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | port="21304" 3 | GPUs="0,1,2,3" 4 | 5 | dataset="c4_val" 6 | prune_data_type="pt" 7 | n_calibration_samples=256 8 | seq_len=2048 9 | 10 | prune_method="block_drop" 11 | block_drop_method="discrete" 12 | drop_n=8 13 | 14 | model_name=mistral-base 15 | model_name_or_path=mistralai/Mistral-7B-v0.1 16 | 17 | folder_name="${model_name}-${prune_method}-${block_drop_method}-drop${drop_n}" 18 | similarity_cache_file="../results_prune/cache/${model_name}-${prune_method}-${dataset}-${n_calibration_samples}samples.pt" 19 | 20 | echo ${folder_name} 21 | 22 | output_dir=../results_prune/${folder_name} 23 | prune_model_save_path=${output_dir}/checkpoint 24 | 25 | CUDA_VISIBLE_DEVICES=$GPUs accelerate launch --main_process_port $port \ 26 | src/compress.py \ 27 | --stage prune \ 28 | --model_name_or_path ${model_name_or_path} \ 29 | --dataset ${dataset} \ 30 | --dataset_dir ./src/llmtuner/data \ 31 | --split "train" \ 32 | --prune_data_type ${prune_data_type} \ 33 | --cutoff_len ${seq_len} \ 34 | --output_dir ${output_dir} \ 35 | --logging_steps 10 \ 36 | --bf16 \ 37 | --n_calibration_samples ${n_calibration_samples} \ 38 | --prune_method ${prune_method} \ 39 | --block_drop_method ${block_drop_method} \ 40 | --drop_n ${drop_n} \ 41 | --similarity_cache_file ${similarity_cache_file} \ 42 | --prune_model_save_path ${prune_model_save_path} 43 | 44 | block_drop_method="post_dropping" 45 | # set only_update_config to True to save the disk memory 46 | only_update_config=False 47 | 48 | python \ 49 | src/compress.py \ 50 | --stage prune \ 51 | --model_name_or_path ${model_name_or_path} \ 52 | --dataset ${dataset} \ 53 | --dataset_dir ./src/llmtuner/data \ 54 | --split "train" \ 55 | --only_update_config $only_update_config \ 56 | --prune_data_type ${prune_data_type} \ 57 | --cutoff_len ${seq_len} \ 58 | --output_dir ${output_dir} \ 59 | --logging_steps 10 \ 60 | --bf16 \ 61 | --n_calibration_samples ${n_calibration_samples} \ 62 | --prune_method ${prune_method} \ 63 | --block_drop_method ${block_drop_method} \ 64 | --drop_n ${drop_n} \ 65 | --similarity_cache_file ${similarity_cache_file} \ 66 | --prune_model_save_path 
${prune_model_save_path} -------------------------------------------------------------------------------- /scripts/dropping/layer_drop.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | port="21304" 4 | GPUs="0,1,2,3" 5 | 6 | dataset="c4_val" 7 | prune_data_type="pt" 8 | n_calibration_samples=256 9 | seq_len=2048 10 | 11 | prune_method="layer_drop" 12 | layer_drop_method="discrete" 13 | target_layer="attn" 14 | drop_n=8 15 | 16 | model_name=mistral-base 17 | model_name_or_path=mistralai/Mistral-7B-v0.1 18 | 19 | folder_name="${model_name}-${prune_method}_${target_layer}-${layer_drop_method}-drop${drop_n}" 20 | similarity_cache_file="../results_prune/cache/${model_name}-${prune_method}_${target_layer}-${dataset}-${n_calibration_samples}samples.pt" 21 | 22 | echo ${folder_name} 23 | 24 | output_dir=../results_prune/${folder_name} 25 | prune_model_save_path=${output_dir}/checkpoint 26 | 27 | CUDA_VISIBLE_DEVICES=$GPUs accelerate launch --main_process_port $port \ 28 | src/compress.py \ 29 | --stage prune \ 30 | --model_name_or_path ${model_name_or_path} \ 31 | --dataset ${dataset} \ 32 | --dataset_dir ./src/llmtuner/data \ 33 | --split "train" \ 34 | --layer_drop_norm True \ 35 | --target_layer ${target_layer} \ 36 | --only_update_config True \ 37 | --prune_data_type ${prune_data_type} \ 38 | --cutoff_len ${seq_len} \ 39 | --output_dir ${output_dir} \ 40 | --logging_steps 10 \ 41 | --bf16 \ 42 | --n_calibration_samples ${n_calibration_samples} \ 43 | --prune_method ${prune_method} \ 44 | --layer_drop_method ${layer_drop_method} \ 45 | --drop_n ${drop_n} \ 46 | --similarity_cache_file ${similarity_cache_file} \ 47 | --prune_model_save_path ${prune_model_save_path} 48 | 49 | 50 | layer_drop_method="post_dropping" 51 | # set only_update_config to True to save the disk memory 52 | only_update_config=False 53 | 54 | python src/compress.py \ 55 | --stage prune \ 56 | --model_name_or_path ${model_name_or_path} \ 57 | --dataset ${dataset} \ 58 | --dataset_dir ./src/llmtuner/data \ 59 | --split "train" \ 60 | --only_update_config $only_update_config \ 61 | --layer_drop_norm True \ 62 | --target_layer ${target_layer} \ 63 | --prune_data_type ${prune_data_type} \ 64 | --cutoff_len ${seq_len} \ 65 | --output_dir ${output_dir} \ 66 | --logging_steps 10 \ 67 | --bf16 \ 68 | --n_calibration_samples ${n_calibration_samples} \ 69 | --prune_method ${prune_method} \ 70 | --layer_drop_method ${layer_drop_method} \ 71 | --drop_n ${drop_n} \ 72 | --similarity_cache_file ${similarity_cache_file} \ 73 | --prune_model_save_path ${prune_model_save_path} -------------------------------------------------------------------------------- /scripts/dropping/layer_drop_iterative.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | port="21304" 3 | GPUs="0,1,2,3" 4 | 5 | dataset="c4_val" 6 | prune_data_type="pt" 7 | n_calibration_samples=128 8 | seq_len=2048 9 | 10 | prune_method="layer_drop" 11 | layer_drop_method="discrete" 12 | target_layer="all" 13 | 14 | drop_n=1 15 | num_epochs=8 16 | 17 | model_name=mistral-base 18 | model_name_or_path=mistralai/Mistral-7B-v0.1 19 | 20 | for ((epoch=1; epoch<=num_epochs; epoch++)) do 21 | layer_drop_method="discrete" 22 | folder_name="Iterative-epoch${epoch}-${model_name}-${prune_method}-${target_layer}-${layer_drop_method}-drop${drop_n}PerEpoch" 23 | 
similarity_cache_file="../results_prune/cache/Iterative-epoch${epoch}-${model_name}-drop_${target_layer}-${dataset}-${n_calibration_samples}samples.pt" 24 | echo ${folder_name} 25 | echo ${model_name_or_path} 26 | output_dir=./results_prune/Iterative/${folder_name} 27 | prune_model_save_path=${output_dir}/checkpoint 28 | 29 | CUDA_VISIBLE_DEVICES=$GPUs accelerate launch --main_process_port $port \ 30 | src/compress.py \ 31 | --stage prune \ 32 | --model_name_or_path ${model_name_or_path} \ 33 | --dataset ${dataset} \ 34 | --dataset_dir ./src/llmtuner/data \ 35 | --split "train" \ 36 | --prune_data_type ${prune_data_type} \ 37 | --cutoff_len ${seq_len} \ 38 | --layer_drop_norm True \ 39 | --target_layer ${target_layer} \ 40 | --output_dir ${output_dir} \ 41 | --logging_steps 10 \ 42 | --bf16 \ 43 | --n_calibration_samples ${n_calibration_samples} \ 44 | --prune_method ${prune_method} \ 45 | --layer_drop_method ${layer_drop_method} \ 46 | --drop_n ${drop_n} \ 47 | --similarity_cache_file ${similarity_cache_file} \ 48 | --prune_model_save_path ${prune_model_save_path} 49 | 50 | # Save the converted the model without DeepSpeed 51 | layer_drop_method="post_dropping" 52 | # set only_update_config to True to save the disk memory 53 | only_update_config=False 54 | 55 | python src/compress.py \ 56 | --stage prune \ 57 | --model_name_or_path ${model_name_or_path} \ 58 | --dataset ${dataset} \ 59 | --dataset_dir ./src/llmtuner/data \ 60 | --split "train" \ 61 | --only_update_config $only_update_config \ 62 | --layer_drop_norm True \ 63 | --target_layer ${target_layer} \ 64 | --prune_data_type ${prune_data_type} \ 65 | --cutoff_len ${seq_len} \ 66 | --output_dir ${output_dir} \ 67 | --logging_steps 10 \ 68 | --bf16 \ 69 | --n_calibration_samples ${n_calibration_samples} \ 70 | --prune_method ${prune_method} \ 71 | --layer_drop_method ${layer_drop_method} \ 72 | --drop_n ${drop_n} \ 73 | --similarity_cache_file ${similarity_cache_file} \ 74 | --prune_model_save_path ${prune_model_save_path} 75 | model_name_or_path=$prune_model_save_path 76 | done -------------------------------------------------------------------------------- /scripts/dropping/layer_drop_joint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | port="21304" 3 | GPUs="0,1,2,3" 4 | 5 | dataset="c4_val" 6 | prune_data_type="pt" 7 | n_calibration_samples=256 8 | seq_len=2048 9 | 10 | prune_method="layer_drop" 11 | layer_drop_method="discrete" 12 | target_layer="all" 13 | drop_n=64 14 | 15 | model_name=llama2-13b-base 16 | model_name_or_path=mistralai/Mistral-7B-v0.1 17 | 18 | folder_name="${model_name}-${prune_method}_${target_layer}-${layer_drop_method}-drop${drop_n}" 19 | similarity_cache_file="../results_prune/cache/${model_name}-${prune_method}_${target_layer}-${dataset}-${n_calibration_samples}samples.pt" 20 | 21 | echo ${folder_name} 22 | 23 | output_dir=./results_prune/${folder_name} 24 | prune_model_save_path=${output_dir}/checkpoint 25 | 26 | CUDA_VISIBLE_DEVICES=$GPUs accelerate launch --main_process_port $port \ 27 | src/compress.py \ 28 | --stage prune \ 29 | --model_name_or_path ${model_name_or_path} \ 30 | --dataset ${dataset} \ 31 | --dataset_dir ./src/llmtuner/data \ 32 | --split "train" \ 33 | --layer_drop_norm True \ 34 | --target_layer ${target_layer} \ 35 | --only_update_config True \ 36 | --prune_data_type ${prune_data_type} \ 37 | --cutoff_len ${seq_len} \ 38 | --output_dir ${output_dir} \ 39 | --logging_steps 10 \ 40 | --bf16 \ 41 | --n_calibration_samples 
${n_calibration_samples} \ 42 | --prune_method ${prune_method} \ 43 | --layer_drop_method ${layer_drop_method} \ 44 | --drop_n ${drop_n} \ 45 | --similarity_cache_file ${similarity_cache_file} \ 46 | --prune_model_save_path ${prune_model_save_path} 47 | 48 | 49 | layer_drop_method="post_dropping" 50 | # set only_update_config to True to save the disk memory 51 | only_update_config=False 52 | 53 | python \ 54 | src/compress.py \ 55 | --stage prune \ 56 | --model_name_or_path ${model_name_or_path} \ 57 | --dataset ${dataset} \ 58 | --dataset_dir ./src/llmtuner/data \ 59 | --split "train" \ 60 | --only_update_config $only_update_config \ 61 | --layer_drop_norm True \ 62 | --target_layer ${target_layer} \ 63 | --prune_data_type ${prune_data_type} \ 64 | --cutoff_len ${seq_len} \ 65 | --output_dir ${output_dir} \ 66 | --logging_steps 10 \ 67 | --bf16 \ 68 | --n_calibration_samples ${n_calibration_samples} \ 69 | --prune_method ${prune_method} \ 70 | --layer_drop_method ${layer_drop_method} \ 71 | --drop_n ${drop_n} \ 72 | --similarity_cache_file ${similarity_cache_file} \ 73 | --prune_model_save_path ${prune_model_save_path} -------------------------------------------------------------------------------- /scripts/quantization/awq.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | model_path="########PATH_TO_HUGGING_FACE_CHECKPOINT#########" 4 | quant_path="########PATH_TO_SAVE_THE_QUANTIZED_MODEL########" 5 | bits=4 6 | 7 | python AutoAWQ/quantize.py \ 8 | $model_path \ 9 | $quant_path \ 10 | $bits -------------------------------------------------------------------------------- /scripts/quantization/gptq.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | model_path="########PATH_TO_HUGGING_FACE_CHECKPOINT#########" 4 | quant_path="########PATH_TO_SAVE_THE_QUANTIZED_MODEL########" 5 | 6 | bits=4 7 | seed=0 8 | num_samples=16 9 | calibration_template=default 10 | 11 | python AutoGPTQ/quantize.py \ 12 | --pretrained_model_dir $model_path \ 13 | --quantized_model_dir $quant_path \ 14 | --bits $bits \ 15 | --save_and_reload \ 16 | --desc_act \ 17 | --seed $seed \ 18 | --num_samples $num_samples \ 19 | --calibration-template $calibration_template \ 20 | --trust_remote_code \ 21 | --use_triton -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from setuptools import setup, find_packages 4 | 5 | 6 | def get_version(): 7 | with open(os.path.join("src", "llmtuner", "__init__.py"), "r", encoding="utf-8") as f: 8 | file_content = f.read() 9 | pattern = r"{0}\W*=\W*\"([^\"]+)\"".format("__version__") 10 | version, = re.findall(pattern, file_content) 11 | return version 12 | 13 | 14 | def get_requires(): 15 | with open("requirements.txt", "r", encoding="utf-8") as f: 16 | file_content = f.read() 17 | lines = [line.strip() for line in file_content.strip().split("\n") if not line.startswith("#")] 18 | return lines 19 | 20 | 21 | def main(): 22 | 23 | setup( 24 | name="llmtuner", 25 | version=get_version(), 26 | author="hiyouga", 27 | author_email="hiyouga" "@" "buaa.edu.cn", 28 | description="Easy-to-use LLM fine-tuning framework", 29 | long_description=open("README.md", "r", encoding="utf-8").read(), 30 | long_description_content_type="text/markdown", 31 | keywords=["LLaMA", "BLOOM", "Falcon", "LLM", "ChatGPT", "transformer", "pytorch", 
"deep learning"], 32 | license="Apache 2.0 License", 33 | url="https://github.com/hiyouga/LLaMA-Factory", 34 | package_dir={"": "src"}, 35 | packages=find_packages("src"), 36 | python_requires=">=3.8.0", 37 | install_requires=get_requires(), 38 | classifiers=[ 39 | "Development Status :: 3 - Alpha", 40 | "Intended Audience :: Developers", 41 | "Intended Audience :: Education", 42 | "Intended Audience :: Science/Research", 43 | "License :: OSI Approved :: Apache Software License", 44 | "Operating System :: OS Independent", 45 | "Programming Language :: Python :: 3", 46 | "Programming Language :: Python :: 3.8", 47 | "Programming Language :: Python :: 3.9", 48 | "Programming Language :: Python :: 3.10", 49 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 50 | ] 51 | ) 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /src/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/.DS_Store -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/__init__.py -------------------------------------------------------------------------------- /src/compress.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | sys.path = [os.getcwd()] + sys.path 5 | 6 | from llmtuner import run_exp 7 | 8 | 9 | def main(): 10 | run_exp() 11 | 12 | 13 | def _mp_fn(index): 14 | # For xla_spawn (TPUs) 15 | main() 16 | 17 | 18 | if __name__ == "__main__": 19 | main() 20 | -------------------------------------------------------------------------------- /src/llmtuner/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/.DS_Store -------------------------------------------------------------------------------- /src/llmtuner/__init__.py: -------------------------------------------------------------------------------- 1 | # Level: api, webui > chat, eval, compression > data, model > extras, hparams 2 | 3 | from .compression import export_model, run_exp 4 | 5 | __version__ = "0.5.2" 6 | __all__ = ["export_model", "run_exp",] 7 | -------------------------------------------------------------------------------- /src/llmtuner/compression/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/compression/.DS_Store -------------------------------------------------------------------------------- /src/llmtuner/compression/__init__.py: -------------------------------------------------------------------------------- 1 | from .tuner import export_model, run_exp 2 | 3 | 4 | __all__ = ["export_model", "run_exp"] 5 | -------------------------------------------------------------------------------- /src/llmtuner/compression/prune/__init__.py: -------------------------------------------------------------------------------- 1 | from .workflow import run_prune 2 | 3 | 4 | __all__ = ["run_prune"] 5 | 
-------------------------------------------------------------------------------- /src/llmtuner/compression/prune/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/compression/prune/models/__init__.py -------------------------------------------------------------------------------- /src/llmtuner/compression/prune/wrapper.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger(__name__) 4 | 5 | 6 | """For recording weights""" 7 | class HiddenStatesRecordWrapper: 8 | def __init__(self, layer, layer_name="none", record_input=True, record_output=True): 9 | self.layer = layer 10 | self.layer_name = layer_name 11 | 12 | self.record_input = record_input 13 | self.record_output = record_output 14 | 15 | if record_input: 16 | self.input_hidden_states = [] 17 | if record_output: 18 | self.output_hidden_states = [] 19 | 20 | def record(self, input, output): 21 | # input: (1, seq_len, hidden_size) 22 | 23 | if self.record_input: 24 | self.input_hidden_states.append(input.squeeze(0).clone().cpu()) 25 | if self.record_output: 26 | self.output_hidden_states.append(output.squeeze(0).clone().cpu()) 27 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Casper 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/README.md: -------------------------------------------------------------------------------- 1 | # AutoAWQ Kernels 2 | 3 | AutoAWQ Kernels is a new package that is split up from the [main repository](https://github.com/casper-hansen/AutoAWQ) in order to avoid compilation times. 4 | 5 | ## Requirements 6 | 7 | - Windows: Must use WSL2. 8 | 9 | - NVIDIA: 10 | - GPU: Must be compute capability 7.5 or higher. 11 | - CUDA Toolkit: Must be 11.8 or higher. 12 | - AMD: 13 | - ROCm: Must be 5.6 or higher. 
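
The NVIDIA requirements listed above (compute capability 7.5+, CUDA Toolkit 11.8+) can be verified before attempting an install. A small sanity-check sketch, assuming an existing PyTorch CUDA build is available in the environment (this snippet is not part of the AutoAWQ_kernels README itself):

```python
# Quick check of the GPU compute capability and CUDA toolkit version
# reported by the local PyTorch build.
import torch

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability(0)
    print(f"compute capability: {major}.{minor} (need >= 7.5)")
    print(f"CUDA version seen by torch: {torch.version.cuda} (need >= 11.8)")
else:
    print("No CUDA device visible to PyTorch.")
```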
14 | 15 | ## Install 16 | 17 | ### Install from PyPi 18 | 19 | The package is available on PyPi with CUDA 12.1.1 wheels: 20 | 21 | ``` 22 | pip install autoawq-kernels 23 | ``` 24 | 25 | ### Install release wheels 26 | 27 | For ROCm and other CUDA versions, you can use the wheels published at each [release](https://github.com/casper-hansen/AutoAWQ_kernels/releases/): 28 | 29 | ``` 30 | pip install https://github.com/casper-hansen/AutoAWQ_kernels/releases/download/v0.0.2/autoawq_kernels-0.0.2+rocm561-cp310-cp310-linux_x86_64.whl 31 | ``` 32 | 33 | ### Build from source 34 | You can also build from source: 35 | 36 | ``` 37 | git clone https://github.com/casper-hansen/AutoAWQ_kernels 38 | cd AutoAWQ_kernels 39 | pip install -e . 40 | ``` 41 | 42 | To build for ROCm, you need to first install the following packages `rocsparse-dev hipsparse-dev rocthrust-dev rocblas-dev hipblas-dev`. -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/__init__.py -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/attention/cuda_bf16_wrapper.h: -------------------------------------------------------------------------------- 1 | // Downloaded from from FasterTransformer v5.2.1 2 | // https://github.com/NVIDIA/FasterTransformer/blob/release/v5.2.1_tag/src/fastertransformer/utils/cuda_bf16_wrapper.h 3 | /* 4 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | 19 | #pragma once 20 | 21 | #ifdef ENABLE_BF16 22 | #include 23 | #endif 24 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/attention/ft_attention.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | 5 | torch::Tensor single_query_attention(const torch::Tensor q, 6 | const torch::Tensor k, 7 | const torch::Tensor v, 8 | torch::Tensor k_cache, 9 | torch::Tensor v_cache, 10 | c10::optional length_per_sample_, 11 | c10::optional alibi_slopes_, 12 | const int timestep, 13 | const int rotary_embedding_dim = 0, 14 | const float rotary_base = 10000.0f, 15 | const bool neox_rotary_style=true); -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllama/cu_compat.cuh: -------------------------------------------------------------------------------- 1 | // Adapted from turboderp exllama: https://github.com/turboderp/exllama 2 | 3 | #ifndef _cuda_compat_cuh 4 | #define _cuda_compat_cuh 5 | 6 | // atomicAdd for half types, to support CC < 7.x 7 | 8 | __device__ __forceinline__ void atomicAdd_half(half* address, half val) 9 | { 10 | unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2)); 11 | unsigned int old = *address_as_ui; 12 | unsigned int assumed; 13 | 14 | do 15 | { 16 | assumed = old; 17 | __half_raw hsum; 18 | hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); 19 | half tmpres = __hadd(hsum, val); 20 | hsum = __half_raw(tmpres); 21 | old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; 22 | old = atomicCAS(address_as_ui, assumed, old); 23 | } 24 | while (assumed != old); 25 | } 26 | 27 | // atomicAdd for half2 types 28 | 29 | __device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val) 30 | { 31 | unsigned int* address_as_ui = (unsigned int*)address; 32 | unsigned int old = *address_as_ui; 33 | unsigned int assumed; 34 | do 35 | { 36 | assumed = old; 37 | half2 old_val = *((half2*)&old); 38 | half2 new_val = __hadd2(old_val, val); 39 | old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val)); 40 | } 41 | while (assumed != old); 42 | } 43 | 44 | // 45 | 46 | #if defined(__CUDA_ARCH__) || defined(USE_ROCM) 47 | #if __CUDA_ARCH__ < 700 || defined(USE_ROCM) 48 | 49 | __device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); } 50 | 51 | #if __CUDA_ARCH__ < 600 || defined(USE_ROCM) 52 | __device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); } 53 | #endif 54 | 55 | #endif 56 | #endif 57 | 58 | #endif 59 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllama/cuda_buffers.cu: -------------------------------------------------------------------------------- 1 | // Adapted from turboderp exllama: https://github.com/turboderp/exllama 2 | 3 | #define _cuda_buffers_cu 4 | #include "cuda_buffers.cuh" 5 | 6 | CudaBuffers* g_buffers[CUDA_MAX_DEVICES] = {NULL}; 7 | // __constant__ half2 q4_table[16][256]; 8 | // half2 q4_table_host[16][256]; 9 | // bool q4_table_init = false; 10 | 11 | CudaBuffers::CudaBuffers 12 | ( 13 | int _device, 14 | int _temp_state_size, 15 | half* _temp_state, 16 | half* _temp_dq 17 | ) : 18 | 
device(_device), 19 | temp_state_size(_temp_state_size), 20 | temp_state(_temp_state), 21 | temp_dq(_temp_dq) 22 | { 23 | cudaSetDevice(_device); 24 | 25 | cudaStreamCreate(&alt_stream_1); 26 | cudaStreamCreate(&alt_stream_2); 27 | cudaStreamCreate(&alt_stream_3); 28 | cudaEventCreate(&alt_stream_1_done); 29 | cudaEventCreate(&alt_stream_2_done); 30 | cudaEventCreate(&alt_stream_3_done); 31 | } 32 | 33 | CudaBuffers::~CudaBuffers() 34 | { 35 | cudaStreamDestroy(alt_stream_1); 36 | cudaStreamDestroy(alt_stream_2); 37 | cudaStreamDestroy(alt_stream_3); 38 | cudaEventDestroy(alt_stream_1_done); 39 | cudaEventDestroy(alt_stream_2_done); 40 | cudaEventDestroy(alt_stream_3_done); 41 | } 42 | 43 | CudaBuffers* get_buffers(const int device_index) 44 | { 45 | return g_buffers[device_index]; 46 | } 47 | 48 | void prepare_buffers_cuda 49 | ( 50 | int _device, 51 | int _temp_state_size, 52 | half* _temp_state, 53 | half* _temp_dq 54 | ) 55 | { 56 | CudaBuffers* buffers = new CudaBuffers 57 | ( 58 | _device, 59 | _temp_state_size, 60 | _temp_state, 61 | _temp_dq 62 | ); 63 | 64 | g_buffers[_device] = buffers; 65 | } 66 | 67 | void cleanup_buffers_cuda() 68 | { 69 | for (int i = 0; i < CUDA_MAX_DEVICES; i++) 70 | { 71 | if (!g_buffers[i]) continue; 72 | delete g_buffers[i]; 73 | g_buffers[i] = NULL; 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllama/cuda_buffers.cuh: -------------------------------------------------------------------------------- 1 | // Adapted from turboderp exllama: https://github.com/turboderp/exllama 2 | 3 | #ifndef _cuda_buffers_cuh 4 | #define _cuda_buffers_cuh 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | const int CUDA_MAX_DEVICES = 16; 12 | 13 | // #ifndef _cuda_buffers_cu 14 | // extern __constant__ half2 q4_table[16][256]; 15 | // #endif 16 | 17 | class CudaBuffers 18 | { 19 | public: 20 | int device; 21 | 22 | half* temp_state; // [max_hidden_rows * intermediate_size] 23 | int temp_state_size; 24 | half* temp_dq; // size of largest quant tensor * 8 25 | 26 | cudaStream_t alt_stream_1; 27 | cudaStream_t alt_stream_2; 28 | cudaStream_t alt_stream_3; 29 | cudaEvent_t alt_stream_1_done; 30 | cudaEvent_t alt_stream_2_done; 31 | cudaEvent_t alt_stream_3_done; 32 | 33 | CudaBuffers 34 | ( 35 | int _device, 36 | int _temp_state_size, 37 | half* _temp_state, 38 | half* _temp_dq 39 | ); 40 | ~CudaBuffers(); 41 | }; 42 | 43 | CudaBuffers* get_buffers(const int device_index); 44 | 45 | void prepare_buffers_cuda 46 | ( 47 | int _device, 48 | int _temp_state_size, 49 | half* _temp_state, 50 | half* _temp_dq 51 | ); 52 | 53 | void cleanup_buffers_cuda(); 54 | 55 | #endif 56 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllama/cuda_func/column_remap.cu: -------------------------------------------------------------------------------- 1 | // Adapted from turboderp exllama: https://github.com/turboderp/exllama 2 | 3 | #include "column_remap.cuh" 4 | #include "../util.cuh" 5 | 6 | const int SHUF_BLOCKSIZE_X = 256; 7 | const int SHUF_BLOCKSIZE_Y = 16; 8 | 9 | __global__ void column_remap_kernel 10 | ( 11 | const half* __restrict__ x, 12 | half* __restrict__ x_new, 13 | const int x_width, 14 | const int x_height, 15 | const uint32_t* x_map 16 | ) 17 | { 18 | int x_column = SHUF_BLOCKSIZE_X * blockIdx.x + threadIdx.x; 19 | int x_row = 
SHUF_BLOCKSIZE_Y * blockIdx.y; 20 | if (x_column >= x_width) return; 21 | //if (x_row >= x_height) return; 22 | 23 | int x_stride = x_width; 24 | int x_idx = x_row * x_stride + x_column; 25 | 26 | int x_row_end = min(x_row + SHUF_BLOCKSIZE_Y, x_height); 27 | int x_idx_end = x_row_end * x_stride + x_column; 28 | 29 | int s_column = x_map[x_column]; 30 | int s_idx = x_row * x_stride + s_column; 31 | 32 | while (x_idx < x_idx_end) 33 | { 34 | x_new[x_idx] = x[s_idx]; 35 | x_idx += x_stride; 36 | s_idx += x_stride; 37 | } 38 | } 39 | 40 | // Remap columns in x to correspond to sequential group index before matmul 41 | // 42 | // perform x -> seq_x such that seq_x @ seq_w == x @ w 43 | 44 | void column_remap_cuda 45 | ( 46 | const half* x, 47 | half* x_new, 48 | const int x_height, 49 | const int x_width, 50 | const uint32_t* x_map 51 | ) 52 | { 53 | dim3 threads(SHUF_BLOCKSIZE_X, 1, 1); 54 | 55 | dim3 blocks 56 | ( 57 | (x_width + SHUF_BLOCKSIZE_X - 1) / SHUF_BLOCKSIZE_X, 58 | (x_height + SHUF_BLOCKSIZE_Y - 1) / SHUF_BLOCKSIZE_Y, 59 | 1 60 | ); 61 | 62 | column_remap_kernel<<>>(x, x_new, x_width, x_height, x_map); 63 | } 64 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllama/cuda_func/column_remap.cuh: -------------------------------------------------------------------------------- 1 | // Adapted from turboderp exllama: https://github.com/turboderp/exllama 2 | 3 | #ifndef _column_remap_cuh 4 | #define _column_remap_cuh 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | void column_remap_cuda 11 | ( 12 | const half* x, 13 | half* x_new, 14 | const int x_height, 15 | const int x_width, 16 | const uint32_t* x_map 17 | ); 18 | 19 | #endif -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllama/cuda_func/q4_matmul.cuh: -------------------------------------------------------------------------------- 1 | // Adapted from turboderp exllama: https://github.com/turboderp/exllama 2 | 3 | #ifndef _q4_matmul_cuh 4 | #define _q4_matmul_cuh 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "q4_matrix.cuh" 13 | #include "../tuning.h" 14 | 15 | // Workaround for hipify_python using rocblas instead of hipblas. 
16 | #if defined(USE_ROCM) 17 | #include 18 | #define rocblas_handle hipblasHandle_t 19 | #endif 20 | 21 | void q4_matmul_cuda 22 | ( 23 | ExLlamaTuning* tuningParams, 24 | const half* x, 25 | const int x_height, 26 | const Q4Matrix* w, 27 | half* out, 28 | bool no_zero = false, 29 | cudaStream_t alt_stream = NULL 30 | ); 31 | 32 | void q4_matmul_recons_cuda 33 | ( 34 | ExLlamaTuning* tuningParams, 35 | const half* x, 36 | const int x_height, 37 | Q4Matrix* w, 38 | half* out, 39 | const cublasHandle_t handle, 40 | bool no_zero = false 41 | ); 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllama/cuda_func/q4_matrix.cuh: -------------------------------------------------------------------------------- 1 | // Adapted from turboderp exllama: https://github.com/turboderp/exllama 2 | 3 | #ifndef _q4_matrix_cuh 4 | #define _q4_matrix_cuh 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | class Q4Matrix 11 | { 12 | public: 13 | 14 | int device; 15 | 16 | int height; 17 | int width; 18 | int groups; 19 | int groupsize; 20 | 21 | uint32_t* cuda_qweight = NULL; 22 | uint32_t* cuda_qzeros = NULL; 23 | half* cuda_scales = NULL; 24 | uint32_t* cuda_x_map = NULL; 25 | 26 | Q4Matrix 27 | ( 28 | const int _height, 29 | const int _width, 30 | const int _groups, 31 | 32 | uint32_t* _qweight, 33 | uint32_t* _qzeros, 34 | half* _scales, 35 | uint32_t* _g_idx, 36 | 37 | const int _device 38 | ); 39 | 40 | ~Q4Matrix(); 41 | 42 | void reconstruct(half* out); 43 | 44 | private: 45 | 46 | void make_sequential(const uint32_t* cpu_g_idx); 47 | 48 | }; 49 | 50 | void g_q4_keep_matrix(Q4Matrix* m); 51 | void g_q4_free_matrices(); 52 | 53 | #endif -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllama/hip_compat.cuh: -------------------------------------------------------------------------------- 1 | // Adapted from turboderp exllama: https://github.com/turboderp/exllama 2 | 3 | #ifndef _hip_compat_cuh 4 | #define _hip_compat_cuh 5 | 6 | // Workaround for a bug in hipamd, backported from upstream, this is fixed in ROCm 5.6. 7 | __device__ __forceinline__ __half __compat_hrcp(__half x) { 8 | return __half_raw{ 9 | static_cast<_Float16>(__builtin_amdgcn_rcph(static_cast<__half_raw>(x).data))}; 10 | } 11 | 12 | // ROCm 6.0 compatible from: /opt/rocm-6.0.0/include/hip/amd_detail/amd_hip_fp16.h:1708 13 | __device__ __forceinline__ __half2 __compat_h2rcp(__half2 x) { 14 | return _Float16_2{_Float16_2{static_cast<_Float16>(1.0f), static_cast<_Float16>(1.0f)} / x.data}; 15 | } 16 | 17 | #define hrcp __compat_hrcp 18 | #define h2rcp __compat_h2rcp 19 | 20 | // Automatic conversion of hipblasHgemm doesn't convert half to hipblasHalf. 
21 | __host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(hipblasHandle_t handle, 22 | hipblasOperation_t transA, 23 | hipblasOperation_t transB, 24 | int m, 25 | int n, 26 | int k, 27 | const half* alpha, 28 | const half* AP, 29 | int lda, 30 | const half* BP, 31 | int ldb, 32 | const half* beta, 33 | half* CP, 34 | int ldc) { 35 | return hipblasHgemm(handle, transA, transB, m, n, k, 36 | reinterpret_cast(alpha), 37 | reinterpret_cast(AP), lda, 38 | reinterpret_cast(BP), ldb, 39 | reinterpret_cast(beta), 40 | reinterpret_cast(CP), ldc); 41 | } 42 | #define hipblasHgemm __compat_hipblasHgemm 43 | 44 | // Previous version of PyTorch were converting to rocBLAS instead of hipBLAS. 45 | #define rocblas_handle hipblasHandle_t 46 | #define rocblas_operation_none HIPBLAS_OP_N 47 | #define rocblas_get_stream hipblasGetStream 48 | #define rocblas_set_stream hipblasSetStream 49 | #define rocblas_hgemm __compat_hipblasHgemm 50 | 51 | #endif 52 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllama/tuning.h: -------------------------------------------------------------------------------- 1 | // Adapted from turboderp exllama: https://github.com/turboderp/exllama 2 | 3 | #ifndef _tuning_h 4 | #define _tuning_h 5 | 6 | struct ExLlamaTuning 7 | { 8 | int matmul_recons_thd; 9 | bool matmul_fused_remap; 10 | bool matmul_no_half2; 11 | }; 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllama/util.cuh: -------------------------------------------------------------------------------- 1 | // Adapted from turboderp exllama: https://github.com/turboderp/exllama 2 | 3 | #ifndef _util_cuh 4 | #define _util_cuh 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #if defined(USE_ROCM) 12 | #define cudaUnspecified hipErrorUnknown 13 | #else 14 | #define cudaUnspecified cudaErrorApiFailureBase 15 | #endif 16 | 17 | // React to failure on return code != cudaSuccess 18 | 19 | #define _cuda_check(fn) \ 20 | do { \ 21 | {_cuda_err = fn;} \ 22 | if (_cuda_err != cudaSuccess) goto _cuda_fail; \ 23 | } while(false) 24 | 25 | // React to failure on return code == 0 26 | 27 | #define _alloc_check(fn) \ 28 | do { \ 29 | if (!(fn)) { _cuda_err = cudaUnspecified; goto _cuda_fail; } \ 30 | else _cuda_err = cudaSuccess; \ 31 | } while(false) 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllamav2/config.h: -------------------------------------------------------------------------------- 1 | #ifndef _config_h 2 | #define _config_h 3 | 4 | #define MAX_Q_GEMM_ROWS 50 5 | 6 | #define QMODE_2BIT 1 7 | #define QMODE_3BIT 1 8 | #define QMODE_4BIT 1 9 | #define QMODE_5BIT 1 10 | #define QMODE_6BIT 0 11 | #define QMODE_8BIT 0 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllamav2/cpp/util.h: -------------------------------------------------------------------------------- 1 | #ifndef _util_h 2 | #define _util_h 3 | 4 | #define DBGS(__x) printf("%s\n", __x) 5 | #define DBGI(__x) printf("%s: %i\n", #__x, __x) 6 | #define DBGI2(__x, __y) printf("%s, %s: %i, %i\n", #__x, #__y, __x, __y) 7 | #define DBGI3(__x, __y, __z) printf("%s, %s, %s: 
%i, %i, %i\n", #__x, #__y, #__z, __x, __y, __z) 8 | #define DBGF(__x) printf("%s: %f\n", #__x, __x) 9 | #define DBGF2(__x, __y) printf("%s, %s: %f, %f\n", #__x, #__y, __x, __y) 10 | #define DBGF3(__x, __y, __z) printf("%s, %s, %s: %f, %f, %f\n", #__x, #__y, #__z, __x, __y, __z) 11 | 12 | #endif 13 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllamav2/cuda/compat.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _compat_cuh 2 | #define _compat_cuh 3 | 4 | // atomicAdd for half types, to support CC < 7.x 5 | 6 | __device__ __forceinline__ void atomicAdd_half(half* address, half val) 7 | { 8 | unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2)); 9 | unsigned int old = *address_as_ui; 10 | unsigned int assumed; 11 | 12 | do 13 | { 14 | assumed = old; 15 | __half_raw hsum; 16 | hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); 17 | half tmpres = __hadd(hsum, val); 18 | hsum = __half_raw(tmpres); 19 | old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; 20 | old = atomicCAS(address_as_ui, assumed, old); 21 | } 22 | while (assumed != old); 23 | } 24 | 25 | // atomicAdd for half2 types 26 | 27 | __device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val) 28 | { 29 | unsigned int* address_as_ui = (unsigned int*)address; 30 | unsigned int old = *address_as_ui; 31 | unsigned int assumed; 32 | do 33 | { 34 | assumed = old; 35 | half2 old_val = *((half2*)&old); 36 | half2 new_val = __hadd2(old_val, val); 37 | old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val)); 38 | } 39 | while (assumed != old); 40 | } 41 | 42 | // 43 | 44 | #if defined(__CUDA_ARCH__) || defined(USE_ROCM) 45 | #if __CUDA_ARCH__ < 700 || defined(USE_ROCM) 46 | 47 | __device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); } 48 | 49 | #if __CUDA_ARCH__ < 600 || defined(USE_ROCM) 50 | __device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); } 51 | #endif 52 | 53 | #endif 54 | #endif 55 | 56 | #endif 57 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllamav2/cuda/compat_gemm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _compat_gemm_cuh 2 | #define _compat_gemm_cuh 3 | 4 | #if defined(USE_ROCM) 5 | 6 | // For some reason this include is not present anywhere in exllama_v2 codebase, but it is required 7 | // for symbols as hipblasHalf. 8 | #include 9 | 10 | __host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(hipblasHandle_t handle, 11 | hipblasOperation_t transA, 12 | hipblasOperation_t transB, 13 | int m, 14 | int n, 15 | int k, 16 | const half* alpha, 17 | const half* AP, 18 | int lda, 19 | const half* BP, 20 | int ldb, 21 | const half* beta, 22 | half* CP, 23 | int ldc) { 24 | return hipblasHgemm(handle, transA, transB, m, n, k, 25 | reinterpret_cast(alpha), 26 | reinterpret_cast(AP), lda, 27 | reinterpret_cast(BP), ldb, 28 | reinterpret_cast(beta), 29 | reinterpret_cast(CP), ldc); 30 | } 31 | #define hipblasHgemm __compat_hipblasHgemm 32 | 33 | // Previous version of PyTorch were converting to rocBLAS instead of hipBLAS. 
34 | #define rocblas_operation_none HIPBLAS_OP_N 35 | #define rocblas_hgemm __compat_hipblasHgemm 36 | #endif 37 | 38 | #endif 39 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllamav2/cuda/q_gemm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _q_gemm_cuh 2 | #define _q_gemm_cuh 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "q_matrix.cuh" 11 | 12 | void gemm_half_q_half_cuda 13 | ( 14 | cublasHandle_t cublas_handle, 15 | const half* a, 16 | QMatrix* b, 17 | half* c, 18 | int size_m, 19 | int size_n, 20 | int size_k, 21 | bool clear = false, 22 | half* reconstruct = NULL, 23 | bool force_cuda = false 24 | ); 25 | 26 | void clear_tensor_cuda 27 | ( 28 | half* c, 29 | int size_m, 30 | int size_n 31 | ); 32 | 33 | #endif -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllamav2/cuda/q_matrix.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _q_matrix_cuh 2 | #define _q_matrix_cuh 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define MAX_SUPERGROUPS 16 10 | 11 | class QMatrix 12 | { 13 | public: 14 | 15 | int device; 16 | bool is_gptq; 17 | 18 | int height; 19 | int width; 20 | int groups; 21 | int groupsize; 22 | 23 | int rows_8; 24 | int rows_6; 25 | int rows_5; 26 | int rows_4; 27 | int rows_3; 28 | int rows_2; 29 | 30 | uint32_t* cuda_q_weight = NULL; 31 | uint16_t* cuda_q_perm = NULL; 32 | uint16_t* cuda_q_invperm = NULL; 33 | uint32_t* cuda_q_scale = NULL; 34 | half* cuda_q_scale_max = NULL; 35 | uint16_t* cuda_q_groups = NULL; 36 | uint32_t* cuda_gptq_qzeros = NULL; 37 | half* cuda_gptq_scales = NULL; 38 | 39 | half* temp_dq; 40 | 41 | bool failed; 42 | 43 | QMatrix 44 | ( 45 | const int _device, 46 | const int _height, 47 | const int _width, 48 | const int _groups, 49 | 50 | uint32_t* _q_weight, 51 | uint16_t* _q_perm, 52 | uint16_t* _q_invperm, 53 | uint32_t* _q_scale, 54 | half* _q_scale_max, 55 | uint16_t* _q_groups, 56 | 57 | uint32_t* _gptq_qzeros, 58 | half* _gptq_scales, 59 | uint32_t* _gptq_g_idx, 60 | 61 | half* _temp_dq 62 | ); 63 | 64 | ~QMatrix(); 65 | 66 | void reconstruct(half* out); 67 | bool make_sequential(const uint32_t* cpu_g_idx); 68 | 69 | private: 70 | 71 | }; 72 | 73 | #endif 74 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllamav2/cuda/quant/qdq_2.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _qdq_2_cuh 2 | #define _qdq_2_cuh 3 | 4 | #include "qdq_util.cuh" 5 | #include "../../config.h" 6 | 7 | #if QMODE_2BIT == 1 8 | 9 | // Permutation: 10 | // 11 | // ffddbb99 77553311 eeccaa88 66442200 12 | 13 | __forceinline__ __device__ void shuffle_2bit_16 14 | ( 15 | uint32_t* q, 16 | int stride 17 | ) 18 | { 19 | uint32_t qa = q[0]; 20 | uint32_t qb = 0; 21 | 22 | #pragma unroll 23 | for (int i = 0; i < 8; i++) 24 | { 25 | uint32_t qa0 = qa & 0x03; 26 | uint32_t qa1 = (qa & 0x0c) >> 2; 27 | qa >>= 4; 28 | qb |= (qa1 << (i * 2 + 16)); 29 | qb |= (qa0 << (i * 2)); 30 | } 31 | q[0] = qb; 32 | } 33 | 34 | __forceinline__ __device__ void dequant_2bit_16 35 | ( 36 | const uint32_t q_0, 37 | half2 (&dq)[8], 38 | int stride 39 | ) 40 | { 41 | const uint32_t 
c0 = 0x64006400; 42 | const half y4_ = __float2half_rn(1.0f / 4.0f); 43 | const half y16_ = __float2half_rn(1.0f / 16.0f); 44 | const half y64_ = __float2half_rn(1.0f / 64.0f); 45 | const half2 y4 = __halves2half2(y4_, y4_); 46 | const half2 y16 = __halves2half2(y16_, y16_); 47 | const half2 y64 = __halves2half2(y64_, y64_); 48 | const half z1_ = __float2half_rn(-1024.0f - 2.0f); 49 | const half z4_ = __float2half_rn(-1024.0f / 4.0f - 2.0f); 50 | const half z16_ = __float2half_rn(-1024.0f / 16.0f - 2.0f); 51 | const half z64_ = __float2half_rn(-1024.0f / 64.0f - 2.0f); 52 | const half2 z1 = __halves2half2(z1_, z1_); 53 | const half2 z4 = __halves2half2(z4_, z4_); 54 | const half2 z16 = __halves2half2(z16_, z16_); 55 | const half2 z64 = __halves2half2(z64_, z64_); 56 | 57 | uint32_t qa = q_0; 58 | half2_uint32 q0((qa & 0x00030003) | c0); // half2(q[ 0], q[ 1]) + 1024 59 | half2_uint32 q1((qa & 0x000c000c) | c0); // half2(q[ 2], q[ 3]) * 4 + 1024 60 | half2_uint32 q2((qa & 0x00300030) | c0); // half2(q[ 4], q[ 5]) * 16 + 1024 61 | half2_uint32 q3((qa & 0x00c000c0) | c0); // half2(q[ 6], q[ 7]) * 64 + 1024 62 | qa >>= 8; 63 | half2_uint32 q4((qa & 0x00030003) | c0); // half2(q[ 8], q[ 8]) + 1024 64 | half2_uint32 q5((qa & 0x000c000c) | c0); // half2(q[10], q[11]) * 4 + 1024 65 | half2_uint32 q6((qa & 0x00300030) | c0); // half2(q[12], q[13]) * 16 + 1024 66 | half2_uint32 q7((qa & 0x00c000c0) | c0); // half2(q[14], q[15]) * 64 + 1024 67 | 68 | dq[0] = __hadd2(q0.as_half2, z1); 69 | dq[1] = __hfma2(q1.as_half2, y4, z4); 70 | dq[2] = __hfma2(q2.as_half2, y16, z16); 71 | dq[3] = __hfma2(q3.as_half2, y64, z64); 72 | dq[4] = __hadd2(q4.as_half2, z1); 73 | dq[5] = __hfma2(q5.as_half2, y4, z4); 74 | dq[6] = __hfma2(q6.as_half2, y16, z16); 75 | dq[7] = __hfma2(q7.as_half2, y64, z64); 76 | } 77 | 78 | #else 79 | 80 | __forceinline__ __device__ void shuffle_2bit_16 81 | ( 82 | uint32_t* q, 83 | int stride 84 | ) 85 | { 86 | } 87 | 88 | __forceinline__ __device__ void dequant_2bit_16 89 | ( 90 | const uint32_t q_0, 91 | half2 (&dq)[8], 92 | int stride 93 | ) 94 | { 95 | half dqh[16]; 96 | for (int i = 0; i < 16; i++) dqh[i] = dq_ns(exb(q_0, i * 2, 0x03), 2); 97 | 98 | for (int i = 0; i < 8; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); 99 | } 100 | 101 | #endif 102 | 103 | #endif -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllamav2/cuda/quant/qdq_6.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _qdq_6_cuh 2 | #define _qdq_6_cuh 3 | 4 | #include "qdq_util.cuh" 5 | #include "../../config.h" 6 | 7 | #if QMODE_6BIT == 1 8 | 9 | // Not implemented 10 | 11 | #else 12 | 13 | __forceinline__ __device__ void shuffle_6bit_16 14 | ( 15 | uint32_t* q, 16 | int stride 17 | ) 18 | { 19 | } 20 | 21 | __forceinline__ __device__ void dequant_6bit_16 22 | ( 23 | const uint32_t q_0, 24 | const uint32_t q_1, 25 | const uint32_t q_2, 26 | half2 (&dq)[8], 27 | int stride 28 | ) 29 | { 30 | half dqh[16]; 31 | for (int i = 0; i < 5; i++) dqh[ i] = dq_ns(exb( q_0, i * 6 , 0x3f), 32); 32 | dqh[ 5 ] = dq_ns(exb(q_1, q_0, 30, 0x3f), 32); 33 | for (int i = 0; i < 4; i++) dqh[ 6 + i] = dq_ns(exb( q_1, i * 6 + 4, 0x3f), 32); 34 | dqh[10 ] = dq_ns(exb(q_2, q_1, 28, 0x3f), 32); 35 | for (int i = 0; i < 5; i++) dqh[11 + i] = dq_ns(exb( q_2, i * 6 + 2, 0x3f), 32); 36 | 37 | for (int i = 0; i < 8; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); 38 | } 39 | 40 | 
#endif 41 | 42 | #endif 43 | 44 | 45 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllamav2/cuda/quant/qdq_8.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _qdq_8_cuh 2 | #define _qdq_8_cuh 3 | 4 | #include "qdq_util.cuh" 5 | #include "../../config.h" 6 | 7 | #if QMODE_8BIT == 1 8 | 9 | // Not implemented 10 | 11 | #else 12 | 13 | __forceinline__ __device__ void shuffle_8bit_4 14 | ( 15 | uint32_t* q, 16 | int stride 17 | ) 18 | { 19 | } 20 | 21 | __forceinline__ __device__ void dequant_8bit_8 22 | ( 23 | const uint32_t q_0, 24 | const uint32_t q_1, 25 | half2 (&dq)[4], 26 | int stride 27 | ) 28 | { 29 | half dqh[8]; 30 | for (int i = 0; i < 4; i++) dqh[i ] = dq_ns(exb(q_0, i * 8, 0xff), 128); 31 | for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), 128); 32 | 33 | for (int i = 0; i < 4; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); 34 | } 35 | 36 | #endif 37 | 38 | #endif -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllamav2/cuda/quant/qdq_util.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _qdq_util_cuh 2 | #define _qdq_util_cuh 3 | 4 | union half2_uint32 5 | { 6 | uint32_t as_uint32; 7 | half2 as_half2; 8 | __device__ half2_uint32(uint32_t val) : as_uint32(val) {} 9 | __device__ half2_uint32(half2 val) : as_half2(val) {} 10 | }; 11 | 12 | union half_uint16 13 | { 14 | uint16_t as_uint16; 15 | half as_half; 16 | __device__ half_uint16(uint16_t val) : as_uint16(val) {} 17 | __device__ half_uint16(half val) : as_half(val) {} 18 | }; 19 | 20 | // Max_scale premultiplied by 1/256 21 | 22 | __forceinline__ __device__ half dq_scale(const int qs, const half max_scale) 23 | { 24 | int qs_i = qs + 1; 25 | half qs_h = __int2half_rn(qs_i * qs_i); 26 | qs_h = __hmul(qs_h, max_scale); 27 | return qs_h; 28 | } 29 | 30 | __forceinline__ __device__ half dq(const int q, const int qzero, const half scale) 31 | { 32 | return __hmul(__int2half_rn(q - qzero), scale); 33 | } 34 | 35 | __forceinline__ __device__ half dq_ns(const int q, const int qzero) 36 | { 37 | //return __hsub(__int2half_rn(q), __int2half_rn(qzero)); 38 | return __int2half_rn(q - qzero); 39 | } 40 | 41 | __forceinline__ __device__ int exb(const uint32_t q, const int shift, const int mask) 42 | { 43 | return (int)((q >> shift) & mask); 44 | } 45 | 46 | __forceinline__ __device__ int exb(const uint32_t q1, const uint32_t q0, const int shift, const int mask) 47 | { 48 | return (int)(__funnelshift_rc(q0, q1, shift) & mask); 49 | } 50 | 51 | #endif 52 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllamav2/cuda/util.cuh: -------------------------------------------------------------------------------- 1 | 2 | #define DIVIDE(x, size) (((x) + (size) - 1) / (size)) 3 | 4 | #define DBGS(__x) printf("%s\n", __x) 5 | #define DBGI(__x) printf("%s: %i\n", #__x, __x) 6 | #define DBGI2(__x, __y) printf("%s, %s: %i, %i\n", #__x, #__y, __x, __y) 7 | #define DBGI3(__x, __y, __z) printf("%s, %s, %s: %i, %i, %i\n", #__x, #__y, #__z, __x, __y, __z) 8 | #define DBGX(__x) printf("%s: %x\n", #__x, __x) 9 | #define DBGX2(__x, __y) printf("%s, %s: %x, %x\n", #__x, #__y, __x, __y) 10 | #define DBGX3(__x, __y, __z) 
printf("%s, %s, %s: %x, %x, %x\n", #__x, #__y, #__z, __x, __y, __z) 11 | #define DBGF(__x) printf("%s: %f\n", #__x, __x) 12 | #define DBGF2(__x, __y) printf("%s, %s: %f, %f\n", #__x, #__y, __x, __y) 13 | #define DBGF3(__x, __y, __z) printf("%s, %s, %s: %f, %f, %f\n", #__x, #__y, #__z, __x, __y, __z) 14 | #define DBGH(__x) printf("%s: %f\n", #__x, __half2float(__x)) 15 | #define DBGH2(__x, __y) printf("%s, %s: %f, %f\n", #__x, #__y, __half2float(__x), __half2float(__y)) 16 | #define DBGH3(__x, __y, __z) printf("%s, %s, %s: %f, %f, %f\n", #__x, #__y, #__z, __half2float(__x), __half2float(__y), __half2float(__z)) 17 | 18 | #define DBGIH(__x, __y) printf("%s, %s: %i, %f\n", #__x, #__y, __x, __half2float(__y)) 19 | #define DBGIH2(__x, __y, __z) printf("%s, %s, %s: %i, %f, %f\n", #__x, #__y, #__z, __x, __half2float(__y), __half2float(__z)) 20 | 21 | __forceinline__ __device__ half dq_scale_(const int qs, const half max_scale) 22 | { 23 | half qs_h = __hmul(__int2half_rn(qs + 1), __float2half_rn(1.0f / 16.0f)); 24 | qs_h = __hmul(qs_h, qs_h); 25 | qs_h = __hmul(qs_h, max_scale); 26 | return qs_h; 27 | } 28 | 29 | __forceinline__ __device__ float clamp(float x, float a, float b) 30 | { 31 | return fmaxf(a, fminf(b, x)); 32 | } 33 | 34 | #define cuda_check(ans) { gpu_assert((ans), __FILE__, __LINE__); } 35 | inline void gpu_assert(cudaError_t code, const char *file, int line, bool abort=true) 36 | { 37 | if (code != cudaSuccess) 38 | { 39 | fprintf(stderr,"CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line); 40 | if (abort) exit(code); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/layernorm/layernorm.cu: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Adapted from NVIDIA FasterTransformer: 4 | https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/kernels/layernorm_kernels.cu 5 | 6 | */ 7 | 8 | #include 9 | #include 10 | #include "reduction.cuh" 11 | #include "layernorm.h" 12 | #include 13 | #include 14 | 15 | static inline __device__ float to_float(half src) 16 | { 17 | return __half2float(src); 18 | } 19 | 20 | static inline __device__ float to_float(float src) 21 | { 22 | return src; 23 | } 24 | 25 | template 26 | __global__ void generalT5LayerNorm( 27 | const T* __restrict input, const T* __restrict gamma, T* output, const float layernorm_eps, int m, int n) 28 | { 29 | // layernorm module in the T5 style No bias and no subtraction of mean. 
30 | const int tid = threadIdx.x; 31 | 32 | __shared__ float s_variance; 33 | float variance = 0.0f; 34 | 35 | float local_var_sum = 0.0f; 36 | for (int i = tid; i < n; i += blockDim.x) { 37 | float diff = to_float(__ldg(&input[blockIdx.x * n + i])); 38 | local_var_sum += diff * diff; 39 | } 40 | variance = blockReduceSum(local_var_sum); 41 | 42 | if (threadIdx.x == 0) { 43 | s_variance = rsqrtf(variance / (float)n + layernorm_eps); 44 | } 45 | __syncthreads(); 46 | 47 | for (int i = tid; i < n; i += blockDim.x) { 48 | output[blockIdx.x * n + i] = 49 | clamp_inf_for_half((to_float(input[blockIdx.x * n + i]) * s_variance) * to_float(__ldg(&gamma[i]))); 50 | } 51 | } 52 | 53 | 54 | template 55 | void invokeGeneralT5LayerNorm(T* out, 56 | const T* input, 57 | const T* gamma, 58 | // const T* beta, 59 | const float layernorm_eps, 60 | const int m, 61 | const int n) 62 | { 63 | dim3 grid(m); 64 | dim3 block(min(n, 1024)); 65 | 66 | /* For general cases, n is equal to hidden_units, e.g., 512/1024. 67 | Since we have warp shuffle inside the code, block.x % 32 should be 0. 68 | */ 69 | if (n % 32 != 0) { 70 | block.x = 1024; 71 | } 72 | 73 | block.x = block.x / (4 / sizeof(T)); // if using half, only need half of block.x 74 | 75 | /* should pay attention to the rsqrt precision*/ 76 | generalT5LayerNorm<<>>(input, gamma, out, layernorm_eps, m, n); // For gpt-3 77 | } 78 | 79 | template void invokeGeneralT5LayerNorm(half* out, 80 | const half* input, 81 | const half* gamma, 82 | // const half* beta, 83 | const float layernorm_eps, 84 | const int m, 85 | const int n); 86 | 87 | template void invokeGeneralT5LayerNorm(float* out, 88 | const float* input, 89 | const float* gamma, 90 | // const half* beta, 91 | const float layernorm_eps, 92 | const int m, 93 | const int n); 94 | 95 | 96 | 97 | // input b, n, c 98 | void layernorm_forward_cuda( 99 | torch::Tensor _input, 100 | torch::Tensor _gamma, 101 | torch::Tensor _out, 102 | float eps) 103 | { 104 | int m = _input.size(0) * _input.size(1); 105 | int n = _input.size(2); 106 | const at::cuda::OptionalCUDAGuard device_guard(device_of(_input)); 107 | 108 | auto input = reinterpret_cast(_input.data_ptr()); 109 | auto gamma = reinterpret_cast(_gamma.data_ptr()); 110 | auto out = reinterpret_cast(_out.data_ptr()); 111 | 112 | invokeGeneralT5LayerNorm(out, input, gamma, eps, m, n); 113 | } 114 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/layernorm/layernorm.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void layernorm_forward_cuda(torch::Tensor _input, torch::Tensor _gamma, torch::Tensor _out, float eps); 4 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/layernorm/reduction.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Adapted from NVIDIA FasterTransformer: 4 | https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/kernels/reduce_kernel_utils.cuh 5 | */ 6 | 7 | #pragma once 8 | #include 9 | #if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0)) 10 | #include 11 | #else 12 | #include 13 | #endif 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #define HALF_FLT_MAX 65504.F 20 | #define FINAL_MASK 0xffffffff 21 | 22 | 23 | template 24 | inline __device__ T add(T 
a, T b) { 25 | return a + b; 26 | } 27 | 28 | template<> 29 | inline __device__ half2 add(half2 a, half2 b) { 30 | return __hadd2(a, b); 31 | } 32 | 33 | template<> 34 | inline __device__ half add(half a, half b) { 35 | return __hadd(a, b); 36 | } 37 | 38 | template 39 | __inline__ __device__ T warpReduceSum(T val) 40 | { 41 | #pragma unroll 42 | for (int mask = 16; mask > 0; mask >>= 1) 43 | val = add(val, __shfl_xor_sync(FINAL_MASK, val, mask, 32)); //__shfl_sync bf16 return float when sm < 80 44 | return val; 45 | } 46 | 47 | /* Calculate the sum of all elements in a block */ 48 | template 49 | __inline__ __device__ T blockReduceSum(T val) 50 | { 51 | static __shared__ T shared[32]; 52 | int lane = threadIdx.x & 0x1f; 53 | int wid = threadIdx.x >> 5; 54 | 55 | val = warpReduceSum(val); 56 | 57 | if (lane == 0) 58 | shared[wid] = val; 59 | 60 | __syncthreads(); 61 | 62 | // Modify from blockDim.x << 5 to blockDim.x / 32. to prevent 63 | // blockDim.x is not divided by 32 64 | val = (threadIdx.x < (blockDim.x / 32.f)) ? shared[lane] : (T)(0.0f); 65 | val = warpReduceSum(val); 66 | 67 | return val; 68 | } 69 | 70 | 71 | template 72 | __device__ __forceinline__ T clamp_inf_for_half(const float input) 73 | { 74 | return input; 75 | } 76 | 77 | template<> 78 | __device__ __forceinline__ half clamp_inf_for_half(const float input) 79 | { 80 | // clamp inf values to enable fp16 training 81 | return input > 0.0f ? __float2half(min(input, HALF_FLT_MAX - 1000)) : __float2half(max(input, -HALF_FLT_MAX + 1000)); 82 | } 83 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/position_embedding/pos_encoding.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | void rotary_embedding_neox( 5 | torch::Tensor& positions, 6 | torch::Tensor& query, 7 | torch::Tensor& key, 8 | int head_size, 9 | torch::Tensor& cos_sin_cache); -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/position_embedding/pos_encoding_kernels.cu: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Adapted from the VLLM project: 4 | https://github.com/vllm-project/vllm/blob/main/csrc/pos_encoding_kernels.cu 5 | 6 | */ 7 | 8 | #include 9 | #include 10 | #include "pos_encoding.h" 11 | 12 | template 13 | __global__ void rotary_embedding_neox_kernel( 14 | const int64_t* __restrict__ positions, // [num_tokens] 15 | scalar_t* __restrict__ query, // [num_tokens, num_heads, head_size] 16 | scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] 17 | const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2] 18 | const int rot_dim, 19 | const int stride, 20 | const int num_heads, 21 | const int head_size) { 22 | // Each thread block is responsible for one token. 
23 | const int token_idx = blockIdx.x; 24 | int64_t pos = positions[token_idx]; 25 | const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; 26 | 27 | const int embed_dim = rot_dim / 2; 28 | const int n = num_heads * embed_dim; 29 | for (int i = threadIdx.x; i < n; i += blockDim.x) { 30 | const int head_idx = i / embed_dim; 31 | const int token_head = token_idx * stride + head_idx * head_size; 32 | 33 | const int rot_offset = i % embed_dim; 34 | const int x_index = rot_offset; 35 | const int y_index = embed_dim + rot_offset; 36 | 37 | const int out_x = token_idx * stride + head_idx * head_size + x_index; 38 | const int out_y = token_idx * stride + head_idx * head_size + y_index; 39 | 40 | const scalar_t cos = __ldg(cache_ptr + x_index); 41 | const scalar_t sin = __ldg(cache_ptr + y_index); 42 | 43 | const scalar_t q_x = query[token_head + x_index]; 44 | const scalar_t q_y = query[token_head + y_index]; 45 | query[out_x] = q_x * cos - q_y * sin; 46 | query[out_y] = q_y * cos + q_x * sin; 47 | 48 | const scalar_t k_x = key[token_head + x_index]; 49 | const scalar_t k_y = key[token_head + y_index]; 50 | key[out_x] = k_x * cos - k_y * sin; 51 | key[out_y] = k_y * cos + k_x * sin; 52 | } 53 | } 54 | 55 | void rotary_embedding_neox( 56 | torch::Tensor& positions, // [b, num_tokens] 57 | torch::Tensor& query, // [b, num_tokens, 1, num_heads, head_size] 58 | torch::Tensor& key, // [b, num_tokens, 1, num_heads, head_size] 59 | int head_size, 60 | torch::Tensor& cos_sin_cache) // [max_position, rot_dim] 61 | { 62 | int num_tokens = query.size(0) * query.size(1); 63 | int rot_dim = cos_sin_cache.size(1); 64 | int num_heads = query.size(-2); 65 | int stride = num_heads * head_size; 66 | // TORCH_CHECK(stride == key.stride(0)); 67 | 68 | dim3 grid(num_tokens); 69 | dim3 block(std::min(num_heads * rot_dim / 2, 512)); 70 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 71 | AT_DISPATCH_FLOATING_TYPES_AND2( 72 | at::ScalarType::Half, 73 | at::ScalarType::BFloat16, 74 | query.scalar_type(), 75 | "rotary_embedding_neox", 76 | [&] { 77 | rotary_embedding_neox_kernel<<>>( 78 | positions.data_ptr(), 79 | query.data_ptr(), 80 | key.data_ptr(), 81 | cos_sin_cache.data_ptr(), 82 | rot_dim, 83 | stride, 84 | num_heads, 85 | head_size); 86 | }); 87 | } 88 | 89 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/pybind_awq.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "layernorm/layernorm.h" 4 | #include "quantization/gemm_cuda.h" 5 | #include "quantization/gemv_cuda.h" 6 | #include "position_embedding/pos_encoding.h" 7 | #include "vllm/moe_alig_block.h" 8 | #include "vllm/activation.h" 9 | #include "vllm/topk_softmax_kernels.h" 10 | 11 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 12 | { 13 | m.def("layernorm_forward_cuda", &layernorm_forward_cuda, "FasterTransformer layernorm kernel"); 14 | m.def("gemm_forward_cuda", &gemm_forward_cuda, "Quantized GEMM kernel."); 15 | m.def("grouped_gemm_forward", &grouped_gemm_forward, "Quantized grouped GEMM kernel."); 16 | m.def("gemmv2_forward_cuda", &gemmv2_forward_cuda, "Quantized v2 GEMM kernel."); 17 | m.def("gemv_forward_cuda", &gemv_forward_cuda, "Quantized GEMV kernel."); 18 | m.def("rotary_embedding_neox", &rotary_embedding_neox, "Apply GPT-NeoX style rotary embedding to query and key"); 19 | m.def("dequantize_weights_cuda", &dequantize_weights_cuda, "Dequantize weights."); 20 
| m.def("moe_alig_block_size", &moe_alig_block_size, "Aligning the number of tokens to be processed by each expert such that it is divisible by the block size."); 21 | m.def("silu_and_mul", &silu_and_mul, "Activation function used in SwiGLU."); 22 | m.def("topk_softmax", &topk_softmax, "Computes fused topk and softmax operation."); 23 | } -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/pybind_awq_ft.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "attention/ft_attention.h" 4 | 5 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 6 | { 7 | m.def("single_query_attention", &single_query_attention, "Attention with a single query", 8 | py::arg("q"), py::arg("k"), py::arg("v"), py::arg("k_cache"), py::arg("v_cache"), 9 | py::arg("length_per_sample_"), py::arg("alibi_slopes_"), py::arg("timestep"), py::arg("rotary_embedding_dim")=0, 10 | py::arg("rotary_base")=10000.0f, py::arg("neox_rotary_style")=true); 11 | } -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/pybind_awq_v2.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "quantization_new/gemm/gemm_cuda.h" 4 | #include "quantization_new/gemv/gemv_cuda.h" 5 | 6 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 7 | { 8 | m.def("gemm_forward_cuda_prefill", &gemm_forward_cuda_prefill, "New quantized GEMM kernel."); 9 | m.def("gemv_forward_cuda_decode", &gemv_forward_cuda_decode, "New quantized GEMM kernel."); 10 | } -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/quantization/dequantize.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Modified from NVIDIA FasterTransformer: https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h 3 | 4 | @article{lin2023awq, 5 | title={AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration}, 6 | author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, Shang and Dang, Xingyu and Han, Song}, 7 | journal={arXiv}, 8 | year={2023} 9 | } 10 | */ 11 | 12 | #pragma once 13 | 14 | 15 | __device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source) 16 | { 17 | uint4 result; 18 | 19 | uint32_t* h = reinterpret_cast(&result); 20 | uint32_t const i4s = reinterpret_cast(source); 21 | 22 | // First, we extract the i4s and construct an intermediate fp16 number. 23 | static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; 24 | static constexpr uint32_t BOTTOM_MASK = 0x000f000f; 25 | static constexpr uint32_t TOP_MASK = 0x00f000f0; 26 | static constexpr uint32_t I4s_TO_F16s_MAGIC_NUM = 0x64006400; 27 | 28 | // Note that the entire sequence only requires 1 shift instruction. This is thanks to the register packing 29 | // format and the fact that we force our integers to be unsigned, and account for this in the fp16 subtractions. 30 | // In addition, I exploit the fact that sub and fma have the same throughput in order to convert elt_23 and 31 | // elt_67 to fp16 without having to shift them to the bottom bits before hand. 32 | 33 | // Shift right by 8 to now consider elt_45 and elt_67. 
Issue first to hide RAW dependency if we issue 34 | // immediately before required. 35 | const uint32_t top_i4s = i4s >> 8; 36 | // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400 37 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 38 | : "=r"(h[0]) 39 | : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 40 | // Extract elt_23 (i4s & 0x00f000f0) | 0x64006400 41 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 42 | : "=r"(h[1]) 43 | : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 44 | // Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400 45 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 46 | : "=r"(h[2]) 47 | : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 48 | // Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400 49 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 50 | : "=r"(h[3]) 51 | : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 52 | 53 | // I use inline PTX below because I am not sure if the compiler will emit float2half instructions if I use the 54 | // half2 ctor. In this case, I chose performance reliability over code readability. 55 | 56 | // This is the half2 {1032, 1032} represented as an integer. 57 | // static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64086408; 58 | // Haotian: subtract {1024, 1024} instead, we do not need to map to [-8, 7] 59 | static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64006400; 60 | // This is the half2 {1 / 16, 1 / 16} represented as an integer. 61 | static constexpr uint32_t ONE_SIXTEENTH = 0x2c002c00; 62 | // This is the half2 {-72, -72} represented as an integer. 63 | // static constexpr uint32_t NEG_72 = 0xd480d480; 64 | // Haotian: Let's use {-64, -64}. 65 | static constexpr uint32_t NEG_64 = 0xd400d400; 66 | 67 | // Finally, we construct the output numbers. 
68 | // Convert elt_01 69 | asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM)); 70 | // Convert elt_23 71 | asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[1]) : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); 72 | // Convert elt_45 73 | asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM)); 74 | // Convert elt_67 75 | asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); 76 | 77 | return result; 78 | } 79 | 80 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/quantization/gemm_cuda.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | torch::Tensor gemm_forward_cuda(torch::Tensor _in_feats, torch::Tensor _kernel, 4 | torch::Tensor _scaling_factors, torch::Tensor _zeros, int split_k_iters); 5 | 6 | torch::Tensor grouped_gemm_forward( 7 | torch::Tensor _in_feats, 8 | torch::Tensor _kernel, 9 | torch::Tensor _scaling_factors, 10 | torch::Tensor _zeros, 11 | torch::Tensor _topk_weights, 12 | torch::Tensor _sorted_token_ids_ptr, 13 | torch::Tensor _expert_ids_ptr, 14 | torch::Tensor _num_tokens_post_padded, 15 | bool mul_weights, 16 | int split_k_iters); 17 | 18 | torch::Tensor gemmv2_forward_cuda(torch::Tensor _in_feats, torch::Tensor _kernel, 19 | torch::Tensor _scaling_factors, torch::Tensor _zeros, int group_size, int split_k_iters); 20 | 21 | // Source - https://github.com/compressa-ai/AutoAWQ/blob/6673333456b8871522b11a7fb110de612edfdf95/awq_cuda/quantization/gemm_cuda.h#L9C1-L10C106 22 | torch::Tensor dequantize_weights_cuda(torch::Tensor _kernel, 23 | torch::Tensor _scaling_factors, torch::Tensor _zeros, int split_k_iters, int thx, int thy, bool dbg); -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/quantization/gemv_cuda.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | torch::Tensor gemv_forward_cuda( 5 | torch::Tensor _in_feats, 6 | torch::Tensor _kernel, 7 | torch::Tensor _scaling_factors, 8 | torch::Tensor _zeros, 9 | int group_size); 10 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/quantization_new/dequantize.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Modified from NVIDIA FasterTransformer: https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h 3 | 4 | @article{lin2023awq, 5 | title={AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration}, 6 | author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, Shang and Dang, Xingyu and Han, Song}, 7 | journal={arXiv}, 8 | year={2023} 9 | } 10 | */ 11 | #include 12 | #pragma once 13 | 14 | __inline__ __device__ void dequantize_s4_to_fp16x2(half2 const &source, uint4 *result) 15 | { 16 | // uint4 result; 17 | 18 | uint32_t *h = reinterpret_cast(result); 19 | uint32_t const i4s = reinterpret_cast(source); 20 | 21 | // First, we extract the i4s and construct an intermediate fp16 number. 
22 | static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; 23 | static constexpr uint32_t BOTTOM_MASK = 0x000f000f; 24 | static constexpr uint32_t TOP_MASK = 0x00f000f0; 25 | static constexpr uint32_t I4s_TO_F16s_MAGIC_NUM = 0x64006400; 26 | 27 | // Note that the entire sequence only requires 1 shift instruction. This is thanks to the register packing 28 | // format and the fact that we force our integers to be unsigned, and account for this in the fp16 subtractions. 29 | // In addition, I exploit the fact that sub and fma have the same throughput in order to convert elt_23 and 30 | // elt_67 to fp16 without having to shift them to the bottom bits before hand. 31 | 32 | // Shift right by 8 to now consider elt_45 and elt_67. Issue first to hide RAW dependency if we issue 33 | // immediately before required. 34 | const uint32_t top_i4s = i4s >> 8; 35 | // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400 36 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 37 | : "=r"(h[0]) 38 | : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 39 | // Extract elt_23 (i4s & 0x00f000f0) | 0x64006400 40 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 41 | : "=r"(h[1]) 42 | : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 43 | // Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400 44 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 45 | : "=r"(h[2]) 46 | : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 47 | // Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400 48 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 49 | : "=r"(h[3]) 50 | : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 51 | 52 | // I use inline PTX below because I am not sure if the compiler will emit float2half instructions if I use the 53 | // half2 ctor. In this case, I chose performance reliability over code readability. 54 | 55 | // This is the half2 {1032, 1032} represented as an integer. 56 | // static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64086408; 57 | // Haotian: subtract {1024, 1024} instead, we do not need to map to [-8, 7] 58 | static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64006400; 59 | // This is the half2 {1 / 16, 1 / 16} represented as an integer. 60 | static constexpr uint32_t ONE_SIXTEENTH = 0x2c002c00; 61 | // This is the half2 {-72, -72} represented as an integer. 62 | // static constexpr uint32_t NEG_72 = 0xd480d480; 63 | // Haotian: Let's use {-64, -64}. 64 | static constexpr uint32_t NEG_64 = 0xd400d400; 65 | 66 | // Finally, we construct the output numbers. 
67 | // Convert elt_01 68 | asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM)); 69 | // Convert elt_23 70 | asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[1]) : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); 71 | // Convert elt_45 72 | asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM)); 73 | // Convert elt_67 74 | asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); 75 | 76 | // return result; 77 | } -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/quantization_new/gemm/gemm_cuda.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | torch::Tensor gemm_forward_cuda_prefill(torch::Tensor _in_feats, torch::Tensor _kernel, torch::Tensor _scales, torch::Tensor _zeros); 4 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/quantization_new/gemv/gemv_cuda.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | torch::Tensor gemv_forward_cuda_decode( 5 | torch::Tensor _in_feats, 6 | torch::Tensor _kernel, 7 | torch::Tensor _scaling_factors, 8 | torch::Tensor _zeros, 9 | int m, 10 | int n, 11 | int k, 12 | int group_size); 13 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/vllm/activation.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #define VLLM_LDG(arg) *(arg) 6 | 7 | #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ 8 | AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ 9 | AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ 10 | AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) 11 | 12 | #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) 
\ 13 | AT_DISPATCH_SWITCH( \ 14 | TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) 15 | 16 | template 17 | __device__ __forceinline__ T silu(const T& x) { 18 | // x * sigmoid(x) 19 | return (T) (((float) x) / (1.0f + expf((float) -x))); 20 | } 21 | 22 | template 23 | __global__ void silu_and_mul_kernel( 24 | scalar_t* __restrict__ out, // [..., d] 25 | const scalar_t* __restrict__ input, // [..., 2, d] 26 | const int d) { 27 | const int64_t token_idx = blockIdx.x; 28 | for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { 29 | const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]); 30 | const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]); 31 | out[token_idx * d + idx] = silu(x) * y; 32 | } 33 | } 34 | 35 | 36 | void silu_and_mul( 37 | torch::Tensor& out, // [..., d] 38 | torch::Tensor& input) // [..., 2 * d] 39 | { 40 | int64_t num_tokens = input.numel() / input.size(-1); 41 | int d = input.size(-1) / 2; 42 | 43 | dim3 grid(num_tokens); 44 | dim3 block(std::min(d, 1024)); 45 | const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); 46 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 47 | VLLM_DISPATCH_FLOATING_TYPES( 48 | input.scalar_type(), 49 | "silu_and_mul_kernel", 50 | [&] { 51 | silu_and_mul_kernel<<>>( 52 | out.data_ptr(), 53 | input.data_ptr(), 54 | d); 55 | }); 56 | } -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/vllm/activation.h: -------------------------------------------------------------------------------- 1 | void silu_and_mul( 2 | torch::Tensor& out, 3 | torch::Tensor& input); -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/vllm/moe_alig_block.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | const static size_t NUM_MAX_EXPERTS = 64; 9 | 10 | #define VLLM_DISPATCH_CASE_INTEGRAL_TYPES(...) \ 11 | AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) \ 12 | AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \ 13 | AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \ 14 | AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \ 15 | AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) 16 | 17 | #define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) 
\ 18 | AT_DISPATCH_SWITCH( \ 19 | TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__)) 20 | 21 | template 22 | __global__ void moe_alig_block_size_kernel(scalar_t *__restrict__ topk_ids, 23 | int32_t *sorted_token_ids, 24 | int32_t *expert_ids, 25 | int32_t *total_tokens_post_pad, 26 | int32_t num_experts, 27 | int32_t block_size, 28 | size_t numel) { 29 | const size_t tokens_per_thread = ((numel + blockDim.x - 1) / blockDim.x); 30 | const size_t start_idx = threadIdx.x * tokens_per_thread; 31 | __shared__ int32_t tokens_cnts[NUM_MAX_EXPERTS + 1][NUM_MAX_EXPERTS]; 32 | __shared__ int32_t cumsum[NUM_MAX_EXPERTS + 1]; 33 | for(int i = 0;i < num_experts;i++){ 34 | tokens_cnts[threadIdx.x + 1][i] = 0; 35 | } 36 | 37 | for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { 38 | ++tokens_cnts[threadIdx.x + 1][topk_ids[i]]; 39 | } 40 | 41 | __syncthreads(); 42 | 43 | tokens_cnts[0][threadIdx.x] = 0; 44 | for(int i=1;i<=blockDim.x;++i){ 45 | tokens_cnts[i][threadIdx.x] += tokens_cnts[i-1][threadIdx.x]; 46 | } 47 | 48 | __syncthreads(); 49 | 50 | if(threadIdx.x ==0){ 51 | cumsum[0] = 0; 52 | for(int i=1;i<=num_experts;++i){ 53 | cumsum[i] = cumsum[i-1] + (tokens_cnts[blockDim.x][i - 1] + block_size - 1) / block_size * block_size; 54 | } 55 | *total_tokens_post_pad = cumsum[num_experts]; 56 | } 57 | 58 | __syncthreads(); 59 | 60 | for(int i= cumsum[threadIdx.x];i<<<1, num_experts, 0, stream>>>( 88 | topk_ids.data_ptr(), 89 | sorted_token_ids.data_ptr(), 90 | experts_ids.data_ptr(), 91 | num_tokens_post_pad.data_ptr(), 92 | num_experts, 93 | block_size, 94 | topk_ids.numel()); 95 | }); 96 | } -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/vllm/moe_alig_block.h: -------------------------------------------------------------------------------- 1 | void moe_alig_block_size( 2 | torch::Tensor topk_ids, 3 | int num_experts, 4 | int block_size, 5 | torch::Tensor sorted_token_ids, 6 | torch::Tensor experts_ids, 7 | torch::Tensor num_tokens_post_pad 8 | ); -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/vllm/topk_softmax_kernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | void topk_softmax( 6 | torch::Tensor& topk_weights, 7 | torch::Tensor& topk_indices, 8 | torch::Tensor& token_expert_indices, 9 | torch::Tensor& gating_output); -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/scripts/download_wheels.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Set variables 4 | AWQ_KERNELS_VERSION="0.0.6" 5 | RELEASE_URL="https://api.github.com/repos/casper-hansen/AutoAWQ_kernels/releases/tags/v${AWQ_KERNELS_VERSION}" 6 | 7 | # Create a directory to download the wheels 8 | mkdir -p dist 9 | cd dist 10 | 11 | # Download all the wheel files from the GitHub release 12 | # excluding ones with '+cu' (%2B is + but encoded) 13 | curl -s $RELEASE_URL | \ 14 | jq -r ".assets[].browser_download_url" | \ 15 | grep '\.whl' | \ 16 | grep -v '%2Bcu' | \ 17 | grep -v '%2Brocm' | \ 18 | xargs -n 1 -P 4 wget 19 | 20 | # Rename the wheels from 'linux_x86_64' to 'manylinux_x86_64' 21 | for file in *linux_x86_64.whl; do 22 | mv "$file" "$(echo $file | sed 
's/linux_x86_64/manylinux2014_x86_64/')" 23 | done 24 | 25 | cd .. 26 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 MIT HAN Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/compression/quantization/AutoAWQ/__init__.py -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.2.4" 2 | 3 | import sys 4 | 5 | transformers_path = "/mnt/petrelfs/dongdaize.d/workspace/compression/src" 6 | sys.path = [transformers_path] + sys.path 7 | 8 | from awq.models.auto import AutoAWQForCausalLM 9 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from awq.evaluation.eval_utils import ( 2 | evaluate_perplexity, 3 | eval_librispeech, 4 | eval_mmlu, 5 | ) 6 | from awq.evaluation.humaneval_utils import eval_humaneval 7 | from awq.evaluation.kl_divergence import eval_kl_divergence 8 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .mpt import MptAWQForCausalLM 2 | from .llama import LlamaAWQForCausalLM 3 | from .opt import OptAWQForCausalLM 4 | from .falcon import FalconAWQForCausalLM 5 | from .bloom import BloomAWQForCausalLM 6 | from .gptj import GPTJAWQForCausalLM 7 | from .gpt_bigcode import GptBigCodeAWQForCausalLM 8 | from .mistral import MistralAWQForCausalLM 9 | from .gpt_neox import GPTNeoXAWQForCausalLM 10 | from .aquila import AquilaAWQForCausalLM 11 | from .yi import YiAWQForCausalLM 12 | from .qwen import QwenAWQForCausalLM 13 | from .baichuan import 
BaichuanAWQForCausalLM 14 | from .llava import LlavaAWQForCausalLM 15 | from .mixtral import MixtralAWQForCausalLM 16 | from .qwen2 import Qwen2AWQForCausalLM 17 | from .gemma import GemmaAWQForCausalLM 18 | from .stablelm import StableLmAWQForCausalLM 19 | from .deepseek import DeepseekAWQForCausalLM 20 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/models/_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from typing import Dict, Optional, List 4 | from dataclasses import dataclass, field 5 | from transformers.utils.hub import PushToHubMixin, cached_file 6 | 7 | 8 | @dataclass 9 | class AwqConfig(PushToHubMixin): 10 | quant_method: str = field(default="awq") 11 | zero_point: bool = field(default=True) 12 | q_group_size: int = field(default=128) 13 | w_bit: int = field(default=4) 14 | version: str = field(default="gemm") 15 | config_file_name = "config.json" 16 | modules_to_not_convert: Optional[List] = None 17 | 18 | @classmethod 19 | def from_dict(cls, quant_config: Dict = {}): 20 | if not quant_config: 21 | quant_config = cls() 22 | else: 23 | quant_config = cls(**quant_config) 24 | quant_config.version = quant_config.version.lower() 25 | 26 | return quant_config 27 | 28 | @classmethod 29 | def from_pretrained(cls, save_dir: str, **kwargs): 30 | cache_dir = kwargs.pop("cache_dir", None) 31 | force_download = kwargs.pop("force_download", False) 32 | resume_download = kwargs.pop("resume_download", False) 33 | proxies = kwargs.pop("proxies", None) 34 | local_files_only = kwargs.pop("local_files_only", False) 35 | use_auth_token = kwargs.pop("use_auth_token", None) 36 | revision = kwargs.pop("revision", None) 37 | subfolder = kwargs.pop("subfolder", None) 38 | commit_hash = kwargs.pop("_commit_hash", None) 39 | 40 | if os.path.isdir(save_dir): # Local 41 | resolved_config_file = os.path.join(save_dir, cls.config_file_name) 42 | else: # Remote 43 | resolved_config_file = cached_file( 44 | save_dir, 45 | cls.config_file_name, 46 | cache_dir=cache_dir, 47 | force_download=force_download, 48 | resume_download=resume_download, 49 | proxies=proxies, 50 | use_auth_token=use_auth_token, 51 | revision=revision, 52 | local_files_only=local_files_only, 53 | subfolder=subfolder, 54 | _raise_exceptions_for_missing_entries=False, 55 | _raise_exceptions_for_connection_errors=False, 56 | _commit_hash=commit_hash, 57 | ) 58 | 59 | quant_config = None 60 | if os.path.exists(resolved_config_file): 61 | with open(resolved_config_file, "r", encoding="utf-8") as file: 62 | loaded_config = json.loads(file.read()) 63 | 64 | quant_config = loaded_config.get("quantization_config") 65 | 66 | if quant_config is not None: 67 | awq_config = cls.from_transformers_dict(cls, quant_config) 68 | quant_config = cls(**awq_config) 69 | 70 | if quant_config is None: 71 | quant_config = cls() 72 | 73 | return quant_config 74 | 75 | def to_dict(self): 76 | return { 77 | "zero_point": self.zero_point, 78 | "q_group_size": self.q_group_size, 79 | "w_bit": self.w_bit, 80 | "version": self.version, 81 | "modules_to_not_convert": self.modules_to_not_convert, 82 | } 83 | 84 | def to_transformers_dict(self): 85 | return { 86 | "quant_method": self.quant_method, 87 | "zero_point": self.zero_point, 88 | "group_size": self.q_group_size, 89 | "bits": self.w_bit, 90 | "version": self.version.lower(), 91 | "modules_to_not_convert": self.modules_to_not_convert, 92 | } 93 | 94 | def 
from_transformers_dict(self, transformers_dict: Dict): 95 | return { 96 | "quant_method": transformers_dict.get("quant_method"), 97 | "zero_point": transformers_dict.get("zero_point"), 98 | "q_group_size": transformers_dict.get("group_size"), 99 | "w_bit": transformers_dict.get("bits"), 100 | "version": transformers_dict.get("version"), 101 | "modules_to_not_convert": transformers_dict.get("modules_to_not_convert"), 102 | } 103 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/models/bloom.py: -------------------------------------------------------------------------------- 1 | from .base import BaseAWQForCausalLM 2 | from transformers.models.bloom.modeling_bloom import BloomForCausalLM, BloomBlock 3 | 4 | 5 | class BloomAWQForCausalLM(BaseAWQForCausalLM): 6 | layer_type = "BloomBlock" 7 | 8 | @staticmethod 9 | def get_model_layers(model: BloomForCausalLM): 10 | return model.transformer.h 11 | 12 | @staticmethod 13 | def get_act_for_scaling(module: BloomBlock): 14 | return dict( 15 | is_scalable=True, 16 | scale_name="mlp.gelu_impl", 17 | scale_layer=module.mlp.gelu_impl, 18 | scale_shape=module.mlp.dense_h_to_4h.out_features, 19 | ) 20 | 21 | @staticmethod 22 | def move_embed(model: BloomForCausalLM, device: str): 23 | model.transformer.word_embeddings = model.transformer.word_embeddings.to(device) 24 | model.transformer.word_embeddings_layernorm = ( 25 | model.transformer.word_embeddings_layernorm.to(device) 26 | ) 27 | 28 | @staticmethod 29 | def get_layers_for_scaling(module: BloomBlock, input_feat, module_kwargs): 30 | layers = [] 31 | 32 | # attention input 33 | layers.append( 34 | dict( 35 | prev_op=module.input_layernorm, 36 | layers=[module.self_attention.query_key_value], 37 | inp=input_feat["self_attention.query_key_value"], 38 | module2inspect=module, 39 | kwargs=module_kwargs, 40 | ) 41 | ) 42 | # attention out 43 | # Please refer to https://github.com/mit-han-lab/llm-awq/issues/2#issuecomment-1606297469 44 | """ 45 | scales_list.append(_auto_get_scale( 46 | prev_op=module.self_attention.query_key_value, 47 | layers=[module.self_attention.dense], 48 | inp=input_feat['self_attention.dense'], 49 | )) 50 | """ 51 | # linear 1 52 | layers.append( 53 | dict( 54 | prev_op=module.post_attention_layernorm, 55 | layers=[module.mlp.dense_h_to_4h], 56 | inp=input_feat["mlp.dense_h_to_4h"], 57 | module2inspect=module, 58 | kwargs=module_kwargs, 59 | ) 60 | ) 61 | # linear 2 62 | layers.append( 63 | dict( 64 | prev_op=module.mlp.gelu_impl, 65 | layers=[module.mlp.dense_4h_to_h], 66 | inp=input_feat["mlp.dense_4h_to_h"], 67 | ) 68 | ) 69 | 70 | return layers 71 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/models/deepseek_moe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/compression/quantization/AutoAWQ/awq/models/deepseek_moe/__init__.py -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/models/gpt_bigcode.py: -------------------------------------------------------------------------------- 1 | from .base import BaseAWQForCausalLM 2 | from transformers.models.gpt_bigcode.modeling_gpt_bigcode import ( 3 | GPTBigCodeForCausalLM, 4 | GPTBigCodeBlock as OldGptBigCodeBlock, 5 | ) 6 | 7 | 8 | 
class GptBigCodeAWQForCausalLM(BaseAWQForCausalLM): 9 | layer_type = "GPTBigCodeBlock" 10 | max_seq_len_key = "n_positions" 11 | 12 | @staticmethod 13 | def get_model_layers(model: GPTBigCodeForCausalLM): 14 | return model.transformer.h 15 | 16 | @staticmethod 17 | def get_act_for_scaling(module: OldGptBigCodeBlock): 18 | return dict( 19 | is_scalable=True, 20 | scale_name="mlp.act", 21 | scale_layer=module.mlp.act, 22 | scale_shape=module.mlp.c_fc.out_features, 23 | ) 24 | 25 | @staticmethod 26 | def move_embed(model: GPTBigCodeForCausalLM, device): 27 | model.transformer.wte = model.transformer.wte.to(device) 28 | model.transformer.wpe = model.transformer.wpe.to(device) 29 | model.transformer.drop = model.transformer.drop.to(device) 30 | 31 | @staticmethod 32 | def get_layers_for_scaling(module: OldGptBigCodeBlock, input_feat, module_kwargs): 33 | layers = [] 34 | 35 | # attention input 36 | layers.append( 37 | dict( 38 | prev_op=module.ln_1, 39 | layers=[module.attn.c_attn], 40 | inp=input_feat["attn.c_attn"], 41 | module2inspect=module.attn, 42 | kwargs=module_kwargs, 43 | ) 44 | ) 45 | 46 | # linear 1 47 | layers.append( 48 | dict( 49 | prev_op=module.ln_2, 50 | layers=[module.mlp.c_fc], 51 | inp=input_feat["mlp.c_fc"], 52 | module2inspect=module.mlp, 53 | ) 54 | ) 55 | 56 | # linear 2 57 | layers.append( 58 | dict( 59 | prev_op=module.mlp.act, 60 | layers=[module.mlp.c_proj], 61 | inp=input_feat["mlp.c_proj"], 62 | ) 63 | ) 64 | 65 | return layers 66 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/models/gpt_neox.py: -------------------------------------------------------------------------------- 1 | from .base import BaseAWQForCausalLM 2 | from transformers.models.gpt_neox.modeling_gpt_neox import ( 3 | GPTNeoXLayer, 4 | GPTNeoXForCausalLM, 5 | ) 6 | 7 | 8 | class GPTNeoXAWQForCausalLM(BaseAWQForCausalLM): 9 | layer_type = "GPTNeoXDecoderLayer" 10 | max_seq_len_key = "max_position_embeddings" 11 | 12 | @staticmethod 13 | def get_model_layers(model: GPTNeoXForCausalLM): 14 | return model.gpt_neox.layers 15 | 16 | @staticmethod 17 | def get_act_for_scaling(module: GPTNeoXLayer): 18 | return dict( 19 | is_scalable=True, 20 | scale_name="mlp.act", 21 | scale_layer=module.mlp.act, 22 | scale_shape=module.mlp.dense_h_to_4h.out_features, 23 | ) 24 | 25 | @staticmethod 26 | def move_embed(model: GPTNeoXForCausalLM, device: str): 27 | model.gpt_neox.embed_in = model.gpt_neox.embed_in.to(device) 28 | 29 | @staticmethod 30 | def get_layers_for_scaling(module: GPTNeoXLayer, input_feat, module_kwargs): 31 | layers = [] 32 | 33 | # attention input 34 | layers.append( 35 | dict( 36 | prev_op=module.input_layernorm, 37 | layers=[module.attention.query_key_value], 38 | inp=input_feat["attention.query_key_value"], 39 | ) 40 | ) 41 | 42 | # attention out 43 | # Please refer to https://github.com/mit-han-lab/llm-awq/issues/2#issuecomment-1606297469 44 | """ 45 | layers.append(dict( 46 | prev_op=module.attention.query_key_value, 47 | layers=[module.attention.dense], 48 | inp=input_feat['attention.dense'], 49 | )) 50 | """ 51 | 52 | # linear 1 53 | layers.append( 54 | dict( 55 | prev_op=module.post_attention_layernorm, 56 | layers=[module.mlp.dense_h_to_4h], 57 | inp=input_feat["mlp.dense_h_to_4h"], 58 | ) 59 | ) 60 | 61 | # linear 2 62 | layers.append( 63 | dict( 64 | prev_op=module.mlp.act, 65 | layers=[module.mlp.dense_4h_to_h], 66 | inp=input_feat["mlp.dense_4h_to_h"], 67 | ) 68 | ) 69 | 70 | return layers 71 | 
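[Editor's note] The `get_layers_for_scaling` hooks in the model files above each pair the linears to be quantized (`layers`) with the op that feeds them (`prev_op`) and a captured activation sample (`inp`). The sketch below is a minimal, standalone PyTorch illustration of the identity these groups rely on — it is not AutoAWQ code, and the scale rule shown is a simplified stand-in for the activation-aware search AWQ actually performs: dividing the feeding op's output by a per-channel scale s while multiplying the next layer's input channels by s leaves the computed function unchanged.

import torch

torch.manual_seed(0)
n_tokens, d_in, d_out = 3, 8, 4

x = torch.randn(n_tokens, d_in)          # activations produced by prev_op (the captured `inp`)
w = torch.randn(d_out, d_in)             # weight of a linear listed in `layers`
s = x.abs().mean(dim=0).clamp(min=1e-5)  # illustrative per-input-channel scale (AWQ derives s from activation statistics)

y_ref = x @ w.t()                        # original output
y_folded = (x / s) @ (w * s).t()         # prev_op output folded by 1/s, weight columns folded by s

print("max abs diff:", (y_ref - y_folded).abs().max().item())
assert torch.allclose(y_ref, y_folded, atol=1e-4)

In the actual method, the search keeps the scale whose folded weight `w * s` quantizes with the least output error on the calibration activations, and the `1/s` factor is absorbed into `prev_op` (a norm or preceding linear), so inference cost is unchanged.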
-------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/models/gptj.py: -------------------------------------------------------------------------------- 1 | from .base import BaseAWQForCausalLM 2 | from transformers.models.gptj.modeling_gptj import GPTJForCausalLM, GPTJBlock 3 | 4 | 5 | class GPTJAWQForCausalLM(BaseAWQForCausalLM): 6 | layer_type = "GPTJBlock" 7 | max_seq_len_key = "n_positions" 8 | 9 | @staticmethod 10 | def get_model_layers(model: GPTJForCausalLM): 11 | return model.transformer.h 12 | 13 | @staticmethod 14 | def get_act_for_scaling(module: GPTJBlock): 15 | return dict( 16 | is_scalable=True, 17 | scale_name="mlp.act", 18 | scale_layer=module.mlp.act, 19 | scale_shape=module.mlp.fc_in.out_features, 20 | ) 21 | 22 | @staticmethod 23 | def move_embed(model: GPTJForCausalLM, device: str): 24 | model.transformer.wte = model.transformer.wte.to(device) 25 | 26 | @staticmethod 27 | def get_layers_for_scaling(module: GPTJBlock, input_feat, module_kwargs): 28 | layers = [] 29 | 30 | # attention input + linear 1 31 | layers.append( 32 | dict( 33 | prev_op=module.ln_1, 34 | layers=[ 35 | module.attn.q_proj, 36 | module.attn.k_proj, 37 | module.attn.v_proj, 38 | module.mlp.fc_in, 39 | ], 40 | inp=input_feat["attn.q_proj"], 41 | module2inspect=module, 42 | kwargs=module_kwargs, 43 | ) 44 | ) 45 | 46 | # attention out 47 | layers.append( 48 | dict( 49 | prev_op=module.attn.v_proj, 50 | layers=[module.attn.out_proj], 51 | inp=input_feat["attn.out_proj"], 52 | ) 53 | ) 54 | 55 | # linear 2 56 | layers.append( 57 | dict( 58 | prev_op=module.mlp.act, 59 | layers=[module.mlp.fc_out], 60 | inp=input_feat["mlp.fc_out"], 61 | ) 62 | ) 63 | 64 | return layers 65 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/models/mpt.py: -------------------------------------------------------------------------------- 1 | from .base import BaseAWQForCausalLM 2 | from transformers.models.mpt.modeling_mpt import MptBlock as OldMptBlock, MptForCausalLM 3 | 4 | 5 | class MptAWQForCausalLM(BaseAWQForCausalLM): 6 | layer_type = "MPTBlock" 7 | max_seq_len_key = "max_seq_len" 8 | 9 | @staticmethod 10 | def fuse_layers(model: MptForCausalLM): 11 | fuser = MptFuser(model) 12 | fuser.fuse_transformer() 13 | 14 | @staticmethod 15 | def get_model_layers(model: MptForCausalLM): 16 | return model.transformer.blocks 17 | 18 | @staticmethod 19 | def get_act_for_scaling(module: OldMptBlock): 20 | return dict( 21 | is_scalable=True, 22 | scale_name="ffn.act", 23 | scale_layer=module.ffn.act, 24 | scale_shape=module.ffn.up_proj.out_features, 25 | ) 26 | 27 | @staticmethod 28 | def move_embed(model: MptForCausalLM, device: str): 29 | model.transformer.wte = model.transformer.wte.to(device) 30 | model.transformer.emb_drop = model.transformer.emb_drop.to(device) 31 | 32 | @staticmethod 33 | def get_layers_for_scaling(module: OldMptBlock, input_feat, module_kwargs): 34 | layers = [] 35 | 36 | if module_kwargs.get("output_attentions") is not None: 37 | module_kwargs.pop("output_attentions") 38 | 39 | # attention input 40 | layers.append( 41 | dict( 42 | prev_op=module.norm_1, 43 | layers=[module.attn.Wqkv], 44 | inp=input_feat["attn.Wqkv"], 45 | module2inspect=module.attn, 46 | kwargs=module_kwargs, 47 | ) 48 | ) 49 | 50 | # attention output 51 | layers.append( 52 | dict( 53 | prev_op=module.attn.Wqkv, 54 | layers=[module.attn.out_proj], 55 | inp=input_feat["attn.out_proj"], 
56 | ) 57 | ) 58 | 59 | # linear 1 60 | layers.append( 61 | dict( 62 | prev_op=module.norm_2, 63 | layers=[module.ffn.up_proj], 64 | inp=input_feat["ffn.up_proj"], 65 | module2inspect=module.ffn, 66 | ) 67 | ) 68 | 69 | # linear 2 70 | layers.append( 71 | dict( 72 | prev_op=module.ffn.act, 73 | layers=[module.ffn.down_proj], 74 | inp=input_feat["ffn.down_proj"], 75 | ) 76 | ) 77 | 78 | return layers 79 | 80 | 81 | from typing import List, Tuple 82 | from awq.utils.utils import set_module_name 83 | from awq.modules.fused.block import MPTBlock 84 | from awq.modules.fused.model import MPTModel 85 | 86 | 87 | class MptFuser: 88 | def __init__(self, model: MptForCausalLM): 89 | self.model = model 90 | 91 | self.mpt_blocks: List[Tuple[str, OldMptBlock]] = [ 92 | (name, module) 93 | for name, module in self.model.named_modules() 94 | if "mptblock" in module.__class__.__name__.lower() 95 | ] 96 | 97 | def fuse_transformer(self): 98 | blocks = [] 99 | 100 | module: OldMptBlock 101 | for module in self.model.transformer.blocks: 102 | blocks.append( 103 | MPTBlock( 104 | self.model.config.d_model, 105 | self.model.config.n_heads, 106 | module.attn.Wqkv, 107 | module.attn.out_proj, 108 | module.ffn, 109 | module.norm_1, 110 | module.norm_2, 111 | next(iter(module.state_dict().values())).device, 112 | self.model.config.max_seq_len, 113 | ) 114 | ) 115 | 116 | self.model.transformer = MPTModel( 117 | self.model.config.vocab_size, 118 | blocks, 119 | self.model.transformer.wte, 120 | self.model.transformer.norm_f, 121 | ) 122 | 123 | setattr(self.model.transformer, "blocks", self.model.transformer.blocks) 124 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/models/opt.py: -------------------------------------------------------------------------------- 1 | from .base import BaseAWQForCausalLM 2 | from transformers.models.opt.modeling_opt import OPTForCausalLM, OPTDecoderLayer 3 | 4 | 5 | class OptAWQForCausalLM(BaseAWQForCausalLM): 6 | layer_type = "OPTDecoderLayer" 7 | max_seq_len_key = "max_position_embeddings" 8 | 9 | @staticmethod 10 | def get_model_layers(model: OPTForCausalLM): 11 | return model.model.decoder.layers 12 | 13 | @staticmethod 14 | def get_act_for_scaling(module: OPTDecoderLayer): 15 | return dict(is_scalable=False) 16 | 17 | @staticmethod 18 | def move_embed(model: OPTForCausalLM, device: str): 19 | model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(device) 20 | model.model.decoder.embed_positions = model.model.decoder.embed_positions.to( 21 | device 22 | ) 23 | 24 | @staticmethod 25 | def get_layers_for_scaling(module: OPTDecoderLayer, input_feat, module_kwargs): 26 | layers = [] 27 | 28 | # attention input 29 | layers.append( 30 | dict( 31 | prev_op=module.self_attn_layer_norm, 32 | layers=[ 33 | module.self_attn.q_proj, 34 | module.self_attn.k_proj, 35 | module.self_attn.v_proj, 36 | ], 37 | inp=input_feat["self_attn.q_proj"], 38 | module2inspect=module.self_attn, 39 | kwargs=module_kwargs, 40 | ) 41 | ) 42 | 43 | # attention out 44 | layers.append( 45 | dict( 46 | prev_op=module.self_attn.v_proj, 47 | layers=[module.self_attn.out_proj], 48 | inp=input_feat["self_attn.out_proj"], 49 | ) 50 | ) 51 | 52 | # linear 1 53 | layers.append( 54 | dict( 55 | prev_op=module.final_layer_norm, 56 | layers=[module.fc1], 57 | inp=input_feat["fc1"], 58 | ) 59 | ) 60 | 61 | # linear 2 62 | layers.append( 63 | dict( 64 | prev_op=module.fc1, 65 | layers=[module.fc2], 66 | inp=input_feat["fc2"], 67 
| ) 68 | ) 69 | 70 | return layers 71 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/models/qwen.py: -------------------------------------------------------------------------------- 1 | from .base import BaseAWQForCausalLM 2 | 3 | 4 | class QwenAWQForCausalLM(BaseAWQForCausalLM): 5 | layer_type = "QWenBlock" 6 | max_seq_len_key = "seq_length" 7 | 8 | @staticmethod 9 | def get_model_layers(model): 10 | return model.transformer.h 11 | 12 | @staticmethod 13 | def get_act_for_scaling(module): 14 | return dict(is_scalable=False) 15 | 16 | @staticmethod 17 | def move_embed(model, device: str): 18 | model.transformer.wte = model.transformer.wte.to(device) 19 | model.transformer.rotary_emb = model.transformer.rotary_emb.to(device) 20 | 21 | @staticmethod 22 | def get_layers_for_scaling(module, input_feat, module_kwargs): 23 | layers = [] 24 | 25 | # attention 26 | layers.append( 27 | dict( 28 | prev_op=module.ln_1, 29 | layers=[module.attn.c_attn], 30 | inp=input_feat["attn.c_attn"], 31 | module2inspect=module.attn, 32 | kwargs=module_kwargs, 33 | ) 34 | ) 35 | 36 | # mlp 37 | layers.append( 38 | dict( 39 | prev_op=module.ln_2, 40 | layers=[module.mlp.w2, module.mlp.w1], 41 | inp=input_feat["mlp.w2"], 42 | module2inspect=module.mlp, 43 | ) 44 | ) 45 | 46 | # linear 2 47 | layers.append( 48 | dict( 49 | prev_op=module.mlp.w1, 50 | layers=[module.mlp.c_proj], 51 | inp=input_feat["mlp.c_proj"], 52 | ) 53 | ) 54 | 55 | return layers 56 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/compression/quantization/AutoAWQ/awq/modules/__init__.py -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/modules/act.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | class ScaledActivation(nn.Module): 5 | def __init__(self, module, scales): 6 | super().__init__() 7 | self.act = module 8 | self.scales = nn.Parameter(scales.data) 9 | 10 | def forward(self, x): 11 | return self.act(x) / self.scales.view(1, 1, -1).to(x.device) 12 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/modules/fused/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/compression/quantization/AutoAWQ/awq/modules/fused/__init__.py -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/modules/fused/cache.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class WindowedCache: 5 | def __init__(self, cache_v_shape, cache_k_shape, max_seq_len, device): 6 | """ 7 | The window size is the same as the max_seq_len. The window will 8 | automatically roll once max_seq_len is exceeded. 
9 | """ 10 | # [batch_size, n_kv_heads, max_seq_len, head_dim] 11 | self.v = torch.zeros(cache_v_shape).to(device).half() 12 | # [batch_size, n_kv_heads, head_dim // pack_factor, max_seq_len, pack_factor] 13 | self.k = torch.zeros(cache_k_shape).to(device).half() 14 | self.max_seq_len = max_seq_len 15 | 16 | def get_kv(self, batch_size, start_pos, seqlen, head_dim): 17 | """ 18 | Gets the key-value store in correct shapes. 19 | """ 20 | xv = ( 21 | self.v[:batch_size, :, : start_pos + seqlen, :].transpose(1, 2).contiguous() 22 | ) 23 | xk = ( 24 | self.k[:batch_size, :, :, : start_pos + seqlen, :] 25 | .transpose(2, 3) 26 | .contiguous() 27 | ) 28 | xk = xk.reshape(xk.shape[:-2] + (head_dim,)).transpose(1, 2).contiguous() 29 | 30 | return xv, xk 31 | 32 | def update_kv(self, values_store, keys_store, batch_size, start_pos, seqlen): 33 | """ 34 | Updates the values in the key-value store. 35 | """ 36 | self.v[:batch_size, :, start_pos : start_pos + seqlen, :] = values_store 37 | self.k[:batch_size, :, :, start_pos : start_pos + seqlen, :] = keys_store 38 | 39 | def roll_kv_n_steps(self, start_pos, n=100): 40 | """ 41 | Roll cache n to the left. 42 | """ 43 | n = min(n, self.max_seq_len) 44 | # Roll cache to the left 45 | self.v = torch.roll(self.v, shifts=-n, dims=2) 46 | self.k = torch.roll(self.k, shifts=-n, dims=3) 47 | 48 | # Zero out the new part 49 | self.v[:, :, -n:, :] = 0 50 | self.k[:, :, :, -n:, :] = 0 51 | 52 | return start_pos - n 53 | 54 | def to(self, device): 55 | self.k = self.k.to(device) 56 | self.v = self.v.to(device) 57 | 58 | def increase_batch_size(self, to_bsz): 59 | """Dynamically allocate new kv when batch size changes.""" 60 | self.v = torch.zeros( 61 | to_bsz, *self.v.shape[1:], dtype=self.v.dtype, device=self.v.device 62 | ) 63 | self.k = torch.zeros( 64 | to_bsz, *self.k.shape[1:], dtype=self.k.dtype, device=self.k.device 65 | ) 66 | 67 | def decrease_batch_size(self, to_bsz): 68 | """Dynamically remove part of cache if batch size changes.""" 69 | self.v = self.v[:to_bsz, :, :, :] 70 | self.k = self.k[:to_bsz, :, :, :, :] 71 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/modules/fused/mlp.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | from awq.modules.linear.gemm import WQLinear_GEMM 4 | from awq.modules.linear.gemv import WQLinear_GEMV 5 | 6 | try: 7 | import awq_ext # with CUDA kernels 8 | 9 | AWQ_INSTALLED = True 10 | except: 11 | AWQ_INSTALLED = False 12 | 13 | 14 | class QuantFusedMLP(nn.Module): 15 | def __init__( 16 | self, 17 | gate_proj, 18 | down_proj, 19 | up_proj, 20 | activation=F.silu, 21 | ): 22 | super().__init__() 23 | 24 | self.register_buffer("gate_proj_qweight", gate_proj.qweight) 25 | self.register_buffer("gate_proj_scales", gate_proj.scales) 26 | self.register_buffer("gate_proj_qzeros", gate_proj.qzeros) 27 | self.register_buffer("up_proj_qweight", up_proj.qweight) 28 | self.register_buffer("up_proj_scales", up_proj.scales) 29 | self.register_buffer("up_proj_qzeros", up_proj.qzeros) 30 | 31 | self.in_features = gate_proj.in_features 32 | self.intermediate_size = gate_proj.out_features 33 | self.out_features = down_proj.out_features 34 | self.w_bit = gate_proj.w_bit 35 | self.down_proj = down_proj 36 | 37 | if isinstance(down_proj, WQLinear_GEMV): 38 | self.linear = awq_ext.gemv_forward_cuda 39 | self.group_size = down_proj.group_size 40 | else: 41 | 
self.linear = awq_ext.gemm_forward_cuda 42 | self.group_size = 8 43 | 44 | self.activation = activation 45 | 46 | def forward(self, x, routing_weights=None): 47 | out_shape = x.shape[:-1] + (self.intermediate_size,) 48 | x = x.reshape(-1, x.shape[-1]) 49 | gate_output = self.linear( 50 | x, 51 | self.gate_proj_qweight, 52 | self.gate_proj_scales, 53 | self.gate_proj_qzeros, 54 | self.group_size, 55 | ) 56 | up_output = self.linear( 57 | x, 58 | self.up_proj_qweight, 59 | self.up_proj_scales, 60 | self.up_proj_qzeros, 61 | self.group_size, 62 | ) 63 | x = self.activation(gate_output) * up_output 64 | x = x.reshape(out_shape) 65 | x = self.down_proj(x) 66 | 67 | if routing_weights is not None: 68 | x = routing_weights * x 69 | 70 | return x 71 | 72 | 73 | class QuantLlamaMLP(QuantFusedMLP): 74 | r""" 75 | QuantLlamaMLP class kept for backward compatibilty, in the future, users 76 | should always use `QuantFusedMLP` class instead. 77 | """ 78 | 79 | def __init__(self, gate_proj, down_proj, up_proj): 80 | super().__init__(gate_proj, down_proj, up_proj) 81 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/modules/fused/norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | try: 5 | import awq_ext # with CUDA kernels 6 | 7 | AWQ_INSTALLED = True 8 | except: 9 | AWQ_INSTALLED = False 10 | 11 | 12 | class FasterTransformerRMSNorm(nn.Module): 13 | def __init__(self, weight, eps=1e-6): 14 | super().__init__() 15 | self.weight = weight 16 | self.variance_epsilon = eps 17 | 18 | def forward(self, x): 19 | assert AWQ_INSTALLED, ( 20 | "AWQ kernels could not be loaded. " 21 | "Please install them from https://github.com/casper-hansen/AutoAWQ_kernels" 22 | ) 23 | 24 | output = torch.empty_like(x) 25 | awq_ext.layernorm_forward_cuda(x, self.weight, output, self.variance_epsilon) 26 | 27 | return output 28 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/modules/linear/__init__.py: -------------------------------------------------------------------------------- 1 | from .exllama import WQLinear_Exllama, exllama_post_init 2 | from .exllamav2 import WQLinear_ExllamaV2, exllamav2_post_init 3 | from .gemm import WQLinear_GEMM 4 | from .gemv import WQLinear_GEMV 5 | from .marlin import WQLinear_Marlin, marlin_post_init 6 | from .gemv_fast import WQLinear_GEMVFast 7 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/quantize/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/compression/quantization/AutoAWQ/awq/quantize/__init__.py -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/compression/quantization/AutoAWQ/awq/utils/__init__.py -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/utils/calib_data.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import logging 3 | from typing import List, Union 4 | from datasets import load_dataset 5 | 6 | 7 | def get_calib_dataset( 8 | data: Union[str, List[str], List[List[int]]] = "pileval", 9 | tokenizer=None, 10 | n_samples=512, 11 | block_size=512, 12 | split="compression", 13 | text_column="text", 14 | ): 15 | if isinstance(data, str): 16 | if data == "pileval": 17 | dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation") 18 | else: 19 | dataset = load_dataset(data, split=split) 20 | 21 | dataset = dataset.shuffle(seed=42) 22 | 23 | elif isinstance(data, list): 24 | if isinstance(data[0], str): 25 | dataset = [{text_column: text} for text in data] 26 | elif isinstance(data[0][0], int): 27 | dataset = data 28 | else: 29 | raise NotImplementedError( 30 | "Either pass a string to a huggingface dataset or a list" 31 | "that is preprocessed with one sample of text per element" 32 | " or a list of list of int for tokenized words." 33 | ) 34 | else: 35 | raise NotImplementedError( 36 | "Either pass a string to a huggingface dataset or a list" 37 | "that is preprocessed with one sample of text per element" 38 | " or a list of list of int for tokenized words." 39 | ) 40 | 41 | samples = [] 42 | n_run = 0 43 | for data in dataset: 44 | if isinstance(data, list): 45 | line_encoded = data 46 | else: 47 | line = data[text_column] 48 | line = line.strip() 49 | line_encoded = tokenizer.encode(line) 50 | if len(line_encoded) > 512: 51 | continue 52 | sample = torch.tensor([line_encoded]) 53 | if sample.numel() == 0: 54 | continue 55 | samples.append(sample) 56 | n_run += 1 57 | if n_run == n_samples: 58 | break 59 | # now concatenate all samples and split according to block size 60 | cat_samples = torch.cat(samples, dim=1) 61 | n_split = cat_samples.shape[1] // block_size 62 | logging.debug(f" * Split into {n_split} blocks") 63 | return [ 64 | cat_samples[:, i * block_size : (i + 1) * block_size] for i in range(n_split) 65 | ] 66 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/utils/module.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | def get_named_linears(module): 5 | return {name: m for name, m in module.named_modules() if isinstance(m, nn.Linear)} 6 | 7 | 8 | def get_op_by_name(module, op_name): 9 | # get the op by its name relative to the module 10 | for name, m in module.named_modules(): 11 | if name == op_name: 12 | return m 13 | raise ValueError(f"Cannot find op {op_name} in module {module}") 14 | 15 | 16 | def set_op_by_name(layer, name, new_module): 17 | levels = name.split(".") 18 | if len(levels) > 1: 19 | mod_ = layer 20 | for l_idx in range(len(levels) - 1): 21 | if levels[l_idx].isdigit(): 22 | mod_ = mod_[int(levels[l_idx])] 23 | else: 24 | mod_ = getattr(mod_, levels[l_idx]) 25 | setattr(mod_, levels[-1], new_module) 26 | else: 27 | setattr(layer, name, new_module) 28 | 29 | 30 | def get_op_name(module, op): 31 | # get the name of the op relative to the module 32 | for name, m in module.named_modules(): 33 | if m is op: 34 | return name 35 | raise ValueError(f"Cannot find op {op} in module {module}") 36 | 37 | 38 | def append_str_prefix(x, prefix): 39 | if isinstance(x, str): 40 | return prefix + x 41 | elif isinstance(x, tuple): 42 | return tuple([append_str_prefix(y, prefix) for y in x]) 43 | elif isinstance(x, 
list): 44 | return [append_str_prefix(y, prefix) for y in x] 45 | else: 46 | return x 47 | 48 | 49 | def exclude_layers_to_not_quantize(linear_layers, modules_to_not_convert): 50 | if modules_to_not_convert is None: 51 | return linear_layers 52 | 53 | filtered_layers = {} 54 | for name, linear_layer in linear_layers.items(): 55 | if not any(key in name for key in modules_to_not_convert): 56 | filtered_layers[name] = linear_layer 57 | elif "gate_proj" in name: # 🔍 add gate_proj to filtered_layers. 58 | filtered_layers[name] = linear_layer 59 | return filtered_layers 60 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/utils/packing_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | AWQ_ORDER = [0, 2, 4, 6, 1, 3, 5, 7] 5 | AWQ_REVERSE_ORDER = [0, 4, 1, 5, 2, 6, 3, 7] 6 | 7 | 8 | def unpack_awq(qweight: torch.Tensor, qzeros: torch.Tensor, bits: int): 9 | shifts = torch.arange(0, 32, bits, device=qzeros.device) 10 | 11 | # unpacking columnwise 12 | iweights = torch.bitwise_right_shift(qweight[:, :, None], shifts[None, None, :]).to( 13 | torch.int8 # smallest dtype available 14 | ) 15 | iweights = iweights.view(iweights.shape[0], -1) 16 | 17 | # unpacking columnwise 18 | izeros = torch.bitwise_right_shift(qzeros[:, :, None], shifts[None, None, :]).to( 19 | torch.int8 # smallest dtype available 20 | ) 21 | izeros = izeros.view(izeros.shape[0], -1) 22 | 23 | return iweights, izeros 24 | 25 | 26 | def reverse_awq_order(iweights: torch.Tensor, izeros: torch.Tensor, bits: int): 27 | reverse_order_tensor = torch.arange( 28 | izeros.shape[-1], 29 | dtype=torch.int32, 30 | device=izeros.device, 31 | ) 32 | reverse_order_tensor = reverse_order_tensor.view(-1, 32 // bits) 33 | reverse_order_tensor = reverse_order_tensor[:, AWQ_REVERSE_ORDER] 34 | reverse_order_tensor = reverse_order_tensor.view(-1) 35 | 36 | izeros = izeros[:, reverse_order_tensor] 37 | iweights = iweights[:, reverse_order_tensor] 38 | 39 | return iweights, izeros 40 | 41 | 42 | def pack_exllama(iweights: torch.Tensor, izeros: torch.Tensor, bits: int): 43 | shifts = torch.arange(0, 32, bits, device=iweights.device) 44 | 45 | # packing rowwise 46 | iweights = iweights.view(iweights.shape[0] // (32 // bits), 32 // bits, -1) 47 | qweight = ( 48 | torch.bitwise_left_shift(iweights, shifts[None, :, None]) 49 | .sum(dim=1) 50 | .to(torch.int32) 51 | ) 52 | 53 | # packing columnwise 54 | izeros = izeros.view(-1, izeros.shape[1] // (32 // bits), 32 // bits) 55 | qzeros = ( 56 | torch.bitwise_left_shift(izeros, shifts[None, None, :]) 57 | .sum(dim=-1) 58 | .to(torch.int32) 59 | ) 60 | 61 | return qweight, qzeros 62 | 63 | 64 | def unpack_reorder_pack(qweight, qzeros, bits): 65 | # Unpack the qweight and qzeros tensors 66 | iweight, izeros = unpack_awq(qweight, qzeros, bits) 67 | # Reverse the order of the iweight and izeros tensors 68 | iweight, izeros = reverse_awq_order(iweight, izeros, bits) 69 | 70 | # overflow checks 71 | iweight = torch.bitwise_and(iweight, (2**bits) - 1) 72 | izeros = torch.bitwise_and(izeros, (2**bits) - 1) 73 | 74 | # Subtract 1 from the izeros tensor (exllama adds 1 during inference) 75 | # We can remove it if we remove the +1 in the exllama code 76 | izeros = izeros - 1 77 | # Pack the qweight and qzeros tensors 78 | qweight, qzeros = pack_exllama(iweight, izeros, bits) 79 | 80 | return qweight, qzeros 81 | 82 | 83 | def dequantize_gemm(qweight, qzeros, scales, bits, 
group_size): 84 | # Unpack the qweight and qzeros tensors 85 | iweight, izeros = unpack_awq(qweight, qzeros, bits) 86 | # Reverse the order of the iweight and izeros tensors 87 | iweight, izeros = reverse_awq_order(iweight, izeros, bits) 88 | 89 | # overflow checks 90 | iweight = torch.bitwise_and(iweight, (2**bits) - 1) 91 | izeros = torch.bitwise_and(izeros, (2**bits) - 1) 92 | 93 | # fp16 weights 94 | scales = scales.repeat_interleave(group_size, dim=0) 95 | izeros = izeros.repeat_interleave(group_size, dim=0) 96 | iweight = (iweight - izeros) * scales 97 | 98 | return iweight 99 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/utils/parallel.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import gc 4 | import logging 5 | 6 | 7 | def auto_parallel(args): 8 | model_size = args.model_path.split("-")[-1] 9 | if model_size.endswith("m"): 10 | model_gb = 1 11 | else: 12 | model_gb = float(model_size[:-1]) 13 | if model_gb < 20: 14 | n_gpu = 1 15 | elif model_gb < 50: 16 | n_gpu = 4 17 | else: 18 | n_gpu = 8 19 | args.parallel = n_gpu > 1 20 | cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None) 21 | if isinstance(cuda_visible_devices, str): 22 | cuda_visible_devices = cuda_visible_devices.split(",") 23 | else: 24 | cuda_visible_devices = list(range(8)) 25 | os.environ["CUDA_VISIBLE_DEVICES"] = ",".join( 26 | [str(dev) for dev in cuda_visible_devices[:n_gpu]] 27 | ) 28 | logging.debug("CUDA_VISIBLE_DEVICES: ", os.environ["CUDA_VISIBLE_DEVICES"]) 29 | return cuda_visible_devices 30 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/utils/utils.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import torch 3 | import accelerate 4 | 5 | 6 | def get_module_by_name_suffix(model, module_name: str): 7 | for name, module in model.named_modules(): 8 | if name.endswith(module_name): 9 | return module 10 | 11 | 12 | def simple_dispatch_model(model, device_map): 13 | from accelerate.hooks import add_hook_to_module, AlignDevicesHook 14 | 15 | if "" in device_map: 16 | d = device_map[""] 17 | model = model.to(torch.device(d)) 18 | model.hf_device_map = device_map 19 | return model 20 | 21 | tied_params = accelerate.utils.modeling.find_tied_parameters(model) 22 | if set(device_map.values()) == {"cpu"} or set(device_map.values()) == { 23 | "cpu", 24 | "disk", 25 | }: 26 | main_device = "cpu" 27 | else: 28 | main_device = [d for d in device_map.values() if d not in ["cpu", "disk"]][0] 29 | 30 | cpu_offload_group = [(n, d) for n, d in device_map.items() if d == "cpu"] 31 | prev_hook = None 32 | for idx, (n, d) in enumerate(cpu_offload_group): 33 | m = get_module_by_name_suffix(model, n) 34 | _, prev_hook = accelerate.cpu_offload_with_hook( 35 | m, execution_device=main_device, prev_module_hook=prev_hook 36 | ) 37 | # set first cpu offload module's prev_module_hook to the last cpu offload module's hook 38 | if len(cpu_offload_group) > 1: 39 | get_module_by_name_suffix( 40 | model, cpu_offload_group[0][0] 41 | )._hf_hook.prev_module_hook = prev_hook 42 | 43 | for n, d in device_map.items(): 44 | m = get_module_by_name_suffix(model, n) 45 | if d != "cpu": 46 | d = torch.device(d) 47 | hook = AlignDevicesHook(d, io_same_device=True, place_submodules=True) 48 | add_hook_to_module(m, hook) 49 | 
accelerate.utils.modeling.retie_parameters(model, tied_params) 50 | model.hf_device_map = device_map 51 | 52 | return model 53 | 54 | 55 | def set_module_name(model, name, value): 56 | if "." in name: 57 | parent_name = name.rsplit(".", 1)[0] 58 | child_name = name[len(parent_name) + 1 :] 59 | parent = model.get_submodule(parent_name) 60 | else: 61 | parent_name = "" 62 | parent = model 63 | child_name = name 64 | 65 | setattr(parent, child_name, value) 66 | 67 | 68 | def clear_memory(weight=None): 69 | if weight is not None: 70 | del weight 71 | gc.collect() 72 | torch.cuda.empty_cache() 73 | 74 | 75 | def compute_memory_used_pct(device): 76 | memory_used = torch.cuda.max_memory_allocated(device) / (1024**3) 77 | memory_pct = ( 78 | memory_used 79 | / (torch.cuda.get_device_properties(device).total_memory / (1024**3)) 80 | * 100 81 | ) 82 | return memory_pct 83 | 84 | 85 | def get_best_device(): 86 | if torch.backends.mps.is_available(): 87 | return "mps" 88 | elif torch.cuda.is_available(): 89 | return "cuda:0" 90 | else: 91 | return "cpu" 92 | 93 | 94 | def get_lowest_memory_device_index(): 95 | device = None 96 | curr_device_memory_pct = 0 97 | for device_index in range(torch.cuda.device_count()): 98 | device_memory_pct = compute_memory_used_pct(device_index) 99 | if device is None or device_memory_pct < curr_device_memory_pct: 100 | device = device_index 101 | curr_device_memory_pct = device_memory_pct 102 | 103 | return device 104 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/quantize.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | import os 4 | 5 | from transformers import AutoTokenizer 6 | from llmtuner.compression.quantization.AutoAWQ.awq import AutoAWQForCausalLM 7 | 8 | 9 | model_path = sys.argv[1] 10 | quant_path = sys.argv[2] 11 | bits = sys.argv[3] 12 | q_group_size = 128  # group size for AWQ quantization (128 is the usual AutoAWQ default; adjust as needed) 13 | quant_config = { 14 | "zero_point": True, 15 | "q_group_size": q_group_size, 16 | "w_bit": int(bits), 17 | "version": "GEMM", 18 | } 19 | 20 | print(f"quant_config: {quant_config}") 21 | # Load model 22 | model = AutoAWQForCausalLM.from_pretrained( 23 | model_path, **{"low_cpu_mem_usage": True, "use_cache": False} 24 | ) 25 | 26 | try: 27 | tokenizer = AutoTokenizer.from_pretrained( 28 | model_path, 29 | use_fast=False, 30 | trust_remote_code=True, 31 | ) 32 | except Exception:  # fall back to the fast tokenizer if the slow one cannot be loaded 33 | tokenizer = AutoTokenizer.from_pretrained( 34 | model_path, 35 | use_fast=True, 36 | trust_remote_code=True, 37 | ) 38 | 39 | # Quantize 40 | model.quantize(tokenizer, quant_config=quant_config) 41 | 42 | # Save quantized model 43 | model.save_quantized(quant_path) 44 | tokenizer.save_pretrained(quant_path) 45 | f = open(os.path.join(quant_path, "quantize_config.json"), 'w') 46 | config_to_save = json.dumps(quant_config, indent=2, sort_keys=True) 47 | f.write(config_to_save) 48 | f.close() 49 | print(f'Model is quantized and saved at "{quant_path}"') -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/compression/quantization/AutoGPTQ/__init__.py -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/__init__.py:
-------------------------------------------------------------------------------- 1 | from .modeling import AutoGPTQForCausalLM, BaseQuantizeConfig 2 | from .utils.exllama_utils import exllama_set_max_input_length 3 | from .utils.peft_utils import get_gptq_peft_model 4 | 5 | __version__ = "0.7.1" 6 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/eval_tasks/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_modeling_task import LanguageModelingTask 2 | from .sequence_classification_task import SequenceClassificationTask, get_predictions 3 | from .text_summarization_task import TextSummarizationTask 4 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/eval_tasks/_base.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from typing import Any, Dict, List, Optional, Union 3 | 4 | import torch 5 | from transformers import PreTrainedModel, PreTrainedTokenizer 6 | 7 | from ..modeling import BaseGPTQForCausalLM 8 | from ..utils.data_utils import get_dataloader 9 | 10 | 11 | class BaseTask: 12 | def __init__( 13 | self, 14 | model: Union[BaseGPTQForCausalLM, PreTrainedModel], 15 | tokenizer: PreTrainedTokenizer, 16 | data_name_or_path: str, 17 | prompt_col_name: str, 18 | label_col_name: str, 19 | device: Optional[str] = None, 20 | **kwargs, 21 | ): 22 | self.model = model 23 | self.tokenizer = tokenizer 24 | if self.tokenizer.pad_token_id is None: 25 | self.tokenizer.pad_token = self.tokenizer.eos_token 26 | self.tokenizer.pad_token_id = self.tokenizer.eos_token_id 27 | self.model.config.pad_token_id = self.tokenizer.eos_token_id 28 | self.dl = get_dataloader( 29 | data_name_or_path, 30 | prompt_col_name=prompt_col_name, 31 | label_col_name=label_col_name, 32 | tokenizer=tokenizer, 33 | **kwargs, 34 | ) 35 | 36 | self.device = device 37 | if not self.device: 38 | self.device = self.model.device 39 | if isinstance(self.device, str): 40 | self.device = torch.device(self.device) 41 | 42 | @abstractmethod 43 | def _predict(self, batch_data: Dict[str, Any], **kwargs) -> List[Any]: 44 | pass 45 | 46 | @abstractmethod 47 | def _parse_labels(self, label_ids: torch.LongTensor) -> List[Any]: 48 | pass 49 | 50 | @abstractmethod 51 | def _metric(self, pred: List[Any], label: List[Any]) -> Dict[str, float]: 52 | pass 53 | 54 | def run(self, **predict_kwargs) -> Dict[str, float]: 55 | with torch.inference_mode(), torch.amp.autocast(device_type=self.device.type): 56 | predictions = [] 57 | labels = [] 58 | for batch_data in self.dl: 59 | for k, v in batch_data.items(): 60 | if isinstance(v, torch.Tensor): 61 | batch_data[k] = v.to(self.device) 62 | labels += self._parse_labels(batch_data["labels"]) 63 | predictions += self._predict(batch_data, **predict_kwargs) 64 | 65 | return self._metric(predictions, labels) 66 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/eval_tasks/_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/eval_tasks/_utils/__init__.py 
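BaseTask above encapsulates the whole evaluation loop: the constructor builds a dataloader via get_dataloader, and run() moves each batch to the target device, collects the outputs of _parse_labels and _predict, and hands them to _metric. A minimal usage sketch for the LanguageModelingTask re-exported by eval_tasks/__init__.py might look like the following; the model directory, dataset path, and column names are placeholders, the imports use the upstream auto_gptq package name (inside this repository the vendored llmtuner.compression.quantization.AutoGPTQ.auto_gptq path applies), and additional keyword arguments may be needed depending on what utils.data_utils.get_dataloader accepts.

# Illustrative only: driving the eval-task API with placeholder paths/columns.
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
from auto_gptq.eval_tasks import LanguageModelingTask

model_dir = "path/to/quantized-model"               # placeholder
tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True)
model = AutoGPTQForCausalLM.from_quantized(model_dir, device="cuda:0")

task = LanguageModelingTask(
    model=model,
    tokenizer=tokenizer,
    data_name_or_path="path/to/eval_data.json",     # placeholder dataset
    prompt_col_name="prompt",                       # placeholder column names
    label_col_name="output",
    device="cuda:0",
)
print(task.run())  # LanguageModelingTask._metric reports {"ppl": ...}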
-------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/eval_tasks/_utils/classification_utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from typing import List, Sequence 3 | 4 | import numpy as np 5 | 6 | 7 | def levenshtein_distance(seq1: Sequence, seq2: Sequence): 8 | if seq1 == seq2: 9 | return 0 10 | num_rows = len(seq1) + 1 11 | num_cols = len(seq2) + 1 12 | dp_matrix = np.empty((num_rows, num_cols)) 13 | dp_matrix[0, :] = range(num_cols) 14 | dp_matrix[:, 0] = range(num_rows) 15 | 16 | for i in range(1, num_rows): 17 | for j in range(1, num_cols): 18 | if seq1[i - 1] == seq2[j - 1]: 19 | dp_matrix[i, j] = dp_matrix[i - 1, j - 1] 20 | else: 21 | dp_matrix[i, j] = ( 22 | min( 23 | dp_matrix[i - 1, j - 1], 24 | dp_matrix[i - 1, j], 25 | dp_matrix[i, j - 1], 26 | ) 27 | + 1 28 | ) 29 | 30 | return dp_matrix[num_rows - 1, num_cols - 1] 31 | 32 | 33 | def get_closest_label(pred: Sequence, classes: List[Sequence]) -> int: 34 | min_id = sys.maxsize 35 | min_edit_distance = sys.maxsize 36 | for i, class_label in enumerate(classes): 37 | edit_distance = levenshtein_distance(pred, class_label) 38 | if edit_distance < min_edit_distance: 39 | min_id = i 40 | min_edit_distance = edit_distance 41 | return min_id 42 | 43 | 44 | __all__ = ["levenshtein_distance", "get_closest_label"] 45 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/eval_tasks/_utils/generation_utils.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Union 2 | 3 | from torch import LongTensor 4 | from transformers import PreTrainedTokenizer 5 | 6 | 7 | def postprocess_generation_ids( 8 | input_ids: LongTensor, 9 | output_ids: LongTensor, 10 | num_return_sequences: int, 11 | tokenizer: Optional[PreTrainedTokenizer] = None, 12 | pad_token_ids: Optional[int] = None, 13 | ) -> List[List[Union[str, List[int]]]]: 14 | outputs = [] 15 | for idx, start in enumerate(range(0, len(output_ids), num_return_sequences)): 16 | sub_output_ids = output_ids[start : start + num_return_sequences] 17 | sub_generated_ids = sub_output_ids[..., input_ids[idx].size(0) :] 18 | if tokenizer: 19 | decoded_bach = ( 20 | generated_text 21 | for generated_text in tokenizer.batch_decode(sub_generated_ids, clean_up_tokenization_spaces=True) 22 | ) 23 | decoded_bach = list(decoded_bach) 24 | outputs.append(decoded_bach) 25 | else: 26 | sub_generated_ids = sub_output_ids.cpu().numpy().tolist() 27 | for i, one_sub_generated_ids in enumerate(sub_generated_ids): 28 | if pad_token_ids is not None and pad_token_ids in one_sub_generated_ids: 29 | one_sub_generated_ids = one_sub_generated_ids[: one_sub_generated_ids.index(pad_token_ids)] 30 | sub_generated_ids[i] = one_sub_generated_ids 31 | outputs.append(sub_generated_ids) 32 | 33 | return outputs 34 | 35 | 36 | __all__ = ["postprocess_generation_ids"] 37 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/eval_tasks/language_modeling_task.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import Any, Dict, List, Optional 3 | 4 | from torch import LongTensor 5 | 6 | from ._base import BaseTask 7 | 8 | 9 | class LanguageModelingTask(BaseTask): 10 | def __init__( 11 
| self, 12 | model, 13 | tokenizer, 14 | data_name_or_path: str, 15 | prompt_col_name: str, 16 | label_col_name: str, 17 | device: Optional[str] = None, 18 | **kwargs, 19 | ): 20 | kwargs["merge_prompt_label"] = True 21 | super().__init__( 22 | model=model, 23 | tokenizer=tokenizer, 24 | data_name_or_path=data_name_or_path, 25 | prompt_col_name=prompt_col_name, 26 | label_col_name=label_col_name, 27 | device=device, 28 | **kwargs, 29 | ) 30 | 31 | def _predict(self, batch_data: Dict[str, Any], *args, **kwargs) -> List[float]: 32 | outputs = self.model(**batch_data) 33 | loss = outputs.loss.cpu().item() 34 | 35 | return [loss] 36 | 37 | def _parse_labels(self, label_ids: LongTensor) -> List[Any]: 38 | return [] 39 | 40 | def _metric(self, pred: List[Any], label: List[Any]) -> Dict[str, float]: 41 | return {"ppl": math.exp(sum(pred) / len(pred))} 42 | 43 | def run(self) -> Dict[str, float]: 44 | return super().run() 45 | 46 | 47 | __all__ = ["LanguageModelingTask"] 48 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/eval_tasks/sequence_classification_task.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from typing import Any, Dict, List, Optional 3 | 4 | import numpy as np 5 | from torch import LongTensor 6 | from transformers import GenerationConfig, PreTrainedTokenizer 7 | 8 | from ._base import BaseTask 9 | from ._utils.classification_utils import get_closest_label 10 | from ._utils.generation_utils import postprocess_generation_ids 11 | 12 | 13 | def get_predictions( 14 | input_ids: LongTensor, 15 | output_ids: LongTensor, 16 | num_return_sequences: int, 17 | tokenizer: PreTrainedTokenizer, 18 | classes: List[str], 19 | ) -> List[int]: 20 | predictions = [] 21 | generated_texts = postprocess_generation_ids( 22 | input_ids=input_ids, 23 | output_ids=output_ids, 24 | num_return_sequences=num_return_sequences, 25 | tokenizer=tokenizer, 26 | ) 27 | for sub_generated_texts in generated_texts: 28 | sub_predictions = [] 29 | for gen_text in sub_generated_texts: 30 | sub_predictions.append(get_closest_label(gen_text.lower().strip(), classes)) 31 | predictions.append(Counter(sub_predictions).most_common(1)[0][0]) 32 | return predictions 33 | 34 | 35 | class SequenceClassificationTask(BaseTask): 36 | def __init__( 37 | self, 38 | model, 39 | tokenizer: PreTrainedTokenizer, 40 | classes: List[str], 41 | data_name_or_path: str, 42 | prompt_col_name: str, 43 | label_col_name: str, 44 | device: Optional[str] = None, 45 | **kwargs, 46 | ): 47 | kwargs["merge_prompt_label"] = False 48 | super().__init__( 49 | model=model, 50 | tokenizer=tokenizer, 51 | data_name_or_path=data_name_or_path, 52 | prompt_col_name=prompt_col_name, 53 | label_col_name=label_col_name, 54 | device=device, 55 | **kwargs, 56 | ) 57 | self.classes = [each.lower().strip() for each in classes] 58 | classes_ids = self.tokenizer(classes) 59 | self.max_new_tokens = max([len(each) for each in classes_ids]) 60 | 61 | def _predict(self, batch_data: Dict[str, Any], *args, **kwargs) -> List[int]: 62 | generation_config = kwargs["generation_config"] 63 | output_ids = self.model.generate( 64 | input_ids=batch_data["input_ids"], 65 | attention_mask=batch_data["attention_mask"], 66 | generation_config=generation_config, 67 | ) 68 | return get_predictions( 69 | batch_data["input_ids"], 70 | output_ids, 71 | generation_config.num_return_sequences, 72 | self.tokenizer, 73 | 
self.classes, 74 | ) 75 | 76 | def _parse_labels(self, label_ids: LongTensor) -> List[int]: 77 | labels = [] 78 | for one_label_ids in label_ids: 79 | one_label_ids = one_label_ids[(one_label_ids == -100).sum() :] 80 | label = self.tokenizer.decode(one_label_ids, clean_up_tokenization_spaces=True).lower().strip() 81 | label = get_closest_label(label, self.classes) 82 | labels.append(label) 83 | 84 | return labels 85 | 86 | def _metric(self, pred: List[int], label: List[int]) -> Dict[str, float]: 87 | pred = np.array(pred) 88 | label = np.array(label) 89 | 90 | acc = (pred == label).mean() 91 | 92 | return {"acc": acc} 93 | 94 | def run(self, generation_config: Optional[GenerationConfig] = None) -> Dict[str, float]: 95 | if not generation_config: 96 | generation_config = GenerationConfig(num_beams=1, do_sample=False, num_return_sequences=1) 97 | generation_config.max_new_tokens = self.max_new_tokens 98 | generation_config.eos_token_id = self.tokenizer.eos_token_id 99 | generation_config.pad_token_id = self.tokenizer.pad_token_id 100 | return super().run(generation_config=generation_config) 101 | 102 | 103 | __all__ = ["SequenceClassificationTask"] 104 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/eval_tasks/text_summarization_task.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Optional 2 | 3 | import rouge 4 | from torch import LongTensor 5 | from transformers import GenerationConfig 6 | 7 | from ._base import BaseTask 8 | from ._utils.generation_utils import postprocess_generation_ids 9 | 10 | 11 | class TextSummarizationTask(BaseTask): 12 | def __init__( 13 | self, 14 | model, 15 | tokenizer, 16 | data_name_or_path: str, 17 | prompt_col_name: str, 18 | label_col_name: str, 19 | device: Optional[str] = None, 20 | **kwargs, 21 | ): 22 | kwargs["merge_prompt_label"] = False 23 | super().__init__( 24 | model=model, 25 | tokenizer=tokenizer, 26 | data_name_or_path=data_name_or_path, 27 | prompt_col_name=prompt_col_name, 28 | label_col_name=label_col_name, 29 | device=device, 30 | **kwargs, 31 | ) 32 | 33 | def _predict(self, batch_data: Dict[str, Any], *args, **kwargs) -> List[str]: 34 | generation_config = kwargs["generation_config"] 35 | output_ids = self.model.generate( 36 | input_ids=batch_data["input_ids"], 37 | attention_mask=batch_data["attention_mask"], 38 | generation_config=generation_config, 39 | ) 40 | return [ 41 | each[0].lower().strip() 42 | for each in postprocess_generation_ids( 43 | input_ids=batch_data["input_ids"], 44 | output_ids=output_ids, 45 | num_return_sequences=generation_config.num_return_sequences, 46 | tokenizer=self.tokenizer, 47 | ) 48 | ] 49 | 50 | def _parse_labels(self, label_ids: LongTensor) -> List[str]: 51 | labels = [] 52 | for one_label_ids in label_ids: 53 | one_label_ids = one_label_ids[(one_label_ids == -100).sum() :] 54 | label = self.tokenizer.decode(one_label_ids).lower().strip() 55 | labels.append(label) 56 | 57 | return labels 58 | 59 | def _metric(self, pred: List[Any], label: List[Any]) -> Dict[str, Dict[str, float]]: 60 | metric = rouge.Rouge() 61 | return metric.get_scores(hyps=pred, refs=label, avg=True) 62 | 63 | def run(self, generation_config: Optional[GenerationConfig] = None) -> Dict[str, float]: 64 | if not generation_config: 65 | generation_config = GenerationConfig(num_beams=1, do_sample=False, max_new_tokens=128) 66 | generation_config.num_return_sequences = 1 67 
| generation_config.eos_token_id = self.tokenizer.eos_token_id 68 | generation_config.pad_token_id = self.tokenizer.pad_token_id 69 | return super().run(generation_config=generation_config) 70 | 71 | 72 | __all__ = ["TextSummarizationTask"] 73 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM, BaseQuantizeConfig 2 | from .auto import GPTQ_CAUSAL_LM_MODEL_MAP, AutoGPTQForCausalLM 3 | from .baichuan import BaiChuanGPTQForCausalLM 4 | from .bloom import BloomGPTQForCausalLM 5 | from .codegen import CodeGenGPTQForCausalLM 6 | from .decilm import DeciLMGPTQForCausalLM 7 | from .gemma import GemmaGPTQForCausalLM 8 | from .gpt2 import GPT2GPTQForCausalLM 9 | from .gpt_bigcode import GPTBigCodeGPTQForCausalLM 10 | from .gpt_neox import GPTNeoXGPTQForCausalLM 11 | from .gptj import GPTJGPTQForCausalLM 12 | from .internlm import InternLMGPTQForCausalLM 13 | from .llama import LlamaGPTQForCausalLM 14 | from .longllama import LongLlamaGPTQForCausalLM 15 | from .mistral import MistralGPTQForCausalLM 16 | from .mixtral import MixtralGPTQForCausalLM 17 | from .moss import MOSSGPTQForCausalLM 18 | from .opt import OPTGPTQForCausalLM 19 | from .qwen import QwenGPTQForCausalLM 20 | from .qwen2 import Qwen2GPTQForCausalLM 21 | from .rw import RWGPTQForCausalLM 22 | from .stablelmepoch import StableLMEpochGPTQForCausalLM 23 | from .xverse import XverseGPTQForCausalLM 24 | from .yi import YiGPTQForCausalLM 25 | from .deepseek import DeepseekGPTQForCausalLM -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/_const.py: -------------------------------------------------------------------------------- 1 | from torch import device 2 | 3 | from ..utils.import_utils import compare_transformers_version 4 | 5 | 6 | CPU = device("cpu") 7 | CUDA_0 = device("cuda:0") 8 | 9 | SUPPORTED_MODELS = [ 10 | "bloom", 11 | "gptj", 12 | "gpt2", 13 | "gpt_neox", 14 | "opt", 15 | "moss", 16 | "gpt_bigcode", 17 | "codegen", 18 | "RefinedWebModel", 19 | "RefinedWeb", 20 | "baichuan", 21 | "internlm", 22 | "qwen", 23 | "xverse", 24 | "deci", 25 | "stablelm_epoch", 26 | "deepseek", 27 | ] 28 | if compare_transformers_version("v4.28.0", op="ge"): 29 | SUPPORTED_MODELS.append("llama") 30 | if compare_transformers_version("v4.30.0", op="ge"): 31 | SUPPORTED_MODELS.append("longllama") 32 | if compare_transformers_version("v4.33.0", op="ge"): 33 | SUPPORTED_MODELS.append("falcon") 34 | if compare_transformers_version("v4.34.0", op="ge"): 35 | SUPPORTED_MODELS.append("mistral") 36 | SUPPORTED_MODELS.append("Yi") 37 | if compare_transformers_version("v4.36.0", op="ge"): 38 | SUPPORTED_MODELS.append("mixtral") 39 | if compare_transformers_version("v4.37.0", op="ge"): 40 | SUPPORTED_MODELS.append("qwen2") 41 | if compare_transformers_version("v4.38.0", op="ge"): 42 | SUPPORTED_MODELS.append("gemma") 43 | 44 | 45 | EXLLAMA_DEFAULT_MAX_INPUT_LENGTH = 2048 46 | 47 | __all__ = ["CPU", "CUDA_0", "SUPPORTED_MODELS", "EXLLAMA_DEFAULT_MAX_INPUT_LENGTH"] 48 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/baichuan.py: -------------------------------------------------------------------------------- 1 | from ._base import 
BaseGPTQForCausalLM 2 | 3 | 4 | class BaiChuanGPTQForCausalLM(BaseGPTQForCausalLM): 5 | layer_type = "DecoderLayer" 6 | layers_block_name = "model.layers" 7 | outside_layer_modules = ["model.embed_tokens", "model.norm"] 8 | inside_layer_modules = [ 9 | ["self_attn.W_pack"], 10 | ["self_attn.o_proj"], 11 | ["mlp.up_proj", "mlp.gate_proj"], 12 | ["mlp.down_proj"], 13 | ] 14 | 15 | 16 | __all__ = ["BaiChuanGPTQForCausalLM"] 17 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/bloom.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM 2 | 3 | 4 | class BloomGPTQForCausalLM(BaseGPTQForCausalLM): 5 | layer_type = "BloomBlock" 6 | layers_block_name = "transformer.h" 7 | outside_layer_modules = [ 8 | "transformer.word_embeddings", 9 | "transformer.word_embeddings_layernorm", 10 | "transformer.ln_f", 11 | ] 12 | inside_layer_modules = [ 13 | ["self_attention.query_key_value"], 14 | ["self_attention.dense"], 15 | ["mlp.dense_h_to_4h"], 16 | ["mlp.dense_4h_to_h"], 17 | ] 18 | 19 | 20 | __all__ = ["BloomGPTQForCausalLM"] 21 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/codegen.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM 2 | 3 | 4 | class CodeGenGPTQForCausalLM(BaseGPTQForCausalLM): 5 | layer_type = "CodeGenBlock" 6 | layers_block_name = "transformer.h" 7 | outside_layer_modules = ["transformer.wte", "transformer.ln_f"] 8 | inside_layer_modules = [ 9 | ["attn.qkv_proj"], 10 | ["attn.out_proj"], 11 | ["mlp.fc_in"], 12 | ["mlp.fc_out"], 13 | ] 14 | 15 | 16 | __all__ = ["CodeGenGPTQForCausalLM"] 17 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/decilm.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | from ..utils.import_utils import compare_transformers_version 4 | from ._base import BaseGPTQForCausalLM 5 | 6 | 7 | if compare_transformers_version("v4.28.0", op="ge"): 8 | from ..nn_modules.fused_llama_attn import FusedLlamaAttentionForQuantizedModel 9 | from ..nn_modules.fused_llama_mlp import FusedLlamaMLPForQuantizedModel 10 | else: 11 | FusedLlamaAttentionForQuantizedModel = None 12 | FusedLlamaMLPForQuantizedModel = None 13 | 14 | logger = getLogger(__name__) 15 | 16 | 17 | class DeciLMGPTQForCausalLM(BaseGPTQForCausalLM): 18 | layer_type = "DeciLMDecoderLayer" 19 | layers_block_name = "model.layers" 20 | outside_layer_modules = ["model.embed_tokens", "model.norm"] 21 | inside_layer_modules = [ 22 | ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], 23 | ["self_attn.o_proj"], 24 | ["mlp.up_proj", "mlp.gate_proj"], 25 | ["mlp.down_proj"], 26 | ] 27 | 28 | fused_attn_module_type = FusedLlamaAttentionForQuantizedModel 29 | fused_mlp_module_type = FusedLlamaMLPForQuantizedModel 30 | 31 | 32 | __all__ = ["DeciLMGPTQForCausalLM"] 33 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/deepseek.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM 2 | 3 | 4 | n_shared_experts = 2 5 | 
n_routed_experts = 64 6 | 7 | 8 | class DeepseekGPTQForCausalLM(BaseGPTQForCausalLM): 9 | layer_type = "DeepseekDecoderLayer" 10 | layers_block_name = "model.layers" 11 | outside_layer_modules = ["model.embed_tokens", "model.norm"] 12 | inside_layer_modules = [ 13 | ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], 14 | ["self_attn.o_proj"], 15 | [f"mlp.experts.{i}.gate_proj" for i in range(n_routed_experts)] + [f"mlp.experts.{i}.up_proj" for i in range(n_routed_experts)], 16 | [f"mlp.experts.{i}.down_proj" for i in range(n_routed_experts)], 17 | ["mlp.shared_experts.gate_proj"] + ["mlp.shared_experts.up_proj"], 18 | ["mlp.shared_experts.down_proj"], 19 | ] 20 | 21 | 22 | __all__ = ["DeepseekGPTQForCausalLM"] 23 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/gemma.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | from ._base import BaseGPTQForCausalLM 4 | 5 | 6 | logger = getLogger(__name__) 7 | 8 | 9 | class GemmaGPTQForCausalLM(BaseGPTQForCausalLM): 10 | layer_type = "GemmaDecoderLayer" 11 | layers_block_name = "model.layers" 12 | outside_layer_modules = ["model.embed_tokens", "model.norm"] 13 | inside_layer_modules = [ 14 | ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], 15 | ["self_attn.o_proj"], 16 | ["mlp.up_proj", "mlp.gate_proj"], 17 | ["mlp.down_proj"], 18 | ] 19 | 20 | 21 | __all__ = ["GemmaGPTQForCausalLM"] 22 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/gpt2.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM 2 | 3 | 4 | class GPT2GPTQForCausalLM(BaseGPTQForCausalLM): 5 | layer_type = "GPT2Block" 6 | layers_block_name = "transformer.h" 7 | outside_layer_modules = ["transformer.wte", "transformer.wpe", "transformer.ln_f"] 8 | inside_layer_modules = [ 9 | ["attn.c_attn"], 10 | ["attn.c_proj"], 11 | ["mlp.c_fc"], 12 | ["mlp.c_proj"], 13 | ] 14 | 15 | 16 | __all__ = ["GPT2GPTQForCausalLM"] 17 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/gpt_bigcode.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM 2 | 3 | 4 | class GPTBigCodeGPTQForCausalLM(BaseGPTQForCausalLM): 5 | layer_type = "GPTBigCodeBlock" 6 | layers_block_name = "transformer.h" 7 | outside_layer_modules = ["transformer.wpe", "transformer.wte", "transformer.ln_f"] 8 | inside_layer_modules = [ 9 | ["attn.c_attn"], 10 | ["attn.c_proj"], 11 | ["mlp.c_fc"], 12 | ["mlp.c_proj"], 13 | ] 14 | 15 | 16 | __all__ = ["GPTBigCodeGPTQForCausalLM"] 17 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/gpt_neox.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM 2 | 3 | 4 | class GPTNeoXGPTQForCausalLM(BaseGPTQForCausalLM): 5 | layer_type = "GPTNeoXLayer" 6 | layers_block_name = "gpt_neox.layers" 7 | outside_layer_modules = ["gpt_neox.embed_in", "gpt_neox.final_layer_norm"] 8 | inside_layer_modules = [ 9 | ["attention.query_key_value"], 10 | ["attention.dense"], 11 | ["mlp.dense_h_to_4h"], 12 |
["mlp.dense_4h_to_h"], 13 | ] 14 | lm_head_name = "embed_out" 15 | 16 | 17 | __all__ = ["GPTNeoXGPTQForCausalLM"] 18 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/gptj.py: -------------------------------------------------------------------------------- 1 | from ..nn_modules.fused_gptj_attn import FusedGPTJAttentionForQuantizedModel 2 | from ._base import BaseGPTQForCausalLM 3 | 4 | 5 | class GPTJGPTQForCausalLM(BaseGPTQForCausalLM): 6 | layer_type = "GPTJBlock" 7 | layers_block_name = "transformer.h" 8 | outside_layer_modules = ["transformer.wte", "transformer.ln_f"] 9 | inside_layer_modules = [ 10 | ["attn.k_proj", "attn.v_proj", "attn.q_proj"], 11 | ["attn.out_proj"], 12 | ["mlp.fc_in"], 13 | ["mlp.fc_out"], 14 | ] 15 | 16 | fused_attn_module_type = FusedGPTJAttentionForQuantizedModel 17 | 18 | 19 | __all__ = ["GPTJGPTQForCausalLM"] 20 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/internlm.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM 2 | 3 | 4 | class InternLMGPTQForCausalLM(BaseGPTQForCausalLM): 5 | layer_type = "InternLMDecoderLayer" 6 | layers_block_name = "model.layers" 7 | outside_layer_modules = ["model.embed_tokens", "model.norm"] 8 | inside_layer_modules = [ 9 | ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], 10 | ["self_attn.o_proj"], 11 | ["mlp.up_proj", "mlp.gate_proj"], 12 | ["mlp.down_proj"], 13 | ] 14 | 15 | 16 | __all__ = ["InternLMGPTQForCausalLM"] 17 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/llama.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | from ..utils.import_utils import compare_transformers_version 4 | from ._base import BaseGPTQForCausalLM 5 | 6 | 7 | if compare_transformers_version("v4.28.0", op="ge"): 8 | from ..nn_modules.fused_llama_attn import FusedLlamaAttentionForQuantizedModel 9 | from ..nn_modules.fused_llama_mlp import FusedLlamaMLPForQuantizedModel 10 | else: 11 | FusedLlamaAttentionForQuantizedModel = None 12 | FusedLlamaMLPForQuantizedModel = None 13 | 14 | logger = getLogger(__name__) 15 | 16 | 17 | class LlamaGPTQForCausalLM(BaseGPTQForCausalLM): 18 | layer_type = "LlamaDecoderLayer" 19 | layers_block_name = "model.layers" 20 | outside_layer_modules = ["model.embed_tokens", "model.norm"] 21 | inside_layer_modules = [ 22 | ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], 23 | ["self_attn.o_proj"], 24 | ["mlp.up_proj", "mlp.gate_proj"], 25 | ["mlp.down_proj"], 26 | ] 27 | 28 | fused_attn_module_type = FusedLlamaAttentionForQuantizedModel 29 | fused_mlp_module_type = FusedLlamaMLPForQuantizedModel 30 | 31 | 32 | __all__ = ["LlamaGPTQForCausalLM"] 33 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/longllama.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | from ..utils.import_utils import compare_transformers_version 4 | from ._base import BaseGPTQForCausalLM 5 | 6 | 7 | if compare_transformers_version("v4.28.0", op="ge"): 8 | from ..nn_modules.fused_llama_attn import 
FusedLlamaAttentionForQuantizedModel 9 | from ..nn_modules.fused_llama_mlp import FusedLlamaMLPForQuantizedModel 10 | else: 11 | FusedLlamaAttentionForQuantizedModel = None 12 | FusedLlamaMLPForQuantizedModel = None 13 | 14 | logger = getLogger(__name__) 15 | 16 | 17 | class LongLlamaGPTQForCausalLM(BaseGPTQForCausalLM): 18 | layer_type = "LongLlamaDecoderLayer" 19 | layers_block_name = "model.layers" 20 | outside_layer_modules = ["model.embed_tokens", "model.norm"] 21 | inside_layer_modules = [ 22 | ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], 23 | ["self_attn.o_proj"], 24 | ["mlp.up_proj", "mlp.gate_proj"], 25 | ["mlp.down_proj"], 26 | ] 27 | 28 | fused_attn_module_type = FusedLlamaAttentionForQuantizedModel 29 | fused_mlp_module_type = FusedLlamaMLPForQuantizedModel 30 | 31 | 32 | __all__ = ["LongLlamaGPTQForCausalLM"] 33 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/mistral.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM 2 | from ..utils.import_utils import compare_transformers_version 3 | 4 | if compare_transformers_version("v4.28.0", op="ge"): 5 | from ..nn_modules.fused_llama_attn import FusedLlamaAttentionForQuantizedModel 6 | from ..nn_modules.fused_llama_mlp import FusedLlamaMLPForQuantizedModel 7 | else: 8 | FusedLlamaAttentionForQuantizedModel = None 9 | FusedLlamaMLPForQuantizedModel = None 10 | 11 | 12 | class MistralGPTQForCausalLM(BaseGPTQForCausalLM): 13 | layer_type = "MistralDecoderLayer" 14 | layers_block_name = "model.layers" 15 | outside_layer_modules = ["model.embed_tokens", "model.norm"] 16 | inside_layer_modules = [ 17 | ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], 18 | ["self_attn.o_proj"], 19 | ["mlp.up_proj", "mlp.gate_proj"], 20 | ["mlp.down_proj"], 21 | ] 22 | 23 | fused_attn_module_type = FusedLlamaAttentionForQuantizedModel 24 | fused_mlp_module_type = FusedLlamaMLPForQuantizedModel 25 | 26 | __all__ = ["MistralGPTQForCausalLM"] 27 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/mixtral.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM 2 | 3 | 4 | class MixtralGPTQForCausalLM(BaseGPTQForCausalLM): 5 | layer_type = "MixtralDecoderLayer" 6 | layers_block_name = "model.layers" 7 | outside_layer_modules = ["model.embed_tokens", "model.norm"] 8 | inside_layer_modules = [ 9 | ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], 10 | ["self_attn.o_proj"], 11 | [ 12 | "block_sparse_moe.experts.0.w1", 13 | "block_sparse_moe.experts.1.w1", 14 | "block_sparse_moe.experts.2.w1", 15 | "block_sparse_moe.experts.3.w1", 16 | "block_sparse_moe.experts.4.w1", 17 | "block_sparse_moe.experts.5.w1", 18 | "block_sparse_moe.experts.6.w1", 19 | "block_sparse_moe.experts.7.w1", 20 | "block_sparse_moe.experts.0.w3", 21 | "block_sparse_moe.experts.1.w3", 22 | "block_sparse_moe.experts.2.w3", 23 | "block_sparse_moe.experts.3.w3", 24 | "block_sparse_moe.experts.4.w3", 25 | "block_sparse_moe.experts.5.w3", 26 | "block_sparse_moe.experts.6.w3", 27 | "block_sparse_moe.experts.7.w3", 28 | ], 29 | [ 30 | "block_sparse_moe.experts.0.w2", 31 | "block_sparse_moe.experts.1.w2", 32 | "block_sparse_moe.experts.2.w2", 33 | "block_sparse_moe.experts.3.w2", 34 | "block_sparse_moe.experts.4.w2", 35 | 
"block_sparse_moe.experts.5.w2", 36 | "block_sparse_moe.experts.6.w2", 37 | "block_sparse_moe.experts.7.w2", 38 | ], 39 | ] 40 | 41 | 42 | __all__ = ["MixtralGPTQForCausalLM"] 43 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/moss.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM 2 | 3 | 4 | class MOSSGPTQForCausalLM(BaseGPTQForCausalLM): 5 | layer_type = "MossBlock" 6 | layers_block_name = "transformer.h" 7 | outside_layer_modules = ["transformer.wte", "transformer.ln_f"] 8 | inside_layer_modules = [ 9 | ["attn.qkv_proj"], 10 | ["attn.out_proj"], 11 | ["mlp.fc_in"], 12 | ["mlp.fc_out"], 13 | ] 14 | 15 | 16 | __all__ = ["MOSSGPTQForCausalLM"] 17 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/opt.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM 2 | 3 | 4 | class OPTGPTQForCausalLM(BaseGPTQForCausalLM): 5 | layer_type = "OPTDecoderLayer" 6 | layers_block_name = "model.decoder.layers" 7 | outside_layer_modules = [ 8 | "model.decoder.embed_tokens", 9 | "model.decoder.embed_positions", 10 | "model.decoder.project_out", 11 | "model.decoder.project_in", 12 | "model.decoder.final_layer_norm", 13 | ] 14 | inside_layer_modules = [ 15 | ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], 16 | ["self_attn.out_proj"], 17 | ["fc1"], 18 | ["fc2"], 19 | ] 20 | 21 | 22 | __all__ = ["OPTGPTQForCausalLM"] 23 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/qwen.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM 2 | 3 | 4 | class QwenGPTQForCausalLM(BaseGPTQForCausalLM): 5 | layer_type = "QWenBlock" 6 | layers_block_name = "transformer.h" 7 | outside_layer_modules = [ 8 | "transformer.wte", 9 | "transformer.wpe", 10 | "transformer.ln_f", 11 | "transformer.visual", 12 | ] 13 | inside_layer_modules = [ 14 | ["attn.c_attn"], 15 | ["attn.c_proj"], 16 | ["mlp.w1", "mlp.w2"], 17 | ["mlp.c_proj"], 18 | ] 19 | 20 | 21 | __all__ = ["QwenGPTQForCausalLM"] 22 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/qwen2.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM 2 | 3 | 4 | class Qwen2GPTQForCausalLM(BaseGPTQForCausalLM): 5 | layer_type = "Qwen2DecoderLayer" 6 | layers_block_name = "model.layers" 7 | outside_layer_modules = ["model.embed_tokens", "model.norm"] 8 | inside_layer_modules = [ 9 | ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], 10 | ["self_attn.o_proj"], 11 | ["mlp.up_proj", "mlp.gate_proj"], 12 | ["mlp.down_proj"], 13 | ] 14 | 15 | 16 | __all__ = ["Qwen2GPTQForCausalLM"] 17 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/rw.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM 2 | 3 | 4 | class RWGPTQForCausalLM(BaseGPTQForCausalLM): 5 | layer_type = "DecoderLayer" 6 | layers_block_name = "transformer.h" 
7 | outside_layer_modules = ["transformer.word_embeddings", "transformer.ln_f"] 8 | inside_layer_modules = [ 9 | ["self_attention.query_key_value"], 10 | ["self_attention.dense"], 11 | ["mlp.dense_h_to_4h"], 12 | ["mlp.dense_4h_to_h"], 13 | ] 14 | 15 | 16 | __all__ = ["RWGPTQForCausalLM"] 17 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/stablelmepoch.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | from ..utils.import_utils import compare_transformers_version 4 | from ._base import BaseGPTQForCausalLM 5 | 6 | 7 | if compare_transformers_version("v4.28.0", op="ge"): 8 | from ..nn_modules.fused_llama_attn import FusedLlamaAttentionForQuantizedModel 9 | from ..nn_modules.fused_llama_mlp import FusedLlamaMLPForQuantizedModel 10 | else: 11 | FusedLlamaAttentionForQuantizedModel = None 12 | FusedLlamaMLPForQuantizedModel = None 13 | 14 | logger = getLogger(__name__) 15 | 16 | 17 | class StableLMEpochGPTQForCausalLM(BaseGPTQForCausalLM): 18 | layer_type = "DecoderLayer" 19 | layers_block_name = "model.layers" 20 | outside_layer_modules = ["model.embed_tokens", "model.norm"] 21 | inside_layer_modules = [ 22 | ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], 23 | ["self_attn.o_proj"], 24 | ["mlp.up_proj", "mlp.gate_proj"], 25 | ["mlp.down_proj"], 26 | ] 27 | 28 | fused_attn_module_type = FusedLlamaAttentionForQuantizedModel 29 | fused_mlp_module_type = FusedLlamaMLPForQuantizedModel 30 | 31 | 32 | __all__ = ["StableLMEpochGPTQForCausalLM"] 33 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/xverse.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | from ..utils.import_utils import compare_transformers_version 4 | from ._base import BaseGPTQForCausalLM 5 | 6 | 7 | if compare_transformers_version("v4.28.0", op="ge"): 8 | from ..nn_modules.fused_llama_attn import FusedLlamaAttentionForQuantizedModel 9 | from ..nn_modules.fused_llama_mlp import FusedLlamaMLPForQuantizedModel 10 | else: 11 | FusedLlamaAttentionForQuantizedModel = None 12 | FusedLlamaMLPForQuantizedModel = None 13 | 14 | logger = getLogger(__name__) 15 | 16 | 17 | class XverseGPTQForCausalLM(BaseGPTQForCausalLM): 18 | layer_type = "XverseDecoderLayer" 19 | layers_block_name = "model.layers" 20 | outside_layer_modules = ["model.embed_tokens", "model.norm"] 21 | inside_layer_modules = [ 22 | ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], 23 | ["self_attn.o_proj"], 24 | ["mlp.up_proj", "mlp.gate_proj"], 25 | ["mlp.down_proj"], 26 | ] 27 | 28 | fused_attn_module_type = FusedLlamaAttentionForQuantizedModel 29 | fused_mlp_module_type = FusedLlamaMLPForQuantizedModel 30 | 31 | 32 | __all__ = ["XverseGPTQForCausalLM"] 33 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/yi.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | from ..utils.import_utils import compare_transformers_version 4 | from ._base import BaseGPTQForCausalLM 5 | 6 | 7 | if compare_transformers_version("v4.28.0", op="ge"): 8 | from ..nn_modules.fused_llama_attn import FusedLlamaAttentionForQuantizedModel 9 | from 
..nn_modules.fused_llama_mlp import FusedLlamaMLPForQuantizedModel 10 | else: 11 | FusedLlamaAttentionForQuantizedModel = None 12 | FusedLlamaMLPForQuantizedModel = None 13 | 14 | logger = getLogger(__name__) 15 | 16 | 17 | class YiGPTQForCausalLM(BaseGPTQForCausalLM): 18 | layer_type = "YiDecoderLayer" 19 | layers_block_name = "model.layers" 20 | outside_layer_modules = ["model.embed_tokens", "model.norm"] 21 | inside_layer_modules = [ 22 | ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], 23 | ["self_attn.o_proj"], 24 | ["mlp.up_proj", "mlp.gate_proj"], 25 | ["mlp.down_proj"], 26 | ] 27 | 28 | fused_attn_module_type = FusedLlamaAttentionForQuantizedModel 29 | fused_mlp_module_type = FusedLlamaMLPForQuantizedModel 30 | 31 | 32 | __all__ = ["YiGPTQForCausalLM"] 33 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/nn_modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/nn_modules/__init__.py -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/nn_modules/_fused_base.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from logging import getLogger 3 | 4 | import torch.nn as nn 5 | 6 | from .triton_utils.mixin import TritonModuleMixin 7 | 8 | 9 | logger = getLogger(__name__) 10 | 11 | 12 | class FusedBaseModule(nn.Module, TritonModuleMixin): 13 | @classmethod 14 | @abstractmethod 15 | def inject_to_model(cls, *args, **kwargs): 16 | raise NotImplementedError() 17 | 18 | 19 | class FusedBaseAttentionModule(FusedBaseModule): 20 | @classmethod 21 | @abstractmethod 22 | def inject_to_model( 23 | cls, model, use_triton=False, group_size=-1, use_cuda_fp16=True, desc_act=False, trainable=False, **kwargs 24 | ): 25 | raise NotImplementedError() 26 | 27 | @classmethod 28 | def warmup(cls, model, transpose=False, seqlen=2048): 29 | pass 30 | 31 | 32 | class FusedBaseMLPModule(FusedBaseModule): 33 | @classmethod 34 | @abstractmethod 35 | def inject_to_model(cls, model, use_triton=False, **kwargs): 36 | raise NotImplementedError() 37 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/nn_modules/qlinear/__init__.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | class GeneralQuantLinear(nn.Linear): 5 | def __init__(self, quant_linear_module): 6 | super().__init__( 7 | in_features=quant_linear_module.infeatures, 8 | out_features=quant_linear_module.outfeatures, 9 | bias=True, 10 | ) 11 | self.infeatures = quant_linear_module.infeatures 12 | self.outfeatures = quant_linear_module.outfeatures 13 | self.bits = quant_linear_module.bits 14 | self.group_size = quant_linear_module.group_size 15 | self.maxq = quant_linear_module.maxq 16 | 17 | self.weight.requires_grad = False 18 | 19 | self.weight.data = quant_linear_module.qweight 20 | self.register_buffer("qweight", quant_linear_module.qweight) 21 | self.bias.data = quant_linear_module.bias 22 | 23 | self.qweight.requires_grad = False 24 | self.bias.requires_grad = False 25 | 26 | self.register_buffer("qzeros", quant_linear_module.qzeros) 27 | 
self.register_buffer("scales", quant_linear_module.scales) 28 | self.register_buffer("g_idx", quant_linear_module.g_idx) 29 | 30 | if hasattr(quant_linear_module, "wf"): 31 | self.wf = quant_linear_module.wf 32 | if hasattr(quant_linear_module, "kernel_switch_threshold"): 33 | self.kernel_switch_threshold = quant_linear_module.kernel_switch_threshold 34 | if hasattr(quant_linear_module, "autogptq_cuda_available"): 35 | self.autogptq_cuda_available = quant_linear_module.autogptq_cuda_available 36 | 37 | self.trainable = quant_linear_module.trainable 38 | 39 | self.forward = quant_linear_module.forward 40 | 41 | @classmethod 42 | def inject_to_model(cls, model, target_module_type): 43 | for name, m in model.named_modules(): 44 | if not isinstance(m, target_module_type): 45 | continue 46 | new_m = cls(m) 47 | if "." in name: 48 | parent_name = name.rsplit(".", 1)[0] 49 | child_name = name[len(parent_name) + 1 :] 50 | parent = model.get_submodule(parent_name) 51 | else: 52 | parent_name = "" 53 | parent = model 54 | child_name = name 55 | 56 | setattr(parent, child_name, new_m) 57 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/nn_modules/triton_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/nn_modules/triton_utils/__init__.py -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/nn_modules/triton_utils/mixin.py: -------------------------------------------------------------------------------- 1 | class TritonModuleMixin: 2 | @classmethod 3 | def warmup(cls, model, transpose=False, seqlen=2048): 4 | pass 5 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | from .gptq import GPTQ 2 | from .quantizer import Quantizer, quantize 3 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .perplexity_utils import Perplexity 2 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/utils/exllama_utils.py: -------------------------------------------------------------------------------- 1 | import gc 2 | 3 | import torch 4 | 5 | from ..nn_modules.qlinear.qlinear_exllama import QuantLinear as ExllamaQuantLinear 6 | 7 | 8 | def exllama_set_max_input_length(model, max_input_length: int): 9 | """ 10 | This method does not necessarily require `model` to inherit from BaseGPTQForCausalLM. 11 | 12 | When using the exllama backend with act-order, it is necessary to initialize a buffer that depends on the maximum expected input length. In case the 13 | default used (EXLLAMA_DEFAULT_MAX_INPUT_LENGTH) is too short, this method can be called to extend the buffer size without reloading the whole model. 14 | """ 15 | 16 | # The import is set here to avoid a global import. Arguably this is quite ugly, it would be better to have lazy loading. 
17 | from exllama_kernels import cleanup_buffers_cuda, prepare_buffers 18 | 19 | if not model.quantize_config.desc_act: 20 | raise ValueError( 21 | "The method exllama_set_max_input_length should be called only when using the exllama backend **with act-order**." 22 | ) 23 | 24 | uses_exllama = False 25 | for name, submodule in model.named_modules(): 26 | if isinstance(submodule, ExllamaQuantLinear): 27 | uses_exllama = True 28 | 29 | if not uses_exllama: 30 | raise ValueError( 31 | f"The function exllama_set_max_input_length was called, but the model (instance of {model.__class__.__name__}) does not use the exllama backend for GPTQ. An other implementation is used (exllamav2, cuda, cuda-old, triton) and that the call to exllama_set_max_input_length is unnecessary. Please remove the call to exllama_set_max_input_length or use the exllama v1 backend." 32 | ) 33 | 34 | device_to_buffers_size = {} 35 | for device, buffers in model.device_to_buffers.items(): 36 | device_to_buffers_size[device] = { 37 | "max_dq_buffer_size": buffers["max_dq_buffer_size"], 38 | "max_inner_outer_dim": buffers["max_inner_outer_dim"], 39 | } 40 | 41 | # For an unknown reason calling just `del model.device_to_buffers` raises an AttributeError. 42 | for key in list(model.device_to_buffers.keys()): 43 | del model.device_to_buffers[key] 44 | model.device_to_buffers = None 45 | del model.device_to_buffers 46 | 47 | gc.collect() 48 | torch.cuda.empty_cache() 49 | cleanup_buffers_cuda() 50 | 51 | device_to_buffers = {} 52 | for device, buffers_size in device_to_buffers_size.items(): 53 | # The temp_state buffer is required to reorder X in the act-order case. 54 | # The temp_dq buffer is required to dequantize weights when using cuBLAS, typically for the prefill. 55 | device_to_buffers[device] = { 56 | "temp_state": torch.zeros( 57 | (max_input_length, buffers_size["max_inner_outer_dim"]), 58 | dtype=torch.float16, 59 | device=device, 60 | ), 61 | "temp_dq": torch.zeros( 62 | (1, buffers_size["max_dq_buffer_size"]), 63 | dtype=torch.float16, 64 | device=device, 65 | ), 66 | "max_dq_buffer_size": buffers_size["max_dq_buffer_size"], 67 | "max_inner_outer_dim": buffers_size["max_inner_outer_dim"], 68 | } 69 | 70 | prepare_buffers( 71 | device, 72 | device_to_buffers[device]["temp_state"], 73 | device_to_buffers[device]["temp_dq"], 74 | ) 75 | 76 | # Buffers need to be persistent to avoid any bug. 
77 | model.device_to_buffers = device_to_buffers 78 | 79 | return model 80 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/utils/import_utils.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | from typing import Optional 3 | 4 | import torch 5 | from packaging.version import parse as parse_version 6 | 7 | 8 | try: 9 | import triton # noqa: F401 10 | 11 | TRITON_AVAILABLE = True 12 | except ImportError: 13 | TRITON_AVAILABLE = False 14 | 15 | try: 16 | import autogptq_cuda_64 # noqa: F401 17 | 18 | AUTOGPTQ_CUDA_AVAILABLE = True 19 | except Exception: 20 | AUTOGPTQ_CUDA_AVAILABLE = False 21 | 22 | 23 | try: 24 | import exllama_kernels # noqa: F401 25 | 26 | EXLLAMA_KERNELS_AVAILABLE = True 27 | except Exception: 28 | EXLLAMA_KERNELS_AVAILABLE = False 29 | 30 | try: 31 | import exllamav2_kernels # noqa: F401 32 | 33 | EXLLAMAV2_KERNELS_AVAILABLE = True 34 | except Exception: 35 | EXLLAMAV2_KERNELS_AVAILABLE = False 36 | 37 | try: 38 | import cQIGen # noqa: F401 39 | 40 | QIGEN_AVAILABLE = True 41 | QIGEN_EXCEPTION = None 42 | except Exception as e: 43 | QIGEN_AVAILABLE = False 44 | QIGEN_EXCEPTION = e 45 | 46 | try: 47 | import autogptq_marlin_cuda # noqa: F401 48 | 49 | MARLIN_AVAILABLE = True 50 | MARLIN_EXCEPTION = None 51 | except Exception as e: 52 | MARLIN_AVAILABLE = False 53 | MARLIN_EXCEPTION = e 54 | 55 | 56 | logger = getLogger(__name__) 57 | 58 | 59 | def dynamically_import_QuantLinear( 60 | use_triton: bool, 61 | desc_act: bool, 62 | group_size: int, 63 | bits: int, 64 | disable_exllama: Optional[bool] = None, 65 | disable_exllamav2: bool = False, 66 | use_qigen: bool = False, 67 | disable_marlin: bool = True, 68 | ): 69 | if use_qigen: 70 | if not QIGEN_AVAILABLE: 71 | raise ValueError( 72 | f"QIGen appears to be not available with the error: {QIGEN_EXCEPTION}. Please check your installation or use `use_qigen=False`." 73 | ) 74 | from ..nn_modules.qlinear.qlinear_qigen import QuantLinear 75 | else: 76 | if use_triton: 77 | if torch.version.hip: 78 | logger.warning( 79 | "Running GPTQ triton version on AMD GPUs is untested and may result in errors or wrong predictions. Please use use_triton=False." 80 | ) 81 | 82 | from ..nn_modules.qlinear.qlinear_triton import QuantLinear 83 | else: 84 | # If disable_exllamav2 is True, we want to fall back on the exllama kernel and not the cuda/cuda_old ones. 
85 | if disable_exllama is None: 86 | if disable_exllamav2: 87 | disable_exllama = False 88 | else: 89 | disable_exllama = True 90 | if bits == 4 and not disable_marlin: 91 | from ..nn_modules.qlinear.qlinear_marlin import QuantLinear 92 | elif bits == 4 and not disable_exllamav2 and EXLLAMAV2_KERNELS_AVAILABLE: 93 | from ..nn_modules.qlinear.qlinear_exllamav2 import QuantLinear 94 | elif bits == 4 and not disable_exllama and EXLLAMA_KERNELS_AVAILABLE: 95 | from ..nn_modules.qlinear.qlinear_exllama import QuantLinear 96 | elif not desc_act or group_size == -1: 97 | from ..nn_modules.qlinear.qlinear_cuda_old import QuantLinear 98 | else: 99 | from ..nn_modules.qlinear.qlinear_cuda import QuantLinear 100 | 101 | return QuantLinear 102 | 103 | 104 | def compare_transformers_version(version: str = "v4.28.0", op: str = "eq"): 105 | assert op in ["eq", "lt", "le", "gt", "ge"] 106 | 107 | from transformers import __version__ 108 | 109 | return getattr(parse_version(__version__), f"__{op}__")(parse_version(version)) 110 | 111 | 112 | def compare_pytorch_version(version: str = "v2.0.0", op: str = "eq"): 113 | assert op in ["eq", "lt", "le", "gt", "ge"] 114 | 115 | from torch import __version__ 116 | 117 | return getattr(parse_version(__version__), f"__{op}__")(parse_version(version)) 118 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/utils/modeling_utils.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | 4 | def recurse_getattr(obj, attr: str): 5 | """ 6 | Recursive `getattr`. 7 | 8 | Args: 9 | obj: 10 | A class instance holding the attribute. 11 | attr (`str`): 12 | The attribute that is to be retrieved, e.g. 'attribute1.attribute2'. 13 | """ 14 | 15 | def _getattr(obj, attr): 16 | return getattr(obj, attr) 17 | 18 | return functools.reduce(_getattr, [obj] + attr.split(".")) 19 | 20 | 21 | def recurse_setattr(module, name, value): 22 | """A function to recursively set attributes to a module.""" 23 | if "." not in name: 24 | setattr(module, name, value) 25 | else: 26 | name, rest = name.split(".", 1) 27 | recurse_setattr(getattr(module, name), rest, value) 28 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/modelutils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | DEV = torch.device('cuda:0') 6 | 7 | 8 | def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''): 9 | if type(module) in layers: 10 | return {name: module} 11 | res = {} 12 | for name1, child in module.named_children(): 13 | res.update(find_layers( 14 | child, layers=layers, name=name + '.' 
+ name1 if name != '' else name1 15 | )) 16 | return res 17 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/setup_cuda.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, Extension 2 | from torch.utils import cpp_extension 3 | 4 | setup( 5 | name='quant_cuda', 6 | ext_modules=[cpp_extension.CUDAExtension( 7 | 'quant_cuda', ['quant_cuda.cpp', 'quant_cuda_kernel.cu'] 8 | )], 9 | cmdclass={'build_ext': cpp_extension.BuildExtension} 10 | ) 11 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/test_kernel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | import quant_cuda 5 | 6 | torch.backends.cuda.matmul.allow_tf32 = False 7 | torch.backends.cudnn.allow_tf32 = False 8 | 9 | print('Benchmarking OPT-175B FC2 matvec ...') 10 | 11 | DEV = torch.device('cuda:0') 12 | 13 | M = 12288 * 4 14 | N = 12288 15 | 16 | DTYPE = torch.half 17 | mat = torch.randn((M, N), device=DEV, dtype=DTYPE) 18 | vec = torch.randn((1, M), device=DEV, dtype=DTYPE) 19 | mul = torch.zeros((1, N), device=DEV, dtype=DTYPE) 20 | 21 | COUNT = 1000 22 | import time 23 | tick = time.time() 24 | for _ in range(COUNT): 25 | torch.matmul(vec, mat, out=mul) 26 | torch.cuda.synchronize() 27 | print('FP16:', (time.time() - tick) / COUNT) 28 | 29 | DTYPE = torch.float 30 | mat = mat.to(DTYPE) 31 | vec = vec.to(DTYPE) 32 | mul = mul.to(DTYPE) 33 | 34 | mat = torch.randint(-1000000000, 1000000000, (M // 1024 * 96, N), device=DEV, dtype=torch.int) 35 | scales = torch.randn(N, device=DEV, dtype=DTYPE) 36 | zeros = torch.randn(N, device=DEV, dtype=DTYPE) 37 | 38 | COUNT = 1000 39 | import time 40 | tick = time.time() 41 | for _ in range(COUNT): 42 | quant_cuda.vecquant3matmul(vec, mat, mul, scales, zeros) 43 | torch.cuda.synchronize() 44 | print('3bit:', (time.time() - tick) / COUNT) 45 | 46 | COUNT = 1000 47 | import time 48 | tick = time.time() 49 | for _ in range(COUNT): 50 | quant_cuda.vecquant3matmul_faster(vec, mat, mul, scales, zeros) 51 | torch.cuda.synchronize() 52 | print('3bit:', (time.time() - tick) / COUNT, '(faster)') 53 | 54 | print('Verifiying kernel correctness ...') 55 | 56 | M = 4 * 4096 57 | N = 4096 58 | 59 | layer = nn.Linear(M, N) 60 | vec = torch.randn(M).to(DEV) 61 | 62 | from quant import * 63 | quantizer = Quantizer() 64 | quantizer.configure(3, perchannel=True, sym=False, mse=False) 65 | quantizer.find_params(layer.weight.data, weight=True) 66 | layer.weight.data = quantize( 67 | layer.weight.data, quantizer.scale, quantizer.zero, quantizer.maxq 68 | ) 69 | 70 | qlayer = Quant3Linear(layer.in_features, layer.out_features) 71 | qlayer.pack(layer, quantizer.scale, quantizer.zero) 72 | 73 | qlayer = qlayer.to(DEV) 74 | layer = layer.to(DEV) 75 | 76 | with torch.no_grad(): 77 | print('Simu:', layer.to(DEV)(vec)) 78 | print('Kern:', qlayer(vec)) 79 | qlayer.faster = True 80 | print('Kern:', qlayer(vec.half()), '(faster)') 81 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/compression/quantization/__init__.py 
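A note on the two standalone scripts above: test_kernel.py imports the quant_cuda extension (and a quant module with Quantizer/Quant3Linear that is not part of this dump), so the extension built by setup_cuda.py must be installed first. A minimal sketch of that sequence, run from the AutoGPTQ directory shown here and expressed in Python via subprocess purely for illustration:

# Hypothetical build-and-run sequence; equivalent to typing the two commands in a shell.
import subprocess

subprocess.run(["python", "setup_cuda.py", "install"], check=True)  # compiles and installs the quant_cuda extension
subprocess.run(["python", "test_kernel.py"], check=True)            # runs the matvec benchmark and correctness check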
-------------------------------------------------------------------------------- /src/llmtuner/compression/tuner.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Any, Dict, List, Optional 2 | 3 | import torch 4 | from transformers import PreTrainedModel 5 | # from .dpo import run_dpo 6 | # from .ppo import run_ppo 7 | from llmtuner.compression.prune import run_prune 8 | # from .pt import run_pt 9 | # from .rm import run_rm 10 | # from .sft import run_sft 11 | from ..extras.callbacks import LogCallback 12 | from ..extras.logging import get_logger 13 | from ..hparams import get_infer_args, get_train_sparse_args 14 | from ..model import load_model_and_tokenizer 15 | 16 | if TYPE_CHECKING: 17 | from transformers import TrainerCallback 18 | 19 | logger = get_logger(__name__) 20 | 21 | 22 | def run_exp(args: Optional[Dict[str, Any]] = None, callbacks: Optional[List["TrainerCallback"]] = None): 23 | # model_args, data_args, training_args, finetuning_args, generating_args = get_train_args(args) 24 | model_args, data_args, training_args, finetuning_args, generating_args, pruning_args = get_train_sparse_args(args) 25 | callbacks = [LogCallback()] if callbacks is None else callbacks 26 | if finetuning_args.stage == "prune": # 🔍 27 | run_prune(model_args, data_args, training_args, finetuning_args, pruning_args, callbacks) 28 | 29 | def export_model(args: Optional[Dict[str, Any]] = None): 30 | model_args, _, finetuning_args, _ = get_infer_args(args) 31 | 32 | if model_args.export_dir is None: 33 | raise ValueError("Please specify `export_dir`.") 34 | 35 | if model_args.adapter_name_or_path is not None and model_args.export_quantization_bit is not None: 36 | raise ValueError("Please merge adapters before quantizing the model.") 37 | 38 | model, tokenizer = load_model_and_tokenizer(model_args, finetuning_args) 39 | 40 | if getattr(model, "quantization_method", None) and model_args.adapter_name_or_path is not None: 41 | raise ValueError("Cannot merge adapters to a quantized model.") 42 | 43 | if not isinstance(model, PreTrainedModel): 44 | raise ValueError("The model is not a `PreTrainedModel`, export aborted.") 45 | 46 | if getattr(model, "quantization_method", None): 47 | model = model.to("cpu") 48 | elif hasattr(model.config, "torch_dtype"): 49 | model = model.to(getattr(model.config, "torch_dtype")).to("cpu") 50 | else: 51 | model = model.to(torch.float16).to("cpu") 52 | setattr(model.config, "torch_dtype", torch.float16) 53 | 54 | model.save_pretrained( 55 | save_directory=model_args.export_dir, 56 | max_shard_size="{}GB".format(model_args.export_size), 57 | safe_serialization=(not model_args.export_legacy_format), 58 | ) 59 | if model_args.export_hub_model_id is not None: 60 | model.push_to_hub( 61 | model_args.export_hub_model_id, 62 | token=model_args.hf_hub_token, 63 | max_shard_size="{}GB".format(model_args.export_size), 64 | safe_serialization=(not model_args.export_legacy_format), 65 | ) 66 | 67 | try: 68 | tokenizer.padding_side = "left" # restore padding side 69 | tokenizer.init_kwargs["padding_side"] = "left" 70 | tokenizer.save_pretrained(model_args.export_dir) 71 | if model_args.export_hub_model_id is not None: 72 | tokenizer.push_to_hub(model_args.export_hub_model_id, token=model_args.hf_hub_token) 73 | except Exception: 74 | logger.warning("Cannot save tokenizer, please copy the files manually.") 75 | 76 | 77 | if __name__ == "__main__": 78 | run_exp() 79 | 
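To make the entry point above concrete, here is a minimal, hypothetical way to drive run_exp for the pruning/dropping stage. The argument keys are assumptions drawn from the dataclasses parsed by get_train_sparse_args (ModelArguments, DataArguments, TrainingArguments, FinetuningArguments, PruningArguments); the shell scripts under scripts/dropping/ remain the authoritative reference for the real flags.

# Hypothetical invocation of run_exp() for the "prune" stage (layer/block dropping).
# Key names and values below are illustrative guesses, not a verified configuration.
from llmtuner.compression.tuner import run_exp

run_exp(
    args={
        "stage": "prune",                                  # routed to run_prune() above
        "model_name_or_path": "meta-llama/Llama-2-7b-hf",  # placeholder model id
        "dataset": "c4",                                   # placeholder calibration dataset name
        "output_dir": "./outputs/llama2-7b-dropped",
        "do_train": False,
    }
)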
-------------------------------------------------------------------------------- /src/llmtuner/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .loader import get_dataset 2 | from .template import get_template_and_fix_tokenizer, templates 3 | from .utils import Role, split_dataset 4 | 5 | 6 | __all__ = ["get_dataset", "get_template_and_fix_tokenizer", "templates", "Role", "split_dataset"] 7 | -------------------------------------------------------------------------------- /src/llmtuner/data/test_data.py: -------------------------------------------------------------------------------- 1 | from loader import load_single_dataset 2 | 3 | load_single_dataset(dataset_attr, model_args, data_args) -------------------------------------------------------------------------------- /src/llmtuner/data/utils.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | from enum import Enum, unique 3 | from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union 4 | 5 | from ..extras.logging import get_logger 6 | 7 | 8 | if TYPE_CHECKING: 9 | from datasets import Dataset, IterableDataset 10 | from transformers import TrainingArguments 11 | 12 | from llmtuner.hparams import DataArguments 13 | 14 | 15 | logger = get_logger(__name__) 16 | 17 | 18 | @unique 19 | class Role(str, Enum): 20 | USER = "user" 21 | ASSISTANT = "assistant" 22 | SYSTEM = "system" 23 | FUNCTION = "function" 24 | OBSERVATION = "observation" 25 | 26 | 27 | def checksum(data_files: List[str], file_sha1: Optional[str] = None) -> None: 28 | if file_sha1 is None: 29 | logger.warning("Checksum failed: missing SHA-1 hash value in dataset_info.json.") 30 | return 31 | 32 | if len(data_files) != 1: 33 | logger.warning("Checksum failed: too many files.") 34 | return 35 | 36 | with open(data_files[0], "rb") as f: 37 | sha1 = hashlib.sha1(f.read()).hexdigest() 38 | if sha1 != file_sha1: 39 | logger.warning("Checksum failed: mismatched SHA-1 hash value at {}.".format(data_files[0])) 40 | 41 | 42 | def infer_max_len(source_len: int, target_len: int, max_len: int, reserved_label_len: int) -> Tuple[int, int]: 43 | max_target_len = int(max_len * (target_len / (source_len + target_len))) 44 | max_target_len = max(max_target_len, reserved_label_len) 45 | max_source_len = max_len - max_target_len 46 | return max_source_len, max_target_len 47 | 48 | 49 | def split_dataset( 50 | dataset: Union["Dataset", "IterableDataset"], data_args: "DataArguments", training_args: "TrainingArguments" 51 | ) -> Dict[str, "Dataset"]: 52 | if training_args.do_train: 53 | if data_args.val_size > 1e-6: # Split the dataset 54 | if data_args.streaming: 55 | val_set = dataset.take(int(data_args.val_size)) 56 | train_set = dataset.skip(int(data_args.val_size)) 57 | dataset = dataset.shuffle(buffer_size=data_args.buffer_size, seed=training_args.seed) 58 | return {"train_dataset": train_set, "eval_dataset": val_set} 59 | else: 60 | val_size = int(data_args.val_size) if data_args.val_size > 1 else data_args.val_size 61 | dataset = dataset.train_test_split(test_size=val_size, seed=training_args.seed) 62 | return {"train_dataset": dataset["train"], "eval_dataset": dataset["test"]} 63 | else: 64 | if data_args.streaming: 65 | dataset = dataset.shuffle(buffer_size=data_args.buffer_size, seed=training_args.seed) 66 | return {"train_dataset": dataset} 67 | else: # do_eval or do_predict 68 | return {"eval_dataset": dataset} 69 | 
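A short usage sketch of infer_max_len from utils.py above, showing how the cutoff budget is split: the target share is proportional to its raw length, floored at reserved_label_len, and the source gets the remainder. The import path assumes the package layout shown in this repo.

# Worked example for infer_max_len(): 512 * (100 / 400) = 128 tokens go to the target,
# which is above the reserved floor of 16, leaving 512 - 128 = 384 tokens for the source.
from llmtuner.data.utils import infer_max_len

max_source_len, max_target_len = infer_max_len(
    source_len=300, target_len=100, max_len=512, reserved_label_len=16
)
assert (max_source_len, max_target_len) == (384, 128)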
-------------------------------------------------------------------------------- /src/llmtuner/extras/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/extras/__init__.py -------------------------------------------------------------------------------- /src/llmtuner/extras/logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | 5 | class LoggerHandler(logging.Handler): 6 | r""" 7 | Logger handler used in Web UI. 8 | """ 9 | 10 | def __init__(self): 11 | super().__init__() 12 | self.log = "" 13 | 14 | def reset(self): 15 | self.log = "" 16 | 17 | def emit(self, record): 18 | if record.name == "httpx": 19 | return 20 | log_entry = self.format(record) 21 | self.log += log_entry 22 | self.log += "\n\n" 23 | 24 | 25 | def get_logger(name: str) -> logging.Logger: 26 | r""" 27 | Gets a standard logger with a stream hander to stdout. 28 | """ 29 | formatter = logging.Formatter( 30 | fmt="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S" 31 | ) 32 | handler = logging.StreamHandler(sys.stdout) 33 | handler.setFormatter(formatter) 34 | 35 | logger = logging.getLogger(name) 36 | logger.setLevel(logging.INFO) 37 | logger.addHandler(handler) 38 | 39 | return logger 40 | 41 | 42 | def reset_logging() -> None: 43 | r""" 44 | Removes basic config of root logger. (unused in script) 45 | """ 46 | root = logging.getLogger() 47 | list(map(root.removeHandler, root.handlers)) 48 | list(map(root.removeFilter, root.filters)) 49 | -------------------------------------------------------------------------------- /src/llmtuner/extras/packages.py: -------------------------------------------------------------------------------- 1 | import importlib.metadata 2 | import importlib.util 3 | 4 | 5 | def _is_package_available(name: str) -> bool: 6 | return importlib.util.find_spec(name) is not None 7 | 8 | 9 | def _get_package_version(name: str) -> str: 10 | try: 11 | return importlib.metadata.version(name) 12 | except Exception: 13 | return "0.0.0" 14 | 15 | 16 | def is_fastapi_availble(): 17 | return _is_package_available("fastapi") 18 | 19 | 20 | def is_flash_attn2_available(): 21 | return _is_package_available("flash_attn") and _get_package_version("flash_attn").startswith("2") 22 | 23 | 24 | def is_jieba_available(): 25 | return _is_package_available("jieba") 26 | 27 | 28 | def is_matplotlib_available(): 29 | return _is_package_available("matplotlib") 30 | 31 | 32 | def is_nltk_available(): 33 | return _is_package_available("nltk") 34 | 35 | 36 | def is_requests_available(): 37 | return _is_package_available("requests") 38 | 39 | 40 | def is_rouge_available(): 41 | return _is_package_available("rouge_chinese") 42 | 43 | 44 | def is_starlette_available(): 45 | return _is_package_available("sse_starlette") 46 | 47 | 48 | def is_unsloth_available(): 49 | return _is_package_available("unsloth") 50 | 51 | 52 | def is_uvicorn_available(): 53 | return _is_package_available("uvicorn") 54 | -------------------------------------------------------------------------------- /src/llmtuner/extras/patches/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/extras/patches/__init__.py 
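Typical usage of the two helper modules above (extras.logging and extras.packages); a minimal sketch, with the warning text being illustrative only.

# Minimal sketch combining get_logger() with one of the availability probes above.
from llmtuner.extras.logging import get_logger
from llmtuner.extras.packages import is_flash_attn2_available

logger = get_logger(__name__)

if not is_flash_attn2_available():
    logger.warning("flash_attn 2.x not detected; the default attention implementation will be used.")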
-------------------------------------------------------------------------------- /src/llmtuner/extras/patches/llama_patch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/extras/patches/llama_patch.py -------------------------------------------------------------------------------- /src/llmtuner/extras/patches/mixtral_patch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/extras/patches/mixtral_patch.py -------------------------------------------------------------------------------- /src/llmtuner/extras/ploting.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | import os 4 | from typing import List, Optional 5 | 6 | from transformers.trainer import TRAINER_STATE_NAME 7 | 8 | from .logging import get_logger 9 | from .packages import is_matplotlib_available 10 | 11 | 12 | if is_matplotlib_available(): 13 | import matplotlib.pyplot as plt 14 | 15 | 16 | logger = get_logger(__name__) 17 | 18 | 19 | def smooth(scalars: List[float]) -> List[float]: 20 | r""" 21 | EMA implementation according to TensorBoard. 22 | """ 23 | last = scalars[0] 24 | smoothed = list() 25 | weight = 1.8 * (1 / (1 + math.exp(-0.05 * len(scalars))) - 0.5) # a sigmoid function 26 | for next_val in scalars: 27 | smoothed_val = last * weight + (1 - weight) * next_val 28 | smoothed.append(smoothed_val) 29 | last = smoothed_val 30 | return smoothed 31 | 32 | 33 | def plot_loss(save_dictionary: os.PathLike, keys: Optional[List[str]] = ["loss"]) -> None: 34 | with open(os.path.join(save_dictionary, TRAINER_STATE_NAME), "r", encoding="utf-8") as f: 35 | data = json.load(f) 36 | 37 | for key in keys: 38 | steps, metrics = [], [] 39 | for i in range(len(data["log_history"])): 40 | if key in data["log_history"][i]: 41 | steps.append(data["log_history"][i]["step"]) 42 | metrics.append(data["log_history"][i][key]) 43 | 44 | if len(metrics) == 0: 45 | logger.warning(f"No metric {key} to plot.") 46 | continue 47 | 48 | plt.figure() 49 | plt.plot(steps, metrics, alpha=0.4, label="original") 50 | plt.plot(steps, smooth(metrics), label="smoothed") 51 | plt.title("training {} of {}".format(key, save_dictionary)) 52 | plt.xlabel("step") 53 | plt.ylabel(key) 54 | plt.legend() 55 | plt.savefig(os.path.join(save_dictionary, "training_{}.png".format(key)), format="png", dpi=100) 56 | print("Figure saved:", os.path.join(save_dictionary, "training_{}.png".format(key))) 57 | -------------------------------------------------------------------------------- /src/llmtuner/hparams/__init__.py: -------------------------------------------------------------------------------- 1 | from .data_args import DataArguments 2 | from .evaluation_args import EvaluationArguments 3 | from .finetuning_args import FinetuningArguments 4 | from .generating_args import GeneratingArguments 5 | from .model_args import ModelArguments 6 | from .pruning_args import PruningArguments 7 | 8 | from .parser import get_eval_args, get_infer_args, get_train_args, get_eval_sparse_args, get_train_sparse_args 9 | 10 | 11 | __all__ = [ 12 | "DataArguments", 13 | "EvaluationArguments", 14 | "FinetuningArguments", 15 | "GeneratingArguments", 16 | "ModelArguments", 17 | "PruningArguments", 18 | "get_eval_args", 19 | "get_infer_args", 20 | 
"get_train_args", 21 | "get_train_sparse_args" 22 | "get_eval_sparse_args", 23 | ] 24 | -------------------------------------------------------------------------------- /src/llmtuner/hparams/evaluation_args.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass, field 3 | from typing import Literal, Optional 4 | 5 | from datasets import DownloadMode 6 | 7 | 8 | @dataclass 9 | class EvaluationArguments: 10 | r""" 11 | Arguments pertaining to specify the evaluation parameters. 12 | """ 13 | 14 | task: str = field( 15 | metadata={"help": "Name of the evaluation task."}, 16 | ) 17 | task_dir: Optional[str] = field( 18 | default="evaluation", 19 | metadata={"help": "Path to the folder containing the evaluation datasets."}, 20 | ) 21 | batch_size: Optional[int] = field( 22 | default=4, 23 | metadata={"help": "The batch size per GPU for evaluation."}, 24 | ) 25 | seed: Optional[int] = field( 26 | default=42, 27 | metadata={"help": "Random seed to be used with data loaders."}, 28 | ) 29 | lang: Optional[Literal["en", "zh"]] = field( 30 | default="en", 31 | metadata={"help": "Language used at evaluation."}, 32 | ) 33 | n_shot: Optional[int] = field( 34 | default=5, 35 | metadata={"help": "Number of examplars for few-shot learning."}, 36 | ) 37 | save_dir: Optional[str] = field( 38 | default=None, 39 | metadata={"help": "Path to save the evaluation results."}, 40 | ) 41 | download_mode: Optional[DownloadMode] = field( 42 | default=DownloadMode.REUSE_DATASET_IF_EXISTS, 43 | metadata={"help": "Download mode used for the evaluation datasets."}, 44 | ) 45 | 46 | def __post_init__(self): 47 | if self.save_dir is not None and os.path.exists(self.save_dir): 48 | raise ValueError("`save_dir` already exists, use another one.") 49 | -------------------------------------------------------------------------------- /src/llmtuner/hparams/generating_args.py: -------------------------------------------------------------------------------- 1 | from dataclasses import asdict, dataclass, field 2 | from typing import Any, Dict, Optional 3 | 4 | 5 | @dataclass 6 | class GeneratingArguments: 7 | r""" 8 | Arguments pertaining to specify the decoding parameters. 9 | """ 10 | 11 | do_sample: Optional[bool] = field( 12 | default=True, 13 | metadata={"help": "Whether or not to use sampling, use greedy decoding otherwise."}, 14 | ) 15 | temperature: Optional[float] = field( 16 | default=0.95, 17 | metadata={"help": "The value used to modulate the next token probabilities."}, 18 | ) 19 | top_p: Optional[float] = field( 20 | default=0.7, 21 | metadata={ 22 | "help": "The smallest set of most probable tokens with probabilities that add up to top_p or higher are kept." 23 | }, 24 | ) 25 | top_k: Optional[int] = field( 26 | default=50, 27 | metadata={"help": "The number of highest probability vocabulary tokens to keep for top-k filtering."}, 28 | ) 29 | num_beams: Optional[int] = field( 30 | default=1, 31 | metadata={"help": "Number of beams for beam search. 1 means no beam search."}, 32 | ) 33 | max_length: Optional[int] = field( 34 | default=512, 35 | metadata={"help": "The maximum length the generated tokens can have. 
It can be overridden by max_new_tokens."}, 36 | ) 37 | max_new_tokens: Optional[int] = field( 38 | default=512, 39 | metadata={"help": "The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt."}, 40 | ) 41 | repetition_penalty: Optional[float] = field( 42 | default=1.0, 43 | metadata={"help": "The parameter for repetition penalty. 1.0 means no penalty."}, 44 | ) 45 | length_penalty: Optional[float] = field( 46 | default=1.0, 47 | metadata={"help": "Exponential penalty to the length that is used with beam-based generation."}, 48 | ) 49 | 50 | def to_dict(self) -> Dict[str, Any]: 51 | args = asdict(self) 52 | if args.get("max_new_tokens", -1) > 0: 53 | args.pop("max_length", None) 54 | else: 55 | args.pop("max_new_tokens", None) 56 | return args 57 | -------------------------------------------------------------------------------- /src/llmtuner/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .loader import load_model_and_tokenizer, load_tokenizer 2 | from .utils import dispatch_model, load_valuehead_params 3 | 4 | 5 | __all__ = ["load_model_and_tokenizer", "load_tokenizer", "dispatch_model", "load_valuehead_params"] 6 | --------------------------------------------------------------------------------
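Closing note on the AutoGPTQ modeling files earlier in this dump (deepseek.py, gemma.py, llama.py, mistral.py, and the rest): they all follow one pattern, so supporting a new decoder-only architecture is mostly a matter of filling in four class attributes. A hypothetical sketch follows, where "MyModel" and its module names are placeholders; the new class typically also has to be registered in modeling/auto.py, whose contents are not shown here.

# Hypothetical template following the pattern of llama.py / mistral.py above.
# "MyModel" and the module names are placeholders, not a real architecture.
from ._base import BaseGPTQForCausalLM


class MyModelGPTQForCausalLM(BaseGPTQForCausalLM):
    layer_type = "MyModelDecoderLayer"      # class name of a single decoder block
    layers_block_name = "model.layers"      # attribute path to the stack of decoder blocks
    outside_layer_modules = ["model.embed_tokens", "model.norm"]  # modules outside the block stack
    inside_layer_modules = [                # linear groups inside each block, quantized in this order
        ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
        ["self_attn.o_proj"],
        ["mlp.up_proj", "mlp.gate_proj"],
        ["mlp.down_proj"],
    ]


__all__ = ["MyModelGPTQForCausalLM"]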