├── .gitattributes ├── .gitignore ├── BootstrapNAS ├── README.md ├── architecture.png ├── examples │ ├── .gitignore │ ├── BootstrapNAS.ipynb │ ├── README.md │ ├── bootstrapnas_utils.py │ ├── imgs │ │ ├── architecture.png │ │ └── search_progression.png │ ├── imports_bnas.py │ ├── third_party_search │ │ ├── dynas-t_external_search_resnet50_supernet.ipynb │ │ ├── dynast_bootstrapnas_resnet50_cifar10_example.png │ │ └── sigopt_external_search_resnet50_supernet.ipynb │ └── yolox-nano │ │ ├── README.md │ │ ├── class_list.txt │ │ ├── nohup_wo_ignored_scope.out │ │ ├── search_progression.png │ │ └── yolox-bootstrapnas.patch ├── instructions │ ├── Configuration.md │ ├── Home.md │ ├── Quickstart.md │ └── Subnetwork_Search.md ├── models │ ├── pretrained │ │ └── resnet50.pt │ └── supernets │ │ └── cifar10 │ │ └── resnet50 │ │ ├── config.json │ │ ├── elasticity.pth │ │ ├── results.md │ │ ├── search_progression.png │ │ └── supernet_weights.pth └── requirements.txt ├── EFTNAS ├── README.md ├── eftnas_configs │ ├── nncf_base_config_for_bert_base.json │ ├── nncf_eftnas_s1_bert_base_cola.json │ ├── nncf_eftnas_s1_bert_base_mnli.json │ ├── nncf_eftnas_s1_bert_base_mrpc.json │ ├── nncf_eftnas_s1_bert_base_qnli.json │ ├── nncf_eftnas_s1_bert_base_qqp.json │ ├── nncf_eftnas_s1_bert_base_rte.json │ ├── nncf_eftnas_s1_bert_base_squadv1.json │ ├── nncf_eftnas_s1_bert_base_squadv2.json │ ├── nncf_eftnas_s1_bert_base_sst2.json │ ├── nncf_eftnas_s2_bert_medium_cola.json │ ├── nncf_eftnas_s2_bert_medium_mnli.json │ ├── nncf_eftnas_s2_bert_medium_mrpc.json │ ├── nncf_eftnas_s2_bert_medium_qnli.json │ ├── nncf_eftnas_s2_bert_medium_qqp.json │ ├── nncf_eftnas_s2_bert_medium_rte.json │ ├── nncf_eftnas_s2_bert_medium_squadv1.json │ ├── nncf_eftnas_s2_bert_medium_squadv2.json │ └── nncf_eftnas_s2_bert_medium_sst2.json ├── eftnas_search_space │ └── generate_eftnas_search_space.py ├── figures │ └── eftnas_pipeline.png ├── install.sh ├── patches │ ├── nncf.patch │ └── transformers.patch └── running_commands │ ├── cola.sh │ ├── mnli.sh │ ├── mrpc.sh │ ├── qnli.sh │ ├── qqp.sh │ ├── rte.sh │ ├── squadv1.sh │ ├── squadv2.sh │ └── sst2.sh ├── EZNAS ├── README.md ├── dataset_utils.py ├── evol_config.yaml ├── evol_utilities.py ├── gp_func_defs.py ├── nasspace.py ├── novel_search.py ├── reproduce.sh ├── reproduce │ └── all_tests.csv ├── runjob.sh ├── setup_script.sh └── verify_scores.py ├── LICENSE ├── LoNAS ├── README.md ├── install.sh ├── nncf_config │ ├── glue │ │ ├── nncf_lonas_bert_base_cola.json │ │ ├── nncf_lonas_bert_base_mnli.json │ │ ├── nncf_lonas_bert_base_mrpc.json │ │ ├── nncf_lonas_bert_base_qnli.json │ │ ├── nncf_lonas_bert_base_qqp.json │ │ ├── nncf_lonas_bert_base_rte.json │ │ ├── nncf_lonas_bert_base_sst2.json │ │ └── nncf_lonas_bert_base_stsb.json │ ├── unified_commonsense │ │ └── nncf_lonas_llama_7b.json │ └── unified_math │ │ ├── nncf_lonas_bloomz_7b.json │ │ ├── nncf_lonas_llama_13b.json │ │ └── nncf_lonas_llama_7b.json ├── patches │ ├── nncf-544d5141.patch │ ├── peft-v0.5.0.patch │ └── transformers-v4.31.0.patch ├── run_commonsense.py ├── run_glue.py ├── run_math.py └── running_commands ├── Mamba-Shedder ├── README.md ├── eval.py ├── extract │ ├── README.md │ └── extract_mamba.py ├── hybrid │ ├── Hymba-Pruning │ │ ├── README.md │ │ ├── eval.py │ │ ├── extract │ │ │ ├── README.md │ │ │ └── extract_hymba.py │ │ ├── patches │ │ │ ├── hymba-e1b7ee9.patch │ │ │ └── transformers-v4.47.0.patch │ │ ├── prune.py │ │ ├── recovery │ │ │ ├── README.md │ │ │ ├── finetune_hymba.py │ │ │ └── merge.py │ │ └── results │ │ │ 
├── README.md │ │ │ └── hymba-1.5b-base │ │ │ ├── eval.res.config.hymba_block.5.json │ │ │ ├── eval.res.config.hymba_block.6.json │ │ │ ├── eval.res.config.hymba_block.7.json │ │ │ └── pruning_config.json │ └── Zamba2-Pruning │ │ ├── README.md │ │ ├── eval.py │ │ ├── extract │ │ └── extract_zamba2.py │ │ ├── install.sh │ │ ├── patches │ │ └── zamba2-7593823.patch │ │ ├── preprocess.py │ │ ├── prune.py │ │ ├── prune_hybrid.py │ │ ├── recovery │ │ └── finetune_zamba2.py │ │ └── results │ │ ├── README.md │ │ └── zamba2-2.7b │ │ ├── ratio_10 │ │ ├── eval.res.config.ssm.40.json │ │ └── pruning_config.json │ │ └── ratio_15 │ │ ├── eval.res.config.ssm.45.json │ │ └── pruning_config.json ├── install.sh ├── patches │ └── mamba-62db608.patch ├── prune.py ├── recovery │ └── finetune_mamba.py ├── results │ ├── README.md │ ├── mamba-2.8b │ │ ├── eval.res.config.mamba_block.13.json │ │ ├── eval.res.config.mamba_block.6.json │ │ └── pruning_config.json │ └── mamba2-2.7b │ │ ├── eval.res.config.ssm.15.json │ │ ├── eval.res.config.ssm.19.json │ │ ├── eval.res.config.ssm.21.json │ │ ├── eval.res.config.ssm.23.json │ │ └── pruning_config.json └── utils.py ├── MultiPruner ├── README.md ├── eval.py ├── extract │ ├── README.md │ └── extract_model.py ├── install.sh ├── patches │ └── transformers-v4.45.0.patch ├── recovery │ ├── README.md │ ├── finetune.py │ └── merge.py ├── requirements.txt ├── results │ ├── Baichuan2-13B-Base │ │ └── ratio_24 │ │ │ ├── eval.res.json │ │ │ └── pruning_config.json │ ├── Baichuan2-7B-Base │ │ └── ratio_22 │ │ │ ├── eval.res.json │ │ │ └── pruning_config.json │ ├── Llama-2-13B │ │ └── ratio_25 │ │ │ ├── eval.res.json │ │ │ └── pruning_config.json │ ├── Llama-2-7B │ │ ├── ratio_10 │ │ │ ├── eval.res.json │ │ │ └── pruning_config.json │ │ ├── ratio_12 │ │ │ ├── eval.res.json │ │ │ └── pruning_config.json │ │ ├── ratio_14 │ │ │ ├── eval.res.json │ │ │ └── pruning_config.json │ │ ├── ratio_15 │ │ │ ├── eval.res.json │ │ │ └── pruning_config.json │ │ ├── ratio_18 │ │ │ ├── eval.res.json │ │ │ └── pruning_config.json │ │ ├── ratio_22 │ │ │ ├── eval.res.json │ │ │ └── pruning_config.json │ │ └── ratio_7 │ │ │ ├── eval.res.json │ │ │ └── pruning_config.json │ ├── Llama-3.1-8B │ │ ├── ratio_10 │ │ │ ├── eval.res.json │ │ │ └── pruning_config.json │ │ ├── ratio_17 │ │ │ ├── eval.res.json │ │ │ └── pruning_config.json │ │ └── ratio_20 │ │ │ ├── eval.res.json │ │ │ └── pruning_config.json │ ├── Llama-3.2-3B │ │ └── ratio_9 │ │ │ ├── eval.res.json │ │ │ └── pruning_config.json │ ├── Meta-Llama-3-8B │ │ ├── ratio_10 │ │ │ ├── eval.res.json │ │ │ └── pruning_config.json │ │ ├── ratio_17 │ │ │ ├── eval.res.json │ │ │ └── pruning_config.json │ │ └── ratio_20 │ │ │ ├── eval.res.json │ │ │ └── pruning_config.json │ ├── Qwen1.5-14B │ │ └── ratio_24 │ │ │ ├── eval.res.json │ │ │ └── pruning_config.json │ ├── Qwen1.5-7B │ │ └── ratio_22 │ │ │ ├── eval.res.json │ │ │ └── pruning_config.json │ ├── Qwen2.5-7B │ │ ├── ratio_10 │ │ │ ├── eval.res.json │ │ │ └── pruning_config.json │ │ └── ratio_20 │ │ │ ├── eval.res.json │ │ │ └── pruning_config.json │ └── README.md ├── run_multipruner.py └── utils.py ├── README.md ├── SQFT ├── README.md ├── install.sh ├── legacy │ ├── README.md │ ├── eval │ │ └── evaluate_math.py │ ├── install.sh │ ├── install_inference.sh │ ├── modules │ │ └── sqft_linear.py │ ├── notebooks │ │ ├── sqft_lora.ipynb │ │ ├── sqft_nls.ipynb │ │ ├── sqft_qa_sparsepeft_lora.ipynb │ │ ├── sqft_qa_sparsepeft_nls.ipynb │ │ ├── sqft_sparsepeft_lora.ipynb │ │ └── sqft_sparsepeft_nls.ipynb │ ├── opea 
│ │ ├── Dockerfile │ │ ├── README.md │ │ ├── dataset │ │ │ ├── arce_train_instruct.json │ │ │ └── preprocess_arc.py │ │ ├── example_nncf_config │ │ │ └── nncf_config.json │ │ └── search.py │ ├── patches │ │ ├── nncf-f143e1c.patch │ │ ├── peft-v0.10.0.patch │ │ ├── transformers-v4.44.2.patch │ │ └── wanda-8e8fc87.patch │ ├── run_command │ │ ├── README.md │ │ ├── llama-3-8b │ │ │ ├── run.sh │ │ │ └── sparse_quantization.sh │ │ ├── mistral-7b-v0.3 │ │ │ ├── run.sh │ │ │ └── sparse_quantization.sh │ │ └── phi-3-mini-4k-instruct │ │ │ ├── run.sh │ │ │ └── sparse_quantization.sh │ ├── run_instruction_tuning.py │ ├── run_standard_tuning.py │ └── utils │ │ ├── check_sparsity.py │ │ ├── create_sqft_nncf_config.py │ │ ├── extract_sub_adapter.py │ │ ├── load_dataset.py │ │ ├── merge.py │ │ └── quantization.py ├── modules │ ├── elastic_lora_linear.py │ └── sqft_qa_linear.py ├── patches │ ├── peft-v0.10.0.patch │ └── wanda-8e8fc87.patch ├── run_sqft.py └── utils │ ├── check_sparsity.py │ ├── extract_sub_adapter.py │ ├── load_dataset.py │ ├── merge.py │ └── quantization.py ├── Shears ├── README.md ├── example_commonsense.py ├── example_math.py ├── install.sh ├── nncf_config │ ├── nncf_config.md │ ├── nncf_shears_llama.json │ ├── nncf_shears_llama_with_gate_proj.json │ └── nncf_shears_mpt.json ├── patches │ ├── nncf-544d5141.patch │ ├── peft-v0.5.0-inference.patch │ ├── peft-v0.5.0.patch │ └── transformers-v4.31.0.patch ├── preprocess │ └── mpt_process │ │ ├── mpt-7b-modifications-for-shears-usage.patch │ │ ├── split_qkv_preprocess.py │ │ └── wanda │ │ ├── main_mpt.py │ │ └── prune_mpt.py ├── run_commonsense.py ├── run_gsm8k.py ├── run_math.py ├── running_commands ├── search │ ├── load_and_explore_supernet.ipynb │ └── supernet.py └── utils │ └── utils.py ├── SparAMX ├── .gitignore ├── README.md ├── Videos │ ├── sparamx.gif │ └── stock.gif ├── benchmark_deepsparse.sh ├── compare_stock_vs_custom_linear.py ├── compare_stock_vs_onednn_linear.py ├── compare_stock_vs_sparse_linear.py ├── csrc │ ├── avx_sparse_linear.cpp │ ├── dense_linear.cpp │ ├── example_utils.hpp │ ├── onednn_linear.cpp │ ├── quantized_dense_linear.cpp │ ├── quantized_sparse_linear.cpp │ ├── sparse_linear.cpp │ └── sparse_linear_temp.cpp ├── custom_llama_attention.py ├── deepsparse_optimized_llama2.py ├── generate_attention_experiments.py ├── generate_experiments.py ├── layer │ ├── avx_sparse_linear.py │ ├── dense_linear.py │ ├── onednn_linear.py │ ├── quantized_dense_linear.py │ ├── quantized_sparse_linear.py │ └── sparse_linear.py ├── llm_pipeline.py ├── openvino │ └── run_llama_2_8b.py ├── requirements.txt ├── run_experiments.sh ├── run_experiments_attention.sh ├── setup.py ├── test_avx_sparse_attention.py ├── test_avx_sparse_layer.py ├── test_dense_attention.py ├── test_dense_layer.py ├── test_onednn_layer.py ├── test_quantized_dense_layer.py ├── test_quantized_sparse_layer.py ├── test_sparse_attention.py └── test_sparse_layer.py └── security.md /.gitattributes: -------------------------------------------------------------------------------- 1 | *.pt filter=lfs diff=lfs merge=lfs -text 2 | *.pth filter=lfs diff=lfs merge=lfs -text 3 | *.bin filter=lfs diff=lfs merge=lfs -text 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | 
develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # Mac 132 | .DS_Store 133 | 134 | # Pycharm 135 | .idea -------------------------------------------------------------------------------- /BootstrapNAS/README.md: -------------------------------------------------------------------------------- 1 | # BootstrapNAS Jupyter Notebooks 2 | 3 | --- 4 | 5 |
![BootstrapNAS Architecture](architecture.png)
8 | 9 | BootstrapNAS (1) takes a pre-trained model as input. (2) It uses this model to generate a weight-sharing super-network. (3) BootstrapNAS then applies a training strategy, and once the super-network has been trained, (4) it searches for efficient subnetworks that satisfy the user's requirements. (5) The configuration of the discovered sub-network(s) is returned to the user. 10 | 11 | ## Quickstart 12 | 13 | Please follow the instructions [here](https://github.com/jpablomch/bootstrapnas/wiki/Quickstart). 14 | 15 | If you already have a super-network trained with BootstrapNAS, please follow the instructions to search for sub-networks [here](https://github.com/jpablomch/bootstrapnas/wiki/Subnetwork_Search). 16 | 17 | More information about BootstrapNAS is available in our papers: 18 | 19 | [Automated Super-Network Generation for Scalable Neural Architecture Search](https://openreview.net/pdf?id=HK-zmbTB8gq). 20 | 21 | ```bibtex 22 | @inproceedings{ 23 | munoz2022automated, 24 | title={Automated Super-Network Generation for Scalable Neural Architecture Search}, 25 | author={Muñoz, J. Pablo and Lyalyushkin, Nikolay and Lacewell, Chaunte and Senina, Anastasia and Cummings, Daniel and Sarah, Anthony and Kozlov, Alexander and Jain, Nilesh}, 26 | booktitle={First Conference on Automated Machine Learning (Main Track)}, 27 | year={2022}, 28 | url={https://openreview.net/pdf?id=HK-zmbTB8gq} 29 | } 30 | ``` 31 | [Enabling NAS with Automated Super-Network Generation](https://arxiv.org/abs/2112.10878) 32 | 33 | ```BibTex 34 | @article{ 35 | bootstrapNAS, 36 | author = {Muñoz, J. Pablo and Lyalyushkin, Nikolay and Akhauri, Yash and Senina, Anastasia and Kozlov, Alexander and Jain, Nilesh}, 37 | title = {Enabling NAS with Automated Super-Network Generation}, 38 | journal = {1st International Workshop on Practical 39 | Deep Learning in the Wild at AAAI}, 40 | year = {2022}, 41 | url = {https://practical-dl.github.io/2022/short_paper/21.pdf}, 42 | } 43 | ``` 44 | 45 | ## Contributing to BootstrapNAS 46 | Please follow the contribution guidelines in [NNCF](https://github.com/openvinotoolkit/nncf). 47 | 48 | -------------------------------------------------------------------------------- /BootstrapNAS/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/7549413d38677dd6eb92f918f7cc003dc65d1deb/BootstrapNAS/architecture.png -------------------------------------------------------------------------------- /BootstrapNAS/examples/.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | model 3 | output 4 | .DS_Store -------------------------------------------------------------------------------- /BootstrapNAS/examples/README.md: -------------------------------------------------------------------------------- 1 |
# Automated Neural Architecture Search with BootstrapNAS
4 | 5 | This notebook demonstrates how to use [BootstrapNAS](https://arxiv.org/abs/2112.10878), a capability in NNCF to generate weight-sharing super-networks from pre-trained models. Once the super-network has been generated, BootstrapNAS can train it and search for efficient sub-networks. 6 | 7 | ## Examples for using third-party solutions to search for subnetworks. 8 | 9 | [SigOpt](third_party_search/sigopt_external_search_resnet50_supernet.ipynb) 10 | [DyNAS-T](third_party_search/dynas-t_external_search_resnet50_supernet.ipynb) -------------------------------------------------------------------------------- /BootstrapNAS/examples/imgs/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/7549413d38677dd6eb92f918f7cc003dc65d1deb/BootstrapNAS/examples/imgs/architecture.png -------------------------------------------------------------------------------- /BootstrapNAS/examples/imgs/search_progression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/7549413d38677dd6eb92f918f7cc003dc65d1deb/BootstrapNAS/examples/imgs/search_progression.png -------------------------------------------------------------------------------- /BootstrapNAS/examples/imports_bnas.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Intel Corporation 2 | # SPDX-License-Identifier: MIT 3 | 4 | import sys 5 | import time 6 | import zipfile 7 | from pathlib import Path 8 | import logging 9 | import warnings # to disable warnings on export to ONNX 10 | warnings.filterwarnings("ignore") 11 | warnings.simplefilter('ignore') 12 | 13 | import torch 14 | import torch.nn as nn 15 | import torch.nn.parallel 16 | import torch.optim 17 | import torch.utils.data 18 | import torch.utils.data.distributed 19 | import torchvision.datasets as datasets 20 | import torchvision.models as models 21 | import torchvision.transforms as transforms 22 | 23 | import nncf # Important - should be imported directly after torch 24 | from nncf.common.utils.logger import set_log_level 25 | set_log_level(logging.ERROR) # Disables all NNCF info and warning messages 26 | from nncf import NNCFConfig 27 | from nncf.config.structures import BNAdaptationInitArgs 28 | from nncf.experimental.torch.nas.bootstrapNAS import EpochBasedTrainingAlgorithm 29 | from nncf.experimental.torch.nas.bootstrapNAS import SearchAlgorithm 30 | from nncf.torch import create_compressed_model, register_default_init_args 31 | from nncf.torch.initialization import wrap_dataloader_for_init 32 | from nncf.torch.model_creation import create_nncf_network 33 | 34 | from bootstrapnas_utils import resnet50_cifar10, validate, train_epoch, create_folders_demo, create_cifar10_dataloader, download_file 35 | 36 | torch.manual_seed(0) 37 | 38 | print("Imported PyTorch and NNCF") -------------------------------------------------------------------------------- /BootstrapNAS/examples/third_party_search/dynast_bootstrapnas_resnet50_cifar10_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/7549413d38677dd6eb92f918f7cc003dc65d1deb/BootstrapNAS/examples/third_party_search/dynast_bootstrapnas_resnet50_cifar10_example.png 
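The weight-sharing super-networks demonstrated in these examples can be illustrated with a toy elastic layer. The sketch below is purely illustrative (it is not NNCF's implementation): every candidate sub-network is a slice of one shared weight tensor, which is why sub-networks can be activated and evaluated without training each one from scratch.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class ToyElasticLinear(nn.Module):
    """Toy elastic layer: sub-networks reuse slices of one shared weight tensor."""

    def __init__(self, in_features: int, max_out_features: int):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(max_out_features, in_features))
        self.bias = nn.Parameter(torch.zeros(max_out_features))
        self.active_out = max_out_features  # shrink this to activate a narrower sub-network

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Slice the shared parameters instead of storing separate weights per candidate.
        return F.linear(x, self.weight[: self.active_out], self.bias[: self.active_out])

layer = ToyElasticLinear(64, 128)
layer.active_out = 96                    # same weights, narrower sub-network
print(layer(torch.randn(4, 64)).shape)   # torch.Size([4, 96])
```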
-------------------------------------------------------------------------------- /BootstrapNAS/examples/yolox-nano/README.md: -------------------------------------------------------------------------------- 1 | # YoloX-NAS 2 | 3 | ---- 4 | ### Prepare Dataset 5 | ``` 6 | cd /data/dataset/ 7 | wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar 8 | tar xf VOCtrainval_06-Nov-2007.tar 9 | mkdir voc2007_coco 10 | cd voc2007_coco 11 | 12 | wget https://raw.githubusercontent.com/yukkyo/voc2coco/master/voc2coco.py 13 | mkdir annotations 14 | ln -s ../VOCdevkit/VOC2007/Annotations . 15 | ln -s ../VOCdevkit/VOC2007/ImageSets . 16 | ``` 17 | 18 | Add `class_list.txt` to this directory. 19 | 20 | ``` 21 | python3 voc2coco.py --ann_dir Annotations/ --ann_ids ImageSets/Main/train.txt --labels class_list.txt --output annotations/instances_train.json --ext xml --extract_num_from_imgid 22 | python3 voc2coco.py --ann_dir Annotations/ --ann_ids ImageSets/Main/val.txt --labels class_list.txt --output annotations/instances_val.json --ext xml --extract_num_from_imgid 23 | 24 | ln -s ../VOCdevkit/VOC2007/JPEGImages/ train2017 25 | ln -s ../VOCdevkit/VOC2007/JPEGImages/ val2017 26 | 27 | # Change to the working directory. 28 | 29 | git clone https://github.com/Megvii-BaseDetection/YOLOX.git && cd YOLOX 30 | git checkout -b bootstrapnas bb9185c095dfd7a8015a1b82f3e9a065090860b8 31 | git apply < /path/to/yolox-bootstrapnas.patch 32 | 33 | cd datasets && ln -s ../../VOCdevkit/VOC2007 && cd - 34 | cd datasets && ln -s /data/dataset/voc2007_coco/ VOC2007 && cd - 35 | 36 | poetry config --local virtualenvs.in-project true 37 | poetry install 38 | poetry shell 39 | 40 | 41 | # Train without BootstrapNAS to get the pretrained weights 42 | wget https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_nano.pth 43 | C=yolox_nano.pth 44 | PYTHONPATH=. nohup python tools/train.py -f exps/default/yolox_nano_voc-e50.py -d 1 -b 6 -o -c $C --cache 2>&1 45 | 46 | # Train with BootstrapNAS, starting from the pretrained weights 47 | C=YOLOX_outputs/yolox_nano_voc-e50/best_ckpt.pth 48 | PYTHONPATH=. nohup python tools/train.py -f exps/default/first_try.py -d 1 -b 6 -o -c $C --cache --nncf_config_path nncf_config_yolox_bootstrapNAS.json 2>&1 49 | ``` 50 | -------------------------------------------------------------------------------- /BootstrapNAS/examples/yolox-nano/class_list.txt: -------------------------------------------------------------------------------- 1 | aeroplane 2 | bicycle 3 | bird 4 | boat 5 | bottle 6 | bus 7 | car 8 | cat 9 | chair 10 | cow 11 | diningtable 12 | dog 13 | horse 14 | motorbike 15 | person 16 | pottedplant 17 | sheep 18 | sofa 19 | train 20 | tvmonitor -------------------------------------------------------------------------------- /BootstrapNAS/examples/yolox-nano/search_progression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/7549413d38677dd6eb92f918f7cc003dc65d1deb/BootstrapNAS/examples/yolox-nano/search_progression.png -------------------------------------------------------------------------------- /BootstrapNAS/instructions/Home.md: -------------------------------------------------------------------------------- 1 | ### BootstrapNAS: Automated Super-Network Generation for Scalable Neural Architecture Search 2 | 3 | ### [Quickstart](https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/tree/main/BootstrapNAS/instructions/Quickstart.md) 4 | 5 | ### [Sub-network Search](https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/tree/main/BootstrapNAS/instructions/Subnetwork_Search.md) 6 | 7 | ### [Configuration](https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/tree/main/BootstrapNAS/instructions/Configuration.md) 8 | 9 | -------------------------------------------------------------------------------- /BootstrapNAS/instructions/Subnetwork_Search.md: -------------------------------------------------------------------------------- 1 | ### Search on an existing super-network 2 | 3 | If you have a trained super-network, you can start the search stage directly using the `bootstrap_nas_search.py` script located [here](https://github.com/openvinotoolkit/nncf/blob/develop/examples/experimental/torch/classification/bootstrap_nas_search.py). 4 | 5 | You must pass the path where the weights and elasticity information have been stored, which is your log directory by default. 6 | 7 | ```shell 8 | python bootstrap_nas_search.py -m 9 | train 10 | --config <path to the config.json used when training the super-network> 11 | --log-dir <path to the log directory for the search stage> 12 | --dataset <dataset name, e.g., cifar10> 13 | --data <path to the dataset> 14 | --elasticity-state-path <path to the super-network's elasticity.pth> 15 | --supernet-weights <path to supernet_weights.pth> 16 | --search-mode 17 | ``` 18 | 19 | #### Hardware-aware search 20 | 21 | BootstrapNAS can be made hardware-aware when searching for efficient sub-networks. To accomplish this, you can pass your own `efficiency evaluator` for the target hardware to the search component.
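A minimal sketch of such an evaluator, assuming the search component accepts any callable that maps a candidate sub-network to a scalar cost (the exact interface expected by your NNCF version may differ): here the cost is the median forward-pass latency measured on the target hardware.

```python
import time
import torch

def latency_ms(model: torch.nn.Module, input_shape=(1, 3, 32, 32),
               warmup: int = 10, iters: int = 50) -> float:
    """Median wall-clock latency of one forward pass, in milliseconds."""
    model.eval()
    sample = torch.randn(*input_shape)
    with torch.no_grad():
        for _ in range(warmup):   # warm up caches/allocators before timing
            model(sample)
        times = []
        for _ in range(iters):
            start = time.perf_counter()
            model(sample)
            times.append((time.perf_counter() - start) * 1e3)
    return sorted(times)[len(times) // 2]
```

Lower values indicate a more efficient sub-network, so the search can trade this score off against validation accuracy.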
-------------------------------------------------------------------------------- /BootstrapNAS/models/pretrained/resnet50.pt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:af4a5794552d80db4554b1f6fe260bdd3357da77b0707d00a39cb206c2811b90 3 | size 94403311 4 | -------------------------------------------------------------------------------- /BootstrapNAS/models/supernets/cifar10/resnet50/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": "resnet50_cifar10", 3 | "num_classes": 10, 4 | "dataset": "cifar10", 5 | "input_info": { 6 | "sample_size": [1, 3, 32, 32] 7 | }, 8 | "batch_size": 64, 9 | "batch_size_val": 2000, 10 | "multiprocessing_distributed": false, 11 | "optimizer": { 12 | "type": "sgd", 13 | "momentum": 0.9, 14 | "nesterov": true, 15 | "weight_decay": 3e-7, 16 | "base_lr": 2.5e-4, 17 | "label_smoothing": 0.1, 18 | "no_decay_keys": "bn#bias" 19 | }, 20 | "bootstrapNAS": { 21 | "training": { 22 | "algorithm": "progressive_shrinking", 23 | "progressivity_of_elasticity": ["depth", "width"], 24 | "batchnorm_adaptation": { 25 | "num_bn_adaptation_samples": 1500 26 | }, 27 | "schedule": { 28 | "list_stage_descriptions": [ 29 | {"train_dims": ["depth", "width"], "epochs": 125, "depth_indicator": 2, "width_indicator": 4, "init_lr": 2.5e-4, "epochs_lr": 125, "reorg_weights": true} 30 | ] 31 | }, 32 | "elasticity": { 33 | "available_elasticity_dims": ["width", "depth"], 34 | "width": { 35 | "max_num_widths": 4, 36 | "min_out_channels": 32, 37 | "width_step": 32, 38 | "width_multipliers": [1, 0.80, 0.60, 0.50] 39 | }, 40 | "depth": { 41 | "mode": "manual", 42 | "skipped_blocks": [ 43 | ["ResNet/Sequential[layer1]/Bottleneck[1]/ReLU[relu]/relu__2", "ResNet/Sequential[layer1]/Bottleneck[2]/ReLU[relu]/relu__2"], 44 | ["ResNet/Sequential[layer2]/Bottleneck[1]/ReLU[relu]/relu__2", "ResNet/Sequential[layer2]/Bottleneck[2]/ReLU[relu]/relu__2"], 45 | ["ResNet/Sequential[layer2]/Bottleneck[2]/ReLU[relu]/relu__2", "ResNet/Sequential[layer2]/Bottleneck[3]/ReLU[relu]/relu__2"], 46 | ["ResNet/Sequential[layer3]/Bottleneck[3]/ReLU[relu]/relu__2", "ResNet/Sequential[layer3]/Bottleneck[4]/ReLU[relu]/relu__2"], 47 | ["ResNet/Sequential[layer3]/Bottleneck[4]/ReLU[relu]/relu__2", "ResNet/Sequential[layer3]/Bottleneck[5]/ReLU[relu]/relu__2"], 48 | ["ResNet/Sequential[layer4]/Bottleneck[1]/ReLU[relu]/relu__2", "ResNet/Sequential[layer4]/Bottleneck[2]/ReLU[relu]/relu__2"] 49 | ] 50 | } 51 | } 52 | }, 53 | "search": { 54 | "algorithm": "NSGA2", 55 | "batchnorm_adaptation": { 56 | "num_bn_adaptation_samples": 6000 57 | }, 58 | "num_evals": 3000, 59 | "population": 50, 60 | "ref_acc": 93.65 61 | } 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /BootstrapNAS/models/supernets/cifar10/resnet50/elasticity.pth: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:106473f6f954a1bd0d93339cc10454e6f9ab9b5aff3b872067414cbdd0d9f08c 3 | size 6645 4 | -------------------------------------------------------------------------------- /BootstrapNAS/models/supernets/cifar10/resnet50/results.md: -------------------------------------------------------------------------------- 1 | # ResNet50-CIFAR10 2 | 3 | ## 1. Current Results 4 | 5 |
6 | 7 | | Architecture | MACs | Acc@1 | 8 | |-|-|-| 9 | [Pretrained](train_results_others/pretrained.pt) | 325.8M | 93.65 | 10 | [SuperNet](supernet_weights.pth) | 325.8M | 94.09 | 11 | Minimum SubNet | 64.1M | 92.61 | 12 | [Best Found SubNet](search_results_others/subnetwork_best.pth) | 68.3M | 92.96 | 13 | 14 |
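MAC counts like those in the table can be estimated by registering forward hooks on every `Conv2d` and `Linear` module and accumulating multiply-accumulate operations during a single forward pass. The sketch below is an illustration; the figures above come from the project's own tooling.

```python
import torch
import torch.nn as nn

def count_macs(model: nn.Module, input_shape=(1, 3, 32, 32)) -> int:
    """Rough MAC count over Conv2d/Linear layers, via forward hooks."""
    macs, hooks = 0, []

    def conv_hook(module, inputs, output):
        nonlocal macs
        kernel_ops = module.in_channels // module.groups  # MACs per output element
        for k in module.kernel_size:
            kernel_ops *= k
        macs += output.numel() * kernel_ops

    def linear_hook(module, inputs, output):
        nonlocal macs
        macs += output.numel() * module.in_features

    for m in model.modules():
        if isinstance(m, nn.Conv2d):
            hooks.append(m.register_forward_hook(conv_hook))
        elif isinstance(m, nn.Linear):
            hooks.append(m.register_forward_hook(linear_hook))
    with torch.no_grad():
        model(torch.randn(*input_shape))
    for h in hooks:
        h.remove()
    return macs

# e.g. count_macs(resnet50_cifar10()) / 1e6 gives the count in millions (M)
```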
15 | 16 | 17 | ## 2. Search Progression 18 |

19 | ![resnet50-cifar10 search progression](search_progression.png)

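The search stage configured for this super-network (`"algorithm": "NSGA2"` in `config.json`) is multi-objective: it keeps sub-networks that are not dominated in the MACs-vs-accuracy plane, which is the front traced out in the plot. A minimal illustration of that selection criterion (illustrative only, not the NSGA-II implementation itself):

```python
def pareto_front(candidates):
    """candidates: (macs, accuracy) pairs; keep the non-dominated ones."""
    front = []
    for macs, acc in candidates:
        dominated = any(
            m2 <= macs and a2 >= acc and (m2, a2) != (macs, acc)
            for m2, a2 in candidates
        )
        if not dominated:
            front.append((macs, acc))
    return sorted(front)

# Using the numbers from the results table (MACs in M, Acc@1):
print(pareto_front([(325.8, 93.65), (325.8, 94.09), (64.1, 92.61), (68.3, 92.96)]))
# -> [(64.1, 92.61), (68.3, 92.96), (325.8, 94.09)]  (the pretrained point is dominated)
```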
-------------------------------------------------------------------------------- /BootstrapNAS/models/supernets/cifar10/resnet50/search_progression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/7549413d38677dd6eb92f918f7cc003dc65d1deb/BootstrapNAS/models/supernets/cifar10/resnet50/search_progression.png -------------------------------------------------------------------------------- /BootstrapNAS/models/supernets/cifar10/resnet50/supernet_weights.pth: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6dbb4aaf604f5a27f6031603e3cc52bca41213dc6839210540f5c26e71d97085 3 | size 94455127 4 | -------------------------------------------------------------------------------- /BootstrapNAS/requirements.txt: -------------------------------------------------------------------------------- 1 | nncf[torch] 2 | ipywidgets 3 | -------------------------------------------------------------------------------- /EFTNAS/figures/eftnas_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/7549413d38677dd6eb92f918f7cc003dc65d1deb/EFTNAS/figures/eftnas_pipeline.png -------------------------------------------------------------------------------- /EFTNAS/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -x 4 | 5 | EFTNAS_PATH=$PWD 6 | 7 | git clone https://github.com/openvinotoolkit/nncf.git nncf_eftnas 8 | cd nncf_eftnas 9 | git checkout 415c3c4d 10 | # Apply Patch 11 | git apply $EFTNAS_PATH/patches/nncf.patch 12 | pip install -e . 13 | cd .. 14 | 15 | 16 | git clone https://github.com/huggingface/transformers.git transformers_eftnas 17 | cd transformers_eftnas 18 | git checkout v4.29.1 19 | git apply $EFTNAS_PATH/patches/transformers.patch 20 | pip install -e . 21 | pip install -r examples/pytorch/text-classification/requirements.txt 22 | cd .. 23 | -------------------------------------------------------------------------------- /EFTNAS/running_commands/cola.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | EFTNAS_PATH=$PWD 4 | DEVICES=0 5 | 6 | # Model: BERT-base 7 | # Dataset: cola 8 | 9 | cd transformers_eftnas 10 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/text-classification/run_glue.py \ 11 | --model_name_or_path bert-base-uncased \ 12 | --task_name cola \ 13 | --nncf_config ${EFTNAS_PATH}/eftnas_configs/nncf_eftnas_s1_bert_base_cola.json \ 14 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-cola/movement_sparsity \ 15 | --do_train \ 16 | --do_eval \ 17 | --max_seq_length 128 \ 18 | --per_device_train_batch_size 32 \ 19 | --per_device_eval_batch_size 128 \ 20 | --learning_rate 2e-5 \ 21 | --num_train_epochs 30 \ 22 | --evaluation_strategy epoch \ 23 | --save_strategy epoch \ 24 | --save_total_limit 1 \ 25 | --seed 42 \ 26 | --fp16 \ 27 | --only_generate_importance_weight True 28 | cd .. 
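# Stage 2 (below): generate the EFTNAS search space from the movement-sparsity
# importance weights produced above; stage 3 then trains the super-network with
# distillation from the task-specific teacher and searches for sub-networks.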
29 | 30 | CUDA_VISIBLE_DEVICES=${DEVICES} python eftnas_search_space/generate_eftnas_search_space.py \ 31 | --source_config eftnas_configs/nncf_eftnas_s1_bert_base_cola.json \ 32 | --model_name_or_path bert-base-uncased \ 33 | --importance_weight_dir trained_models/eftnas-bert-base-cola/movement_sparsity \ 34 | --target_config results/generated_configs/nncf_eftnas_s1_bert_base_cola.json 35 | 36 | cd transformers_eftnas 37 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/text-classification/run_glue.py \ 38 | --model_name_or_path bert-base-uncased \ 39 | --kd_teacher_model ModelTC/bert-base-uncased-cola \ 40 | --reorg_cache_model ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-cola/movement_sparsity/pytorch_model.bin \ 41 | --task_name cola \ 42 | --nncf_config ${EFTNAS_PATH}/results/generated_configs/nncf_eftnas_s1_bert_base_cola.json \ 43 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-cola \ 44 | --do_train \ 45 | --do_eval \ 46 | --do_search \ 47 | --max_seq_length 128 \ 48 | --per_device_train_batch_size 32 \ 49 | --per_device_eval_batch_size 128 \ 50 | --evaluation_strategy epoch \ 51 | --save_strategy epoch \ 52 | --save_total_limit 1 \ 53 | --seed 42 \ 54 | --fp16 55 | -------------------------------------------------------------------------------- /EFTNAS/running_commands/mnli.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | EFTNAS_PATH=$PWD 4 | DEVICES=0 5 | 6 | # Model: BERT-base 7 | # Dataset: mnli 8 | 9 | cd transformers_eftnas 10 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/text-classification/run_glue.py \ 11 | --model_name_or_path bert-base-uncased \ 12 | --task_name mnli \ 13 | --nncf_config ${EFTNAS_PATH}/eftnas_configs/nncf_eftnas_s1_bert_base_mnli.json \ 14 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-mnli/movement_sparsity \ 15 | --do_train \ 16 | --do_eval \ 17 | --max_seq_length 128 \ 18 | --per_device_train_batch_size 32 \ 19 | --per_device_eval_batch_size 128 \ 20 | --learning_rate 2e-5 \ 21 | --num_train_epochs 6 \ 22 | --evaluation_strategy epoch \ 23 | --save_strategy epoch \ 24 | --save_total_limit 1 \ 25 | --seed 42 \ 26 | --fp16 \ 27 | --only_generate_importance_weight True 28 | cd .. 
29 | 30 | CUDA_VISIBLE_DEVICES=${DEVICES} python eftnas_search_space/generate_eftnas_search_space.py \ 31 | --source_config eftnas_configs/nncf_eftnas_s1_bert_base_mnli.json \ 32 | --model_name_or_path bert-base-uncased \ 33 | --importance_weight_dir trained_models/eftnas-bert-base-mnli/movement_sparsity \ 34 | --target_config results/generated_configs/nncf_eftnas_s1_bert_base_mnli.json 35 | 36 | cd transformers_eftnas 37 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/text-classification/run_glue.py \ 38 | --model_name_or_path bert-base-uncased \ 39 | --kd_teacher_model JeremiahZ/bert-base-uncased-mnli \ 40 | --reorg_cache_model ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-mnli/movement_sparsity/pytorch_model.bin \ 41 | --task_name mnli \ 42 | --nncf_config ${EFTNAS_PATH}/results/generated_configs/nncf_eftnas_s1_bert_base_mnli.json \ 43 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-mnli \ 44 | --do_train \ 45 | --do_eval \ 46 | --do_search \ 47 | --max_seq_length 128 \ 48 | --per_device_train_batch_size 32 \ 49 | --per_device_eval_batch_size 128 \ 50 | --evaluation_strategy epoch \ 51 | --save_strategy epoch \ 52 | --save_total_limit 1 \ 53 | --seed 42 \ 54 | --fp16 55 | -------------------------------------------------------------------------------- /EFTNAS/running_commands/mrpc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | EFTNAS_PATH=$PWD 4 | DEVICES=0 5 | 6 | # Model: BERT-base 7 | # Dataset: mrpc 8 | 9 | cd transformers_eftnas 10 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/text-classification/run_glue.py \ 11 | --model_name_or_path bert-base-uncased \ 12 | --task_name mrpc \ 13 | --nncf_config ${EFTNAS_PATH}/eftnas_configs/nncf_eftnas_s1_bert_base_mrpc.json \ 14 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-mrpc/movement_sparsity \ 15 | --do_train \ 16 | --do_eval \ 17 | --max_seq_length 128 \ 18 | --per_device_train_batch_size 32 \ 19 | --per_device_eval_batch_size 128 \ 20 | --learning_rate 2e-5 \ 21 | --num_train_epochs 30 \ 22 | --evaluation_strategy epoch \ 23 | --save_strategy epoch \ 24 | --save_total_limit 1 \ 25 | --seed 42 \ 26 | --fp16 \ 27 | --only_generate_importance_weight True 28 | cd .. 
29 | 30 | CUDA_VISIBLE_DEVICES=${DEVICES} python eftnas_search_space/generate_eftnas_search_space.py \ 31 | --source_config eftnas_configs/nncf_eftnas_s1_bert_base_mrpc.json \ 32 | --model_name_or_path bert-base-uncased \ 33 | --importance_weight_dir trained_models/eftnas-bert-base-mrpc/movement_sparsity \ 34 | --target_config results/generated_configs/nncf_eftnas_s1_bert_base_mrpc.json 35 | 36 | cd transformers_eftnas 37 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/text-classification/run_glue.py \ 38 | --model_name_or_path bert-base-uncased \ 39 | --kd_teacher_model Intel/bert-base-uncased-mrpc \ 40 | --reorg_cache_model ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-mrpc/movement_sparsity/pytorch_model.bin \ 41 | --task_name mrpc \ 42 | --nncf_config ${EFTNAS_PATH}/results/generated_configs/nncf_eftnas_s1_bert_base_mrpc.json \ 43 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-mrpc \ 44 | --do_train \ 45 | --do_eval \ 46 | --do_search \ 47 | --max_seq_length 128 \ 48 | --per_device_train_batch_size 32 \ 49 | --per_device_eval_batch_size 128 \ 50 | --evaluation_strategy epoch \ 51 | --save_strategy epoch \ 52 | --save_total_limit 1 \ 53 | --seed 42 \ 54 | --fp16 55 | -------------------------------------------------------------------------------- /EFTNAS/running_commands/qnli.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | EFTNAS_PATH=$PWD 4 | DEVICES=0 5 | 6 | # Model: BERT-base 7 | # Dataset: qnli 8 | 9 | cd transformers_eftnas 10 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/text-classification/run_glue.py \ 11 | --model_name_or_path bert-base-uncased \ 12 | --task_name qnli \ 13 | --nncf_config ${EFTNAS_PATH}/eftnas_configs/nncf_eftnas_s1_bert_base_qnli.json \ 14 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-qnli/movement_sparsity \ 15 | --do_train \ 16 | --do_eval \ 17 | --max_seq_length 128 \ 18 | --per_device_train_batch_size 32 \ 19 | --per_device_eval_batch_size 128 \ 20 | --learning_rate 2e-5 \ 21 | --num_train_epochs 6 \ 22 | --evaluation_strategy epoch \ 23 | --save_strategy epoch \ 24 | --save_total_limit 1 \ 25 | --seed 42 \ 26 | --fp16 \ 27 | --only_generate_importance_weight True 28 | cd .. 
29 | 30 | CUDA_VISIBLE_DEVICES=${DEVICES} python eftnas_search_space/generate_eftnas_search_space.py \ 31 | --source_config eftnas_configs/nncf_eftnas_s1_bert_base_qnli.json \ 32 | --model_name_or_path bert-base-uncased \ 33 | --importance_weight_dir trained_models/eftnas-bert-base-qnli/movement_sparsity \ 34 | --target_config results/generated_configs/nncf_eftnas_s1_bert_base_qnli.json 35 | 36 | cd transformers_eftnas 37 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/text-classification/run_glue.py \ 38 | --model_name_or_path bert-base-uncased \ 39 | --kd_teacher_model ModelTC/bert-base-uncased-qnli \ 40 | --reorg_cache_model ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-qnli/movement_sparsity/pytorch_model.bin \ 41 | --task_name qnli \ 42 | --nncf_config ${EFTNAS_PATH}/results/generated_configs/nncf_eftnas_s1_bert_base_qnli.json \ 43 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-qnli \ 44 | --do_train \ 45 | --do_eval \ 46 | --do_search \ 47 | --max_seq_length 128 \ 48 | --per_device_train_batch_size 32 \ 49 | --per_device_eval_batch_size 128 \ 50 | --evaluation_strategy epoch \ 51 | --save_strategy epoch \ 52 | --save_total_limit 1 \ 53 | --seed 42 \ 54 | --fp16 55 | -------------------------------------------------------------------------------- /EFTNAS/running_commands/qqp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | EFTNAS_PATH=$PWD 4 | DEVICES=0 5 | 6 | # Model: BERT-base 7 | # Dataset: qqp 8 | 9 | cd transformers_eftnas 10 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/text-classification/run_glue.py \ 11 | --model_name_or_path bert-base-uncased \ 12 | --task_name qqp \ 13 | --nncf_config ${EFTNAS_PATH}/eftnas_configs/nncf_eftnas_s1_bert_base_qqp.json \ 14 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-qqp/movement_sparsity \ 15 | --do_train \ 16 | --do_eval \ 17 | --max_seq_length 128 \ 18 | --per_device_train_batch_size 32 \ 19 | --per_device_eval_batch_size 128 \ 20 | --learning_rate 2e-5 \ 21 | --num_train_epochs 6 \ 22 | --evaluation_strategy epoch \ 23 | --save_strategy epoch \ 24 | --save_total_limit 1 \ 25 | --seed 42 \ 26 | --fp16 \ 27 | --only_generate_importance_weight True 28 | cd .. 
29 | 30 | CUDA_VISIBLE_DEVICES=${DEVICES} python eftnas_search_space/generate_eftnas_search_space.py \ 31 | --source_config eftnas_configs/nncf_eftnas_s1_bert_base_qqp.json \ 32 | --model_name_or_path bert-base-uncased \ 33 | --importance_weight_dir trained_models/eftnas-bert-base-qqp/movement_sparsity \ 34 | --target_config results/generated_configs/nncf_eftnas_s1_bert_base_qqp.json 35 | 36 | cd transformers_eftnas 37 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/text-classification/run_glue.py \ 38 | --model_name_or_path bert-base-uncased \ 39 | --kd_teacher_model JeremiahZ/bert-base-uncased-qqp \ 40 | --reorg_cache_model ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-qqp/movement_sparsity/pytorch_model.bin \ 41 | --task_name qqp \ 42 | --nncf_config ${EFTNAS_PATH}/results/generated_configs/nncf_eftnas_s1_bert_base_qqp.json \ 43 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-qqp \ 44 | --do_train \ 45 | --do_eval \ 46 | --do_search \ 47 | --max_seq_length 128 \ 48 | --per_device_train_batch_size 32 \ 49 | --per_device_eval_batch_size 128 \ 50 | --evaluation_strategy epoch \ 51 | --save_strategy epoch \ 52 | --save_total_limit 1 \ 53 | --seed 42 \ 54 | --fp16 55 | -------------------------------------------------------------------------------- /EFTNAS/running_commands/rte.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | EFTNAS_PATH=$PWD 4 | DEVICES=0 5 | 6 | # Model: BERT-base 7 | # Dataset: rte 8 | 9 | cd transformers_eftnas 10 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/text-classification/run_glue.py \ 11 | --model_name_or_path bert-base-uncased \ 12 | --task_name rte \ 13 | --nncf_config ${EFTNAS_PATH}/eftnas_configs/nncf_eftnas_s1_bert_base_rte.json \ 14 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-rte/movement_sparsity \ 15 | --do_train \ 16 | --do_eval \ 17 | --max_seq_length 128 \ 18 | --per_device_train_batch_size 32 \ 19 | --per_device_eval_batch_size 128 \ 20 | --learning_rate 2e-5 \ 21 | --num_train_epochs 30 \ 22 | --evaluation_strategy epoch \ 23 | --save_strategy epoch \ 24 | --save_total_limit 1 \ 25 | --seed 42 \ 26 | --fp16 \ 27 | --only_generate_importance_weight True 28 | cd .. 
29 | 30 | CUDA_VISIBLE_DEVICES=${DEVICES} python eftnas_search_space/generate_eftnas_search_space.py \ 31 | --source_config eftnas_configs/nncf_eftnas_s1_bert_base_rte.json \ 32 | --model_name_or_path bert-base-uncased \ 33 | --importance_weight_dir trained_models/eftnas-bert-base-rte/movement_sparsity \ 34 | --target_config results/generated_configs/nncf_eftnas_s1_bert_base_rte.json 35 | 36 | cd transformers_eftnas 37 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/text-classification/run_glue.py \ 38 | --model_name_or_path bert-base-uncased \ 39 | --kd_teacher_model textattack/bert-base-uncased-RTE \ 40 | --reorg_cache_model ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-rte/movement_sparsity/pytorch_model.bin \ 41 | --task_name rte \ 42 | --nncf_config ${EFTNAS_PATH}/results/generated_configs/nncf_eftnas_s1_bert_base_rte.json \ 43 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-rte \ 44 | --do_train \ 45 | --do_eval \ 46 | --do_search \ 47 | --max_seq_length 128 \ 48 | --per_device_train_batch_size 32 \ 49 | --per_device_eval_batch_size 128 \ 50 | --evaluation_strategy epoch \ 51 | --save_strategy epoch \ 52 | --save_total_limit 1 \ 53 | --seed 42 \ 54 | --fp16 55 | -------------------------------------------------------------------------------- /EFTNAS/running_commands/squadv1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | EFTNAS_PATH=$PWD 4 | DEVICES=0 5 | 6 | # Model: BERT-base 7 | # Dataset: squadv1 8 | 9 | cd transformers_eftnas 10 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/question-answering/run_qa.py \ 11 | --model_name_or_path bert-base-uncased \ 12 | --do_train \ 13 | --do_eval \ 14 | --dataset_name squad \ 15 | --learning_rate 2e-5 \ 16 | --per_gpu_train_batch_size 16 \ 17 | --per_gpu_eval_batch_size 128 \ 18 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-squadv1/movement_sparsity \ 19 | --max_seq_length 384 \ 20 | --doc_stride 128 \ 21 | --nncf_config ${EFTNAS_PATH}/eftnas_configs/nncf_eftnas_s1_bert_base_squadv1.json \ 22 | --evaluation_strategy epoch \ 23 | --save_strategy epoch \ 24 | --metric_for_best_model f1 \ 25 | --overwrite_output_dir \ 26 | --save_total_limit 1 \ 27 | --num_train_epochs 8 \ 28 | --fp16 \ 29 | --only_generate_importance_weight True 30 | 31 | CUDA_VISIBLE_DEVICES=${DEVICES} python eftnas_search_space/generate_eftnas_search_space.py \ 32 | --source_config eftnas_configs/nncf_eftnas_s1_bert_base_squadv1.json \ 33 | --model_name_or_path bert-base-uncased \ 34 | --importance_weight_dir trained_models/eftnas-bert-base-squadv1/movement_sparsity \ 35 | --target_config results/generated_configs/nncf_eftnas_s1_bert_base_squadv1.json 36 | 37 | cd transformers_eftnas 38 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/question-answering/run_qa.py \ 39 | --model_name_or_path bert-base-uncased \ 40 | --kd_teacher_model csarron/bert-base-uncased-squad-v1 \ 41 | --reorg_cache_model ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-squadv1/movement_sparsity/pytorch_model.bin \ 42 | --do_train \ 43 | --do_eval \ 44 | --do_search \ 45 | --dataset_name squad \ 46 | --learning_rate 3e-5 \ 47 | --per_gpu_train_batch_size 16 \ 48 | --per_gpu_eval_batch_size 128 \ 49 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-squadv1 \ 50 | --max_seq_length 384 \ 51 | --doc_stride 128 \ 52 | --nncf_config ${EFTNAS_PATH}/results/generated_configs/nncf_eftnas_s1_bert_base_squadv1.json \ 53 | --evaluation_strategy 
epoch \ 54 | --save_strategy epoch \ 55 | --metric_for_best_model f1 \ 56 | --save_total_limit 1 \ 57 | --num_train_epochs 8 \ 58 | --fp16 59 | -------------------------------------------------------------------------------- /EFTNAS/running_commands/squadv2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | EFTNAS_PATH=$PWD 4 | DEVICES=0 5 | 6 | # Model: BERT-base 7 | # Dataset: squadv2 8 | 9 | cd transformers_eftnas 10 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/question-answering/run_qa.py \ 11 | --model_name_or_path bert-base-uncased \ 12 | --do_train \ 13 | --do_eval \ 14 | --dataset_name squad_v2 \ 15 | --learning_rate 2e-5 \ 16 | --per_gpu_train_batch_size 16 \ 17 | --per_gpu_eval_batch_size 128 \ 18 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-squadv2/movement_sparsity \ 19 | --max_seq_length 384 \ 20 | --doc_stride 128 \ 21 | --nncf_config ${EFTNAS_PATH}/eftnas_configs/nncf_eftnas_s1_bert_base_squadv2.json \ 22 | --evaluation_strategy epoch \ 23 | --save_strategy epoch \ 24 | --metric_for_best_model f1 \ 25 | --overwrite_output_dir \ 26 | --save_total_limit 1 \ 27 | --num_train_epochs 8 \ 28 | --fp16 \ 29 | --only_generate_importance_weight True 30 | 31 | CUDA_VISIBLE_DEVICES=${DEVICES} python eftnas_search_space/generate_eftnas_search_space.py \ 32 | --source_config eftnas_configs/nncf_eftnas_s1_bert_base_squadv2.json \ 33 | --model_name_or_path bert-base-uncased \ 34 | --importance_weight_dir trained_models/eftnas-bert-base-squadv2/movement_sparsity \ 35 | --target_config results/generated_configs/nncf_eftnas_s1_bert_base_squadv2.json 36 | 37 | cd transformers_eftnas 38 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/question-answering/run_qa.py \ 39 | --model_name_or_path bert-base-uncased \ 40 | --kd_teacher_model deepset/bert-base-uncased-squad2 \ 41 | --reorg_cache_model ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-squadv2/movement_sparsity/pytorch_model.bin \ 42 | --do_train \ 43 | --do_eval \ 44 | --do_search \ 45 | --dataset_name squad_v2 \ 46 | --learning_rate 3e-5 \ 47 | --per_gpu_train_batch_size 16 \ 48 | --per_gpu_eval_batch_size 128 \ 49 | --version_2_with_negative \ 50 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-squadv2 \ 51 | --max_seq_length 384 \ 52 | --doc_stride 128 \ 53 | --nncf_config ${EFTNAS_PATH}/results/generated_configs/nncf_eftnas_s1_bert_base_squadv2.json \ 54 | --evaluation_strategy epoch \ 55 | --save_strategy epoch \ 56 | --metric_for_best_model f1 \ 57 | --ddp_find_unused_parameters True \ 58 | --save_total_limit 1 \ 59 | --num_train_epochs 8 \ 60 | --fp16 61 | -------------------------------------------------------------------------------- /EFTNAS/running_commands/sst2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | EFTNAS_PATH=$PWD 4 | DEVICES=0 5 | 6 | # Model: BERT-base 7 | # Dataset: sst2 8 | 9 | cd transformers_eftnas 10 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/text-classification/run_glue.py \ 11 | --model_name_or_path bert-base-uncased \ 12 | --task_name sst2 \ 13 | --nncf_config ${EFTNAS_PATH}/eftnas_configs/nncf_eftnas_s1_bert_base_sst2.json \ 14 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-sst2/movement_sparsity \ 15 | --do_train \ 16 | --do_eval \ 17 | --max_seq_length 128 \ 18 | --per_device_train_batch_size 32 \ 19 | --per_device_eval_batch_size 128 \ 20 | --learning_rate 2e-5 \ 21 | 
--num_train_epochs 6 \ 22 | --evaluation_strategy epoch \ 23 | --save_strategy epoch \ 24 | --save_total_limit 1 \ 25 | --seed 42 \ 26 | --fp16 \ 27 | --only_generate_importance_weight True 28 | cd .. 29 | 30 | CUDA_VISIBLE_DEVICES=${DEVICES} python eftnas_search_space/generate_eftnas_search_space.py \ 31 | --source_config eftnas_configs/nncf_eftnas_s1_bert_base_sst2.json \ 32 | --model_name_or_path bert-base-uncased \ 33 | --importance_weight_dir trained_models/eftnas-bert-base-sst2/movement_sparsity \ 34 | --target_config results/generated_configs/nncf_eftnas_s1_bert_base_sst2.json 35 | 36 | cd transformers_eftnas 37 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/text-classification/run_glue.py \ 38 | --model_name_or_path bert-base-uncased \ 39 | --kd_teacher_model JeremiahZ/bert-base-uncased-sst2 \ 40 | --reorg_cache_model ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-sst2/movement_sparsity/pytorch_model.bin \ 41 | --task_name sst2 \ 42 | --nncf_config ${EFTNAS_PATH}/results/generated_configs/nncf_eftnas_s1_bert_base_sst2.json \ 43 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-sst2 \ 44 | --do_train \ 45 | --do_eval \ 46 | --do_search \ 47 | --max_seq_length 128 \ 48 | --per_device_train_batch_size 32 \ 49 | --per_device_eval_batch_size 128 \ 50 | --evaluation_strategy epoch \ 51 | --save_strategy epoch \ 52 | --save_total_limit 1 \ 53 | --seed 42 \ 54 | --fp16 55 | -------------------------------------------------------------------------------- /EZNAS/README.md: -------------------------------------------------------------------------------- 1 | # EZNAS: Evolving Zero-Cost Proxies For Neural Architecture Scoring 2 | 3 | EZNAS is a genetic programming-driven methodology for automatically discovering Zero-Cost Neural Architecture Scoring Metrics (ZC-NASMs). It aims to provide an interpretable, generalizable, and efficient approach to rank neural networks without the expensive training routines, significantly reducing the carbon footprint of Neural Architecture Search (NAS). 4 | 5 | ## Installation 6 | 7 | Follow these steps to set up and run EZNAS: 8 | 9 | ### Step 1: Base Set-up 10 | Run the provided setup_script.sh to install all necessary packages and dependencies. 11 | 12 | ```bash 13 | bash setup_script.sh 14 | ``` 15 | 16 | This script should handle: 17 | 18 | 1. Installation of required Python packages. 19 | 2. Cloning of external GitHub repositories. 20 | 3. Setting up datasets and additional files necessary for running the project. 21 | 22 | ### Step 2: Set Environment Variable 23 | 24 | Set the PROJ_HOME environment variable to the path of your project: 25 | 26 | ```bash 27 | export PROJ_HOME="" 28 | ``` 29 | 30 | ### Step 3: Run evaluation 31 | 32 | For SLURM based execution, modify runjob.sh as per server specification. 33 | 34 | To reproduce results for a specific data-set, simply run the appropriate command in quotes from the reproduce.sh file. 
35 | 36 | ```bash 37 | python verify_scores.py --batch_size 16 --search_space NASBench201 --dataset cifar10 --nds_space '' 38 | ``` 39 | 40 | ### Results 41 | 42 | | Search Space | Kendall τ | Spearman ρ | 43 | |------------------------|--------------|--------------| 44 | | NASBench-201 CIFAR-10 | 0.6195383854 | 0.8084988792 | 45 | | NASBench-201 CIFAR-100 | 0.6168760649 | 0.7983379022 | 46 | | NATSBench-SSS | 0.7073727282 | 0.8873359833 | 47 | | NDS DARTS | 0.5466290384 | 0.7364709542 | 48 | | NDS Amoeba | 0.4130041903 | 0.5775007582 | 49 | | NDS ENAS | 0.5111310224 | 0.6932549307 | 50 | | NDS PNAS | 0.4781835008 | 0.656343803 | 51 | | NDS NASNet | 0.4312498051 | 0.6050820615 | 52 | 53 | 54 | Note that the above table is for a batch size of 16. For better results, a higher batch size is recommended; for instance, for NATSBench-SSS at a batch size of 64, the Spearman ρ is 0.91. 55 | 56 | ## Citation 57 | 58 | If you use the code or data in your research, please use the following BibTeX entry: 59 | 60 | ``` 61 | @inproceedings{ 62 | akhauri2022eznas, 63 | title={{EZNAS}: Evolving Zero-Cost Proxies For Neural Architecture Scoring}, 64 | author={Yash Akhauri and Juan Pablo Munoz and Nilesh Jain and Ravishankar Iyer}, 65 | booktitle={Advances in Neural Information Processing Systems}, 66 | editor={Alice H. Oh and Alekh Agarwal and Danielle Belgrave and Kyunghyun Cho}, 67 | year={2022}, 68 | url={https://openreview.net/forum?id=lSqaDG4dvdt} 69 | } 70 | ``` -------------------------------------------------------------------------------- /EZNAS/evol_config.yaml: -------------------------------------------------------------------------------- 1 | NUM_MATH_OPS: 28 2 | STATIC_ADDRS: 22 3 | LEN_IND_ADDR: 3 4 | NUM_DYNAMIC_ADDR_SPACES: 5 5 | NEW_INST_MIN_LEN: 8 6 | NEW_INST_MAX_LEN: 24 7 | NGEN: 10 8 | POPSIZE: 50 9 | TOURSIZE: 4 10 | MU: 25 11 | lambda_ : 50 12 | CXPR: 0.4 13 | MUTPR: 0.4 14 | nproc: 16 15 | NUM_NETS: 500 16 | SUBSAMPLE_NETS: 20 17 | NUM_SAMPLING_EVAL: 4 18 | rangemix: True 19 | MIN_TREE_DEPTH: 2 20 | MAX_TREE_DEPTH: 6 21 | data_folder: "data2" -------------------------------------------------------------------------------- /EZNAS/reproduce.sh: -------------------------------------------------------------------------------- 1 | bash runjob.sh "python verify_scores.py --batch_size 16 " "nb2_cf10" 2 | bash runjob.sh "python verify_scores.py --batch_size 16 --dataset cifar100" "nb2_cf100" 3 | bash runjob.sh "python verify_scores.py --batch_size 16 --dataset ImageNet16-120" "nb2_in" 4 | bash runjob.sh "python verify_scores.py --batch_size 16 --nds_space nds_amoeba --search_space NDS" "amoeba" 5 | bash runjob.sh "python verify_scores.py --batch_size 16 --nds_space nds_darts --search_space NDS" "darts" 6 | bash runjob.sh "python verify_scores.py --batch_size 16 --nds_space nds_pnas --search_space NDS" "pnas" 7 | bash runjob.sh "python verify_scores.py --batch_size 16 --nds_space nds_nasnet --search_space NDS" "nasnet" 8 | bash runjob.sh "python verify_scores.py --batch_size 16 --nds_space nds_enas --search_space NDS" "enas" 9 | bash runjob.sh "python verify_scores.py --batch_size 64 --search_space NATSBench" "nats" 10 | 11 | -------------------------------------------------------------------------------- /EZNAS/reproduce/all_tests.csv: -------------------------------------------------------------------------------- 1 | 1,-1,16,cifar10,NASBench201,,0.6195383853731455,0.4902176532853689,0.8084988792039854 2 | 1,-1,16,cifar100,NASBench201,,0.6168760649367213,0.5991489645246415,0.7983379021533304
3 | 1,-1,16,cifar10,NATSBench,,0.7073727281824621,0.8655292939139214,0.8873359832758938 4 | 1,-1,16,cifar10,NDS,nds_darts,0.5466290384387654,0.2101314187641226,0.7364709541852409 5 | 1,-1,16,cifar10,NDS,nds_amoeba,0.41300419025409685,0.13047597007496348,0.5775007581755685 6 | 1,-1,16,cifar10,NDS,nds_enas,0.5111310223594074,0.12183248307551318,0.6932549307473483 7 | 1,-1,16,cifar10,NDS,nds_pnas,0.47818350082450095,0.0657925903642663,0.6563438030327016 8 | 1,-1,16,cifar10,NDS,nds_nasnet,0.43124980513279043,0.16231595543650768,0.6050820615327234 -------------------------------------------------------------------------------- /EZNAS/runjob.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if two arguments are provided 4 | if [ "$#" -ne 2 ]; then 5 | echo "Usage: $0 [command] [job-name]" 6 | exit 1 7 | fi 8 | 9 | COMMAND=$1 10 | JOB_NAME=$2 11 | 12 | # Create a temporary SLURM script 13 | TMP_SLURM_SCRIPT="tmp_${JOB_NAME}.slurm" 14 | 15 | cat > $TMP_SLURM_SCRIPT < -------------------------------------------------------------------------------- /Mamba-Shedder/extract/README.md: -------------------------------------------------------------------------------- 1 | ## Extract the Compressed Model from Mamba-Shedder 2 | 3 | The final compressed model can be extracted based on the optimal pruning configuration obtained from Mamba-Shedder. 4 | 5 | ```bash 6 | # Mamba-1 7 | python extract/extract_mamba.py \ 8 | --model_path state-spaces/mamba-2.8b \ 9 | --output_path <path to compressed model> \ 10 | --pruned_model_config_file <path to pruning results>/pruning_config.json # Or specify the config file of a pruning step from the `pruned_model_configs` folder, e.g., <path to pruning results>/pruned_model_configs/config.mamba_block.${eval_step}.json 11 | 12 | # Mamba-2 (SSM Pruning) 13 | python extract/extract_mamba.py \ 14 | --model_path state-spaces/mamba2-2.7b \ 15 | --output_path <path to compressed model> \ 16 | --pruned_model_config_file <path to pruning results>/pruning_config.json # Or specify the config file of a pruning step from the `pruned_model_configs` folder, e.g., <path to pruning results>/pruned_model_configs/config.ssm.${eval_step}.json 17 | ``` 18 | 19 | - `model_path`: Path to the pre-trained model. 20 | - `pruned_model_config_file`: JSON file for the pruned model configuration. 21 | - `output_path`: Directory to save the compressed model. 22 | -------------------------------------------------------------------------------- /Mamba-Shedder/extract/extract_mamba.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import os 5 | import torch 6 | 7 | from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel 8 | from transformers import AutoTokenizer 9 | 10 | 11 | MAMBA_MODULES = [ 12 | "backbone.layers.*.mixer.dt_bias", 13 | "backbone.layers.*.mixer.A_log", 14 | "backbone.layers.*.mixer.D", 15 | "backbone.layers.*.mixer.in_proj.weight", 16 | "backbone.layers.*.mixer.conv1d.weight", 17 | "backbone.layers.*.mixer.conv1d.bias", 18 | "backbone.layers.*.mixer.norm.weight", 19 | "backbone.layers.*.mixer.out_proj.weight", 20 | "backbone.layers.*.mixer.dt_proj.weight", # Mamba-1 21 | "backbone.layers.*.mixer.dt_proj.bias", # Mamba-1 22 | "backbone.layers.*.mixer.x_proj.weight", # Mamba-1 23 | "backbone.layers.*.norm.weight", 24 | ] 25 | 26 | # only for Mamba-2 27 | SSM_MODULES = [ 28 | "backbone.layers.*.mixer.D", 29 | "backbone.layers.*.mixer.dt_bias", 30 | ] 31 | 32 | 33 | def main(): 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument( 36 | "--model_path", 37 | type=str, 38 | help="Path to the Mamba model." 39 | ) 40 | parser.add_argument( 41 | "--output_path", 42 | type=str, 43 | help="Directory to save the compressed model." 44 | ) 45 | parser.add_argument( 46 | "--pruned_model_config_file", 47 | type=str, 48 | help="Path to the pruned model configuration file."
49 | ) 50 | 51 | args = parser.parse_args() 52 | model_path = args.model_path 53 | output_path = args.output_path 54 | # Create output directory if it doesn't exist 55 | os.makedirs(output_path, exist_ok=True) 56 | pruned_model_config_file = args.pruned_model_config_file 57 | 58 | # Load model and tokenizer 59 | tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b") 60 | model = MambaLMHeadModel.from_pretrained(model_path, device="cuda", dtype=torch.float16) 61 | 62 | # Load pruning results 63 | with open(pruned_model_config_file, "r") as f: 64 | pruned_config = json.load(f) 65 | logging.info(f"Detect a pruned model config: {pruned_config}") 66 | state_dict = model.state_dict() 67 | 68 | def prune_modules(state_dict, idx, module_names): 69 | for module_name in module_names: 70 | module_name = module_name.replace("*", str(idx)) 71 | if module_name in state_dict: 72 | del state_dict[module_name] 73 | 74 | if pruned_config.get("pruned_mamba_block_idx"): 75 | pruned_mamba_block_idx = pruned_config["pruned_mamba_block_idx"] 76 | for idx in pruned_mamba_block_idx: 77 | prune_modules(state_dict, idx, MAMBA_MODULES) 78 | if pruned_config.get("pruned_ssm_idx"): 79 | pruned_ssm_idx = pruned_config["pruned_ssm_idx"] 80 | for idx in pruned_ssm_idx: 81 | prune_modules(state_dict, idx, SSM_MODULES) 82 | 83 | model.save_pretrained(output_path, state_dict=state_dict) 84 | tokenizer.save_pretrained(output_path) 85 | 86 | 87 | if __name__ == "__main__": 88 | main() 89 | -------------------------------------------------------------------------------- /Mamba-Shedder/hybrid/Hymba-Pruning/eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | 5 | from transformers import AutoModelForCausalLM, AutoTokenizer 6 | 7 | from lm_eval import evaluator 8 | from lm_eval.models.huggingface import HFLM 9 | 10 | TASKS = ["arc_easy", "arc_challenge", "piqa", "winogrande", "hellaswag"] 11 | 12 | 13 | def main(): 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument( 16 | "--model_path", 17 | type=str, 18 | ) 19 | args = parser.parse_args() 20 | model_path = args.model_path 21 | 22 | model = AutoModelForCausalLM.from_pretrained( 23 | model_path, 24 | device_map="cuda", 25 | torch_dtype="float16", 26 | trust_remote_code=True 27 | ) 28 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 29 | lm = HFLM(pretrained=model, tokenizer=tokenizer, batch_size=64) 30 | 31 | # Evaluate on selected tasks 32 | logging.info(f"Selected Tasks: {TASKS}") 33 | results = evaluator.simple_evaluate(lm, tasks=TASKS, num_fewshot=0, batch_size=64, log_samples=False)['results'] 34 | 35 | metric_vals = {} 36 | for task, result in results.items(): 37 | res = result['acc,none'] 38 | metric_vals[task] = round(res, 3) * 100 39 | 40 | logging.info(json.dumps(metric_vals, indent=4)) 41 | 42 | 43 | if __name__ == "__main__": 44 | main() 45 | -------------------------------------------------------------------------------- /Mamba-Shedder/hybrid/Hymba-Pruning/extract/README.md: -------------------------------------------------------------------------------- 1 | ## Extract the Compressed Model from Mamba-Shedder 2 | 3 | The final compressed model can be extracted based on the optimal pruning configuration obtained from Mamba-Shedder. 4 | 5 | ```bash 6 | # Hymba 7 | python extract/extract_hymba.py \ 8 | --model_path Hymba-1.5B-Base \ 9 | --weight_reorder \ 10 | --output_path . 
\ 11 | --pruned_model_config_file /pruning_config.json # Or specify the config file of a pruning step from the `pruned_model_configs` folder, e.g., /pruned_model_configs/config.mlp_width.${eval_step}.json 12 | 13 | ``` 14 | 15 | - `model_path`: Path to the pre-trained model. 16 | - `weight_reorder`: Flag to indicate whether to perform weight reorder in MLP. 17 | - `pruned_model_config_file`: JSON file for the pruned model configuration. 18 | - `output_path`: Directory to save the compressed model. 19 | -------------------------------------------------------------------------------- /Mamba-Shedder/hybrid/Hymba-Pruning/recovery/README.md: -------------------------------------------------------------------------------- 1 | ### Recovery Fine-tuning after Pruning 2 | 3 | After obtaining the pruned model ([extract](../extract)), we can finetune it to recover accuracy. 4 | The dataset used for finetuning is [Alpaca](https://huggingface.co/datasets/yahma/alpaca-cleaned). 5 | Here is an example command: 6 | 7 | ```bash 8 | # Finetune the compressed Hymba 9 | python finetune_hymba.py \ 10 | --model_path \ 11 | --do_train \ 12 | --batch_size 4 \ 13 | --gradient_accumulation_steps 8 \ 14 | --num_train_epochs 3 \ 15 | --learning_rate 3e-4 \ 16 | --lora \ 17 | --lora_r 16 \ 18 | --lora_alpha 32 \ 19 | --lora_target_modules in_proj,out_proj,down_proj,up_proj \ 20 | --output_path \ 21 | --do_eval 22 | 23 | # after fine-tuning, merge the adapter to the compressed model 24 | python merge.py \ 25 | --base_model_path \ 26 | --adapter_model_path \ 27 | --output_path \ 28 | 29 | ``` 30 | -------------------------------------------------------------------------------- /Mamba-Shedder/hybrid/Hymba-Pruning/recovery/merge.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from peft import PeftModel 4 | from transformers import AutoModelForCausalLM, AutoTokenizer 5 | 6 | 7 | def main(): 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("--base_model_path", type=str) 10 | parser.add_argument("--adapter_model_path", type=str) 11 | parser.add_argument("--output_path", type=str) 12 | args = parser.parse_args() 13 | base_model_path = args.base_model_path 14 | adapter_model_path = args.adapter_model_path 15 | output_path = args.output_path 16 | 17 | base_model, loading_info = AutoModelForCausalLM.from_pretrained( 18 | base_model_path, 19 | device_map={"": 0}, 20 | trust_remote_code=True, 21 | torch_dtype="float16", 22 | output_loading_info=True, 23 | ) 24 | model = PeftModel.from_pretrained(base_model, adapter_model_path, device_map={"": 0}) 25 | model.eval() 26 | merged_model = model.merge_and_unload() 27 | merged_model.train(False) 28 | 29 | sd = merged_model.state_dict() 30 | base_model.save_pretrained(output_path, state_dict=sd) 31 | tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True) 32 | tokenizer.save_pretrained(output_path) 33 | 34 | 35 | if __name__ == "__main__": 36 | main() 37 | -------------------------------------------------------------------------------- /Mamba-Shedder/hybrid/Hymba-Pruning/results/README.md: -------------------------------------------------------------------------------- 1 | ## Run Command (Hymba) 2 | 3 | Here are the commands to reproduce the main results of the paper. 
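Each `--do_prune` run writes one pruning configuration per step under `pruned_model_configs/`, and each `--do_eval` run writes a matching `eval.res.*.json`. The snippet below is a minimal sketch (not part of this repository) for tabulating those artifacts after the commands in the next section have been run; it assumes the per-step configs use the same `pruned_hymba_block_idx` schema as the final `pruning_config.json`.

```python
# Hypothetical helper, not shipped with Mamba-Shedder: summarize which Hymba
# blocks each evaluated pruning step removed and the resulting average accuracy.
import json
from pathlib import Path

result_dir = Path("results/hymba-1.5b-base")

for step in (5, 6, 7):  # the steps evaluated in the commands below
    cfg_file = result_dir / "pruned_model_configs" / f"config.hymba_block.{step}.json"
    res_file = result_dir / f"eval.res.config.hymba_block.{step}.json"
    if not cfg_file.exists() or not res_file.exists():
        continue  # this step was pruned but never evaluated
    cfg = json.loads(cfg_file.read_text())
    res = json.loads(res_file.read_text())
    blocks = sorted(cfg.get("pruned_hymba_block_idx", []))
    print(f"step {step}: pruned Hymba blocks {blocks} -> 5-task avg {res.get('5cs_acc_avg')}%")
```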
4 | 5 | ### Hymba-1.5B 6 | 7 | ```bash 8 | pruning_result_path=results/hymba-1.5b-base 9 | 10 | python prune.py \ 11 | --model_path Hymba-1.5B-Base \ 12 | --do_prune \ 13 | --output_path ${pruning_result_path} \ 14 | --num_block_pruning_steps 8 \ 15 | --block_pruning_targets hymba_block \ 16 | --importance_metric ppl \ 17 | --calibration_dataset alpaca \ 18 | --num_calibration_samples 256 \ 19 | 20 | for eval_step in 5 6 7; do 21 | python prune.py \ 22 | --model_path Hymba-1.5B-Base \ 23 | --output_path ${pruning_result_path} \ 24 | --do_eval \ 25 | --pruned_model_config_file ${pruning_result_path}/pruned_model_configs/config.hymba_block.${eval_step}.json 26 | done 27 | ``` 28 | -------------------------------------------------------------------------------- /Mamba-Shedder/hybrid/Hymba-Pruning/results/hymba-1.5b-base/eval.res.config.hymba_block.5.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 1522797824, 3 | "5cs_acc_avg": 62.3, 4 | "arc_challenge": 44.9, 5 | "arc_easy": 76.0, 6 | "hellaswag": 50.5, 7 | "piqa": 75.8, 8 | "winogrande": 64.1 9 | } -------------------------------------------------------------------------------- /Mamba-Shedder/hybrid/Hymba-Pruning/results/hymba-1.5b-base/eval.res.config.hymba_block.6.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 1522797824, 3 | "5cs_acc_avg": 61.7, 4 | "arc_challenge": 43.9, 5 | "arc_easy": 74.8, 6 | "hellaswag": 49.9, 7 | "piqa": 74.9, 8 | "winogrande": 64.9 9 | } -------------------------------------------------------------------------------- /Mamba-Shedder/hybrid/Hymba-Pruning/results/hymba-1.5b-base/eval.res.config.hymba_block.7.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 1522797824, 3 | "5cs_acc_avg": 60.5, 4 | "arc_challenge": 43.2, 5 | "arc_easy": 74.2, 6 | "hellaswag": 49.2, 7 | "piqa": 74.3, 8 | "winogrande": 61.5 9 | } -------------------------------------------------------------------------------- /Mamba-Shedder/hybrid/Hymba-Pruning/results/hymba-1.5b-base/pruning_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pruned_hymba_block_idx": [ 3 | 2, 4 | 22, 5 | 15, 6 | 28, 7 | 10, 8 | 24, 9 | 26 10 | ] 11 | } -------------------------------------------------------------------------------- /Mamba-Shedder/hybrid/Zamba2-Pruning/eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | 5 | from transformers import AutoModelForCausalLM, AutoTokenizer 6 | 7 | from lm_eval import evaluator 8 | from lm_eval.models.huggingface import HFLM 9 | 10 | TASKS = ["lambada_openai", "hellaswag", "piqa", "arc_easy", "arc_challenge", "winogrande", "openbookqa"] 11 | 12 | 13 | def main(): 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument( 16 | "--model_path", 17 | type=str, 18 | ) 19 | args = parser.parse_args() 20 | model_path = args.model_path 21 | 22 | model = AutoModelForCausalLM.from_pretrained( 23 | model_path, 24 | device_map="cuda", 25 | torch_dtype="float16", 26 | trust_remote_code=True 27 | ) 28 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 29 | lm = HFLM(pretrained=model, tokenizer=tokenizer, batch_size=64) 30 | 31 | # Evaluate on selected tasks 32 | logging.info(f"Selected Tasks: {TASKS}") 33 | results = evaluator.simple_evaluate(lm, tasks=TASKS, 
log_samples=False)['results'] 34 | 35 | metric_vals = {} 36 | for task, result in results.items(): 37 | # TODO: fix (all are `acc_norm,none`) 38 | res = result['acc,none'] if task == 'arc_easy' else result.get('acc_norm,none', result['acc,none']) 39 | metric_vals[task] = round(res, 3) * 100 40 | if task == "lambada_openai": 41 | metric_vals[task + "_ppl"] = result['perplexity,none'] 42 | 43 | logging.info(json.dumps(metric_vals, indent=4)) 44 | 45 | 46 | if __name__ == "__main__": 47 | main() -------------------------------------------------------------------------------- /Mamba-Shedder/hybrid/Zamba2-Pruning/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -x 4 | 5 | MAMBA_SHEDDER_ZAMBA2_PATH=$PWD 6 | 7 | cp ../../utils.py . 8 | cp ../../patches/mamba-62db608.patch ./patches 9 | 10 | pip install virtualenv 11 | virtualenv venv 12 | source venv/bin/activate 13 | 14 | pip install --upgrade pip setuptools wheel 15 | pip install torch==2.4.0 16 | pip install "causal-conv1d>=1.4.0" 17 | 18 | mkdir -pv ${MAMBA_SHEDDER_ZAMBA2_PATH}/third_party 19 | 20 | pushd ${MAMBA_SHEDDER_ZAMBA2_PATH}/third_party 21 | git clone https://github.com/state-spaces/mamba.git 22 | pushd mamba 23 | git checkout 62db608 24 | git apply --ignore-space-change --ignore-whitespace ${MAMBA_SHEDDER_ZAMBA2_PATH}/patches/mamba-62db608.patch 25 | pip install . 26 | pip install lm-eval==0.4.2 27 | 28 | pushd ${MAMBA_SHEDDER_ZAMBA2_PATH}/third_party 29 | git clone https://github.com/Zyphra/transformers_zamba2.git 30 | cd transformers_zamba2 31 | git checkout 7593823 32 | git apply --ignore-space-change --ignore-whitespace ${MAMBA_SHEDDER_ZAMBA2_PATH}/patches/zamba2-7593823.patch 33 | pip install -e . --no-deps 34 | pip install tokenizers==0.19.0 numpy==1.26.4 accelerate 35 | pushd ${MAMBA_SHEDDER_ZAMBA2_PATH} 36 | 37 | echo "Environment all ready. 
execute 'source venv/bin/activate' to run" 38 | -------------------------------------------------------------------------------- /Mamba-Shedder/hybrid/Zamba2-Pruning/preprocess.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | from safetensors.torch import save_file 4 | from safetensors import safe_open 5 | 6 | NUM_G_LAYERS = 9 7 | NUM_MEM_BLOCKS = 2 8 | 9 | 10 | def preprocess_func(data): 11 | new_data = {} 12 | transformer_weight_data = {} 13 | for key, weight_tensor in data.items(): 14 | if "model.blocks" in key: 15 | if "linear_fc1" in key and "lora_A" not in key: 16 | up_proj_weight_tensor, gate_proj_weight_tensor = weight_tensor.chunk(2, dim=0) 17 | transformer_weight_data[key.replace("linear_fc1", "linear_fc1_up")] = up_proj_weight_tensor.clone() 18 | transformer_weight_data[key.replace("linear_fc1", "linear_fc1_gate")] = gate_proj_weight_tensor.clone() 19 | elif "lora_A" in key: 20 | transformer_weight_data[key.replace("linear_fc1", "linear_fc1_up")] = weight_tensor.clone() 21 | transformer_weight_data[key.replace("linear_fc1", "linear_fc1_gate")] = weight_tensor.clone() 22 | else: 23 | transformer_weight_data[key] = weight_tensor.clone() 24 | else: 25 | new_data[key] = weight_tensor.clone() 26 | 27 | for num_layer in range(NUM_G_LAYERS): 28 | if num_layer % NUM_MEM_BLOCKS == 0: 29 | id = 0 30 | else: 31 | id = 1 32 | cur_data = {k: v for k, v in transformer_weight_data.items() if f"model.blocks.{id}" in k} 33 | for k, v in cur_data.items(): 34 | if "lora" not in k: 35 | new_data[k.replace(f"model.blocks.{id}", f"model.blocks.{num_layer}")] = v.clone() 36 | elif f"_lora_B_list.{num_layer}" in k: # lora 37 | lora_B = v.clone() 38 | lora_A = cur_data[k.replace(f"lora_B", "lora_A")] 39 | lora_BA = torch.matmul(lora_B, lora_A) 40 | new_data[k.replace(f"model.blocks.{id}", f"model.blocks.{num_layer}").replace(f"_lora_B_list.{num_layer}", "")] += lora_BA.clone() 41 | return new_data 42 | 43 | def load_safetensors(filename): 44 | tensors = {} 45 | with safe_open(filename, framework="pt", device=0) as f: 46 | metadata = f.metadata() 47 | for k in f.keys(): 48 | tensors[k] = f.get_tensor(k) 49 | return tensors, metadata 50 | 51 | def save_safetensors(data, metadata, filename): 52 | save_file(data, filename, metadata=metadata) 53 | 54 | new_weight_map = {} 55 | 56 | data, metadata = load_safetensors("./Zamba2-2.7B/model-00001-of-00002.safetensors") 57 | new_data = preprocess_func(data) 58 | save_safetensors(new_data, metadata, "./Zamba2-2.7B/model-00001-of-00002.safetensors") 59 | for key in new_data.keys(): 60 | new_weight_map[key] = "model-00001-of-00002.safetensors" 61 | 62 | data, metadata = load_safetensors("./Zamba2-2.7B/model-00002-of-00002.safetensors") 63 | new_data = preprocess_func(data) 64 | save_safetensors(new_data, metadata, "./Zamba2-2.7B/model-00002-of-00002.safetensors") 65 | for key in new_data.keys(): 66 | new_weight_map[key] = "model-00002-of-00002.safetensors" 67 | 68 | with open("./Zamba2-2.7B/model.safetensors.index.json", "r") as f: 69 | data = json.load(f) 70 | data["weight_map"] = new_weight_map 71 | with open("./Zamba2-2.7B/model.safetensors.index.json", "w") as f: 72 | json.dump(data, f, indent=4) 73 | -------------------------------------------------------------------------------- /Mamba-Shedder/hybrid/Zamba2-Pruning/results/README.md: -------------------------------------------------------------------------------- 1 | ## Run Command (Zamba2) 2 | 3 | Here are the commands to reproduce the 
main results of the paper. 4 | 5 | ### Zamba2-2.7B 6 | 7 | #### Pruning Ratio: 10% 8 | 9 | ```bash 10 | pruning_result_path=results/zamba2-2.7b 11 | 12 | # Multi-granularity Pruning 13 | python prune_hybrid.py \ 14 | --model_path Zamba2-2.7B \ 15 | --output_path ${pruning_result_path} \ 16 | --do_prune \ 17 | --target_block_pruning_steps 3 \ 18 | --target_width_pruning_steps 20 \ 19 | --target_ssm_pruning_steps 18 \ 20 | --mlp_channel_group_size 1024 \ 21 | --importance_metric ppl \ 22 | --calibration_dataset alpaca \ 23 | --num_calibration_samples_block 256 \ 24 | --num_calibration_samples_width 256 \ 25 | --num_calibration_samples_ssm 256 26 | 27 | # Evaluation: w/o SSM Pruning 28 | python prune_hybrid.py \ 29 | --model_path Zamba2-2.7B \ 30 | --output_path ${pruning_result_path} \ 31 | --do_eval \ 32 | --pruned_model_config_file ${pruning_result_path}/pruned_model_configs/config.mlp_width.22.json 33 | 34 | # Evaluation: w/ SSM Pruning 35 | python prune_hybrid.py \ 36 | --model_path Zamba2-2.7B \ 37 | --output_path ${pruning_result_path} \ 38 | --do_eval \ 39 | --pruned_model_config_file ${pruning_result_path}/pruned_model_configs/config.ssm.40.json 40 | ``` 41 | 42 | #### Pruning Ratio: 15% 43 | 44 | ```bash 45 | pruning_result_path=results/zamba2-2.7b 46 | 47 | # Multi-granularity Pruning 48 | python prune_hybrid.py \ 49 | --model_path Zamba2-2.7B \ 50 | --output_path ${pruning_result_path} \ 51 | --do_prune \ 52 | --target_block_pruning_steps 8 \ 53 | --target_width_pruning_steps 20 \ 54 | --target_ssm_pruning_steps 18 \ 55 | --mlp_channel_group_size 1024 \ 56 | --importance_metric ppl \ 57 | --calibration_dataset alpaca \ 58 | --num_calibration_samples_block 256 \ 59 | --num_calibration_samples_width 256 \ 60 | --num_calibration_samples_ssm 256 61 | 62 | # Evaluation: w/o SSM Pruning 63 | python prune_hybrid.py \ 64 | --model_path Zamba2-2.7B \ 65 | --output_path ${pruning_result_path} \ 66 | --do_eval \ 67 | --pruned_model_config_file ${pruning_result_path}/pruned_model_configs/config.mlp_width.27.json 68 | 69 | # Evaluation: w/ SSM Pruning 70 | python prune_hybrid.py \ 71 | --model_path Zamba2-2.7B \ 72 | --output_path ${pruning_result_path} \ 73 | --do_eval \ 74 | --pruned_model_config_file ${pruning_result_path}/pruned_model_configs/config.ssm.45.json 75 | ``` 76 | -------------------------------------------------------------------------------- /Mamba-Shedder/hybrid/Zamba2-Pruning/results/zamba2-2.7b/ratio_10/eval.res.config.ssm.40.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 3828481440, 3 | "7cs_acc_avg": 65.9, 4 | "openbookqa": 46.6, 5 | "winogrande": 69.5, 6 | "arc_challenge": 48.699999999999996, 7 | "arc_easy": 79.0, 8 | "piqa": 80.0, 9 | "hellaswag": 73.9, 10 | "lambada_openai": 63.4, 11 | "lambada_openai_ppl": 5.180797687033141 12 | } -------------------------------------------------------------------------------- /Mamba-Shedder/hybrid/Zamba2-Pruning/results/zamba2-2.7b/ratio_10/pruning_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pruned_mamba_block_idx": [], 3 | "pruned_mha_idx": [], 4 | "pruned_mlp_idx": [ 5 | 6, 6 | 8, 7 | 7 8 | ], 9 | "pruned_mlp_channels": { 10 | "0": 3072, 11 | "1": 4096, 12 | "2": 8192, 13 | "3": 10240, 14 | "4": 6144, 15 | "5": 9216, 16 | "6": 10240, 17 | "7": 10240, 18 | "8": 10240 19 | }, 20 | "pruned_ssm_idx": [ 21 | 47, 22 | 42, 23 | 48, 24 | 43, 25 | 45, 26 | 37, 27 | 51, 28 | 40, 29 | 44, 30 | 46, 31 | 38, 32 | 41, 33 
| 53, 34 | 49, 35 | 33, 36 | 31, 37 | 34, 38 | 50 39 | ] 40 | } -------------------------------------------------------------------------------- /Mamba-Shedder/hybrid/Zamba2-Pruning/results/zamba2-2.7b/ratio_15/eval.res.config.ssm.45.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 3828481440, 3 | "7cs_acc_avg": 61.3, 4 | "openbookqa": 42.8, 5 | "winogrande": 67.7, 6 | "arc_challenge": 41.8, 7 | "arc_easy": 73.4, 8 | "piqa": 77.9, 9 | "hellaswag": 68.89999999999999, 10 | "lambada_openai": 56.49999999999999, 11 | "lambada_openai_ppl": 7.437938064270495 12 | } -------------------------------------------------------------------------------- /Mamba-Shedder/hybrid/Zamba2-Pruning/results/zamba2-2.7b/ratio_15/pruning_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pruned_mamba_block_idx": [ 3 | 13, 4 | 21, 5 | 44, 6 | 22, 7 | 28 8 | ], 9 | "pruned_mha_idx": [], 10 | "pruned_mlp_idx": [ 11 | 6, 12 | 8, 13 | 7 14 | ], 15 | "pruned_mlp_channels": { 16 | "0": 3072, 17 | "1": 4096, 18 | "2": 7168, 19 | "3": 10240, 20 | "4": 6144, 21 | "5": 10240, 22 | "6": 10240, 23 | "7": 10240, 24 | "8": 10240 25 | }, 26 | "pruned_ssm_idx": [ 27 | 47, 28 | 51, 29 | 48, 30 | 43, 31 | 45, 32 | 37, 33 | 41, 34 | 40, 35 | 42, 36 | 53, 37 | 46, 38 | 49, 39 | 38, 40 | 34, 41 | 50, 42 | 52, 43 | 33, 44 | 31 45 | ] 46 | } -------------------------------------------------------------------------------- /Mamba-Shedder/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -x 4 | 5 | MAMBA_SHEDDER_PATH=$PWD 6 | 7 | pip install virtualenv 8 | virtualenv venv 9 | source venv/bin/activate 10 | 11 | pip install --upgrade pip setuptools wheel 12 | pip install torch==2.4.0 13 | pip install transformers==4.43.0 14 | pip install "causal-conv1d>=1.4.0" 15 | 16 | mkdir -pv ${MAMBA_SHEDDER_PATH}/third_party 17 | 18 | pushd ${MAMBA_SHEDDER_PATH}/third_party 19 | git clone https://github.com/state-spaces/mamba.git 20 | pushd mamba 21 | git checkout 62db608 22 | git apply --ignore-space-change --ignore-whitespace ${MAMBA_SHEDDER_PATH}/patches/mamba-62db608.patch 23 | pip install . 24 | pushd ${MAMBA_SHEDDER_PATH} 25 | 26 | pip install lm-eval==0.4.2 27 | echo "Environment all ready. execute 'source venv/bin/activate' to run" 28 | -------------------------------------------------------------------------------- /Mamba-Shedder/results/README.md: -------------------------------------------------------------------------------- 1 | ## Run Command (Mamba and Mamba2) 2 | 3 | Here are the commands to reproduce the main results of the paper. 
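Conceptually, `--do_prune` runs a greedy search: at each step it removes the one remaining unit (Mamba block or SSM) whose removal hurts calibration perplexity the least, and records the running configuration so any intermediate step can be evaluated later. The sketch below illustrates that loop in isolation; `score` is a toy stand-in for measuring perplexity on the calibration set with the listed units skipped, not the repository's actual evaluator.

```python
# Illustrative only: greedy, perplexity-guided pruning as described above.
def greedy_pruning(num_units, target_steps, score):
    pruned = []
    for _ in range(target_steps):
        candidates = [i for i in range(num_units) if i not in pruned]
        best = min(candidates, key=lambda i: score(pruned + [i]))  # least PPL damage
        pruned.append(best)
        yield list(pruned)  # analogous to one config.<target>.<step>.json per step

# Toy run: pretend higher-index units matter less, so they are removed first.
for step, cfg in enumerate(greedy_pruning(num_units=8, target_steps=3,
                                          score=lambda p: -sum(p))):
    print(f"step {step}: pruned_idx = {cfg}")
```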
4 | 5 | ### Mamba-2.8B 6 | 7 | ```bash 8 | pruning_result_path=results/mamba-2.8b 9 | 10 | # Mamba Block Pruning 11 | python prune.py \ 12 | --model_path state-spaces/mamba-2.8b \ 13 | --prune_target mamba_block \ 14 | --output_path ${pruning_result_path} \ 15 | --do_prune \ 16 | --target_pruning_steps 14 \ 17 | --importance_metric ppl \ 18 | --calibration_dataset alpaca \ 19 | --num_calibration_samples 256 20 | 21 | # Evaluation for different steps 22 | for eval_step in 6 13; do 23 | python prune.py \ 24 | --model_path state-spaces/mamba-2.8b \ 25 | --output_path ${pruning_result_path} \ 26 | --do_eval \ 27 | --pruned_model_config_file ${pruning_result_path}/pruned_model_configs/config.mamba_block.${eval_step}.json 28 | done 29 | ``` 30 | 31 | ### Mamba2-2.7B 32 | 33 | ```bash 34 | pruning_result_path=results/mamba2-2.7b 35 | 36 | # SSM Pruning 37 | python prune.py \ 38 | --model_path state-spaces/mamba2-2.7b \ 39 | --prune_target ssm \ 40 | --output_path ${pruning_result_path} \ 41 | --do_prune \ 42 | --target_pruning_steps 24 \ 43 | --importance_metric ppl \ 44 | --calibration_dataset alpaca \ 45 | --num_calibration_samples 256 46 | 47 | # Evaluation for different steps 48 | for eval_step in 15 19 21 23; do 49 | python prune.py \ 50 | --model_path state-spaces/mamba2-2.7b \ 51 | --output_path ${pruning_result_path} \ 52 | --do_eval \ 53 | --pruned_model_config_file ${pruning_result_path}/pruned_model_configs/config.ssm.${eval_step}.json 54 | done 55 | ``` 56 | -------------------------------------------------------------------------------- /Mamba-Shedder/results/mamba-2.8b/eval.res.config.mamba_block.13.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 2768345600, 3 | "7cs_acc_avg": 53.800000000000004, 4 | "openbookqa": 33.2, 5 | "winogrande": 61.1, 6 | "arc_challenge": 32.0, 7 | "arc_easy": 62.7, 8 | "piqa": 71.0, 9 | "hellaswag": 57.599999999999994, 10 | "lambada_openai": 58.9, 11 | "lambada_openai_ppl": 7.50530338198477 12 | } -------------------------------------------------------------------------------- /Mamba-Shedder/results/mamba-2.8b/eval.res.config.mamba_block.6.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 2768345600, 3 | "7cs_acc_avg": 57.8, 4 | "openbookqa": 37.0, 5 | "winogrande": 62.5, 6 | "arc_challenge": 33.5, 7 | "arc_easy": 68.0, 8 | "piqa": 73.7, 9 | "hellaswag": 63.7, 10 | "lambada_openai": 65.8, 11 | "lambada_openai_ppl": 4.943333997537422 12 | } -------------------------------------------------------------------------------- /Mamba-Shedder/results/mamba-2.8b/pruning_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pruned_mamba_block_idx": [ 3 | 2, 4 | 6, 5 | 12, 6 | 5, 7 | 10, 8 | 8, 9 | 13, 10 | 11, 11 | 26, 12 | 48, 13 | 19, 14 | 55, 15 | 15, 16 | 3 17 | ] 18 | } -------------------------------------------------------------------------------- /Mamba-Shedder/results/mamba2-2.7b/eval.res.config.ssm.15.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 2702599680, 3 | "7cs_acc_avg": 59.8, 4 | "openbookqa": 39.2, 5 | "winogrande": 64.0, 6 | "arc_challenge": 37.2, 7 | "arc_easy": 68.60000000000001, 8 | "piqa": 76.4, 9 | "hellaswag": 66.10000000000001, 10 | "lambada_openai": 66.9, 11 | "lambada_openai_ppl": 4.267932476614268 12 | } -------------------------------------------------------------------------------- 
/Mamba-Shedder/results/mamba2-2.7b/eval.res.config.ssm.19.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 2702599680, 3 | "7cs_acc_avg": 58.599999999999994, 4 | "openbookqa": 39.2, 5 | "winogrande": 63.6, 6 | "arc_challenge": 36.7, 7 | "arc_easy": 68.89999999999999, 8 | "piqa": 76.1, 9 | "hellaswag": 66.0, 10 | "lambada_openai": 59.8, 11 | "lambada_openai_ppl": 5.897567054174858 12 | } -------------------------------------------------------------------------------- /Mamba-Shedder/results/mamba2-2.7b/eval.res.config.ssm.21.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 2702599680, 3 | "7cs_acc_avg": 57.8, 4 | "openbookqa": 38.0, 5 | "winogrande": 62.9, 6 | "arc_challenge": 36.5, 7 | "arc_easy": 68.30000000000001, 8 | "piqa": 75.7, 9 | "hellaswag": 65.60000000000001, 10 | "lambada_openai": 57.599999999999994, 11 | "lambada_openai_ppl": 6.498095929049319 12 | } -------------------------------------------------------------------------------- /Mamba-Shedder/results/mamba2-2.7b/eval.res.config.ssm.23.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 2702599680, 3 | "7cs_acc_avg": 55.50000000000001, 4 | "openbookqa": 38.0, 5 | "winogrande": 62.9, 6 | "arc_challenge": 36.6, 7 | "arc_easy": 67.10000000000001, 8 | "piqa": 74.8, 9 | "hellaswag": 65.8, 10 | "lambada_openai": 43.4, 11 | "lambada_openai_ppl": 14.95706208912779 12 | } -------------------------------------------------------------------------------- /Mamba-Shedder/results/mamba2-2.7b/pruning_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pruned_ssm_idx": [ 3 | 63, 4 | 54, 5 | 42, 6 | 45, 7 | 53, 8 | 57, 9 | 58, 10 | 59, 11 | 38, 12 | 56, 13 | 50, 14 | 61, 15 | 60, 16 | 43, 17 | 37, 18 | 62, 19 | 49, 20 | 34, 21 | 55, 22 | 33, 23 | 39, 24 | 35, 25 | 44, 26 | 46 27 | ] 28 | } -------------------------------------------------------------------------------- /MultiPruner/eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import logging 4 | import argparse 5 | 6 | from transformers import AutoModelForCausalLM, AutoTokenizer 7 | 8 | from lm_eval import evaluator 9 | from lm_eval.models.huggingface import HFLM 10 | 11 | import utils 12 | 13 | 14 | def main(): 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--model_path", type=str) 17 | parser.add_argument("--output_path", type=str) 18 | args = parser.parse_args() 19 | model_path = args.model_path 20 | output_path = args.output_path 21 | 22 | # Ensure the output directory exists 23 | if not os.path.exists(output_path): 24 | os.makedirs(output_path) 25 | 26 | model = AutoModelForCausalLM.from_pretrained( 27 | model_path, 28 | device_map="auto", 29 | torch_dtype="float16", 30 | trust_remote_code=True, 31 | ) 32 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 33 | 34 | # Evaluate on wikitext2 dataset 35 | dataset = utils.get_dataset("wikitext2") 36 | test_dataset = dataset["test"] 37 | test_loader = utils.prepare_test_dataloader( 38 | dataset=test_dataset, 39 | tokenizer=tokenizer, 40 | seqlen=2048, 41 | batch_size=1 42 | ) 43 | dataset_ppl = utils.evaluate_ppl( 44 | model=model, 45 | dataloader=test_loader, 46 | pad_token_id=model.config.eos_token_id, 47 | ) 48 | dataset_ppl = round(dataset_ppl, 2) 49 | logging.info(f'wikitext2 PPL: 
{dataset_ppl}') 50 | 51 | # Evaluate on selected tasks 52 | hflm = HFLM(pretrained=model, tokenizer=tokenizer, batch_size=64) 53 | 54 | task_names = ["piqa", "winogrande", "hellaswag", "arc_easy", "arc_challenge"] 55 | logging.info(f"Selected Tasks: {task_names}") 56 | 57 | results = evaluator.simple_evaluate(hflm, tasks=task_names, num_fewshot=0, batch_size=64, log_samples=False)['results'] 58 | 59 | metric_vals = {task: round(result.get('acc_norm,none', result['acc,none']), 4) * 100 for task, result in results.items()} 60 | logging.info(json.dumps(metric_vals, indent=4)) 61 | 62 | def calculate_avg_accuracy(task_names, results): 63 | n_tasks = len(task_names) 64 | acc_cumul = sum(result.get('acc_norm,none', result['acc,none']) for task, result in results.items()) 65 | return round(acc_cumul / n_tasks, 4) * 100 66 | 67 | acc_avg = calculate_avg_accuracy(task_names, results) 68 | logging.info(f"Average accuracy across tasks: {acc_avg}") 69 | 70 | # Save evaluation results 71 | overall_results = { 72 | "ppl_wikitext2": dataset_ppl, 73 | "5cs_acc_avg": acc_avg, 74 | **metric_vals 75 | } 76 | eval_result_path = os.path.join(output_path, f"eval.res.json") 77 | with open(eval_result_path, "w") as f: 78 | json.dump(overall_results, f, indent=4) 79 | 80 | 81 | if __name__ == "__main__": 82 | main() 83 | -------------------------------------------------------------------------------- /MultiPruner/extract/README.md: -------------------------------------------------------------------------------- 1 | ## Extract the Compressed Model from MultiPruner 2 | 3 | The final compressed model can be extracted based on the optimal pruning configuration obtained from **MultiPruner**. 4 | Here is an example command for the compressed Llama-2-7B: 5 | 6 | ```bash 7 | python extract/extract_model.py \ 8 | --model_path meta-llama/Llama-2-7b-hf \ 9 | --weight_reorder \ 10 | --pruned_model_config_file /pruning_config.json \ 11 | --output_path 12 | ``` 13 | 14 | - `model_path`: Path to the pre-trained model. 15 | - `weight_reorder`: Flag to indicate whether to perform weight reordering. 16 | - `pruned_model_config_file`: JSON file for the pruned model configuration. 17 | - `output_path`: Directory to save the compressed model. 18 | -------------------------------------------------------------------------------- /MultiPruner/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -x 4 | 5 | MULTIPRUNER_PATH=$PWD 6 | 7 | python3.10 -m venv venv 8 | source venv/bin/activate 9 | 10 | mkdir -pv third_party 11 | pushd third_party 12 | 13 | git clone https://github.com/huggingface/transformers.git 14 | pushd transformers 15 | git checkout v4.45.0 16 | git apply --ignore-space-change --ignore-whitespace ${MULTIPRUNER_PATH}/patches/transformers-v4.45.0.patch 17 | pip install -e . 18 | 19 | pushd ${MULTIPRUNER_PATH} 20 | 21 | pip install -r requirements.txt 22 | 23 | echo "Environment all ready. execute 'source venv/bin/activate' to run" 24 | 25 | -------------------------------------------------------------------------------- /MultiPruner/recovery/README.md: -------------------------------------------------------------------------------- 1 | ### Recovery Fine-tuning after Pruning 2 | 3 | After obtaining the compressed model ([here](../extract)), we can finetune it to recover accuracy. 4 | The dataset used for finetuning is [Alpaca](https://huggingface.co/datasets/yahma/alpaca-cleaned). 
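For intuition, each Alpaca record has `instruction`, `input`, and `output` fields that are rendered into a single training string. The sketch below uses a common Alpaca-style template as an assumption for illustration; it is not necessarily the exact prompt format implemented in `finetune.py`.

```python
# Minimal sketch (hypothetical template, not taken from recovery/finetune.py):
# render yahma/alpaca-cleaned records into plain-text training examples.
from datasets import load_dataset

def to_prompt(example):
    if example["input"]:
        return {"text": (f"### Instruction:\n{example['instruction']}\n\n"
                         f"### Input:\n{example['input']}\n\n"
                         f"### Response:\n{example['output']}")}
    return {"text": (f"### Instruction:\n{example['instruction']}\n\n"
                     f"### Response:\n{example['output']}")}

dataset = load_dataset("yahma/alpaca-cleaned", split="train").map(to_prompt)
print(dataset[0]["text"][:200])  # sanity-check the rendered prompt
```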
5 | Here is an example command: 6 | 7 | ```bash 8 | python recovery/finetune.py \ 9 | --model_path \ 10 | --do_train \ 11 | --batch_size 8 \ 12 | --gradient_accumulation_steps 4 \ 13 | --max_steps 3000 \ 14 | --learning_rate 1e-4 \ 15 | --lora \ 16 | --lora_r 16 \ 17 | --lora_alpha 32 \ 18 | --lora_target_modules q_proj,k_proj,v_proj,o_proj,down_proj,up_proj,gate_proj \ 19 | --output_path \ 20 | --do_eval 21 | ``` 22 | 23 | After fine-tuning, we can merge the trained adapter into the pruned base model. 24 | 25 | ```bash 26 | python recovery/merge.py \ 27 | --base_model_path \ 28 | --adapter_model_path \ 29 | --output_path 30 | ``` 31 | -------------------------------------------------------------------------------- /MultiPruner/recovery/merge.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from peft import PeftModel 4 | from transformers import AutoModelForCausalLM, AutoTokenizer 5 | 6 | 7 | def main(): 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("--base_model_path", type=str) 10 | parser.add_argument("--adapter_model_path", type=str) 11 | parser.add_argument("--output_path", type=str) 12 | args = parser.parse_args() 13 | base_model_path = args.base_model_path 14 | adapter_model_path = args.adapter_model_path 15 | output_path = args.output_path 16 | 17 | base_model, loading_info = AutoModelForCausalLM.from_pretrained( 18 | base_model_path, 19 | device_map={"": 0}, 20 | trust_remote_code=True, 21 | torch_dtype="float16", 22 | output_loading_info=True, 23 | ) 24 | model = PeftModel.from_pretrained(base_model, adapter_model_path, device_map={"": 0}) 25 | model.eval() 26 | merged_model = model.merge_and_unload() 27 | merged_model.train(False) 28 | 29 | sd = merged_model.state_dict() 30 | 31 | if len(loading_info["missing_keys"]) > 0: 32 | for key in loading_info["missing_keys"]: 33 | del sd[key] 34 | 35 | base_model.save_pretrained(output_path, state_dict=sd) 36 | tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True) 37 | tokenizer.save_pretrained(output_path) 38 | 39 | 40 | if __name__ == "__main__": 41 | main() 42 | -------------------------------------------------------------------------------- /MultiPruner/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy<2.0.0 2 | setuptools==70.0.0 3 | datasets 4 | accelerate 5 | sentencepiece 6 | protobuf 7 | bitsandbytes 8 | lm-eval==0.4.2 9 | torch==2.3.1 10 | -------------------------------------------------------------------------------- /MultiPruner/results/Baichuan2-13B-Base/ratio_24/eval.res.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 13896668160, 3 | "pruned_params": 10559575040, 4 | "ratio": 24.013620254712908, 5 | "ppl_wikitext2": 10.99, 6 | "5cs_acc_avg": 59.12, 7 | "arc_challenge": 37.2, 8 | "arc_easy": 57.95, 9 | "hellaswag": 64.03, 10 | "winogrande": 66.46, 11 | "piqa": 69.97 12 | } -------------------------------------------------------------------------------- /MultiPruner/results/Baichuan2-13B-Base/ratio_24/pruning_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pruned_attn_idx": [ 3 | 33, 4 | 31, 5 | 26, 6 | 32, 7 | 34, 8 | 27, 9 | 6, 10 | 29, 11 | 28 12 | ], 13 | "pruned_mlp_idx": [ 14 | 34, 15 | 6, 16 | 31, 17 | 35 18 | ], 19 | "pruned_attn_width": { 20 | "0": 5120, 21 | "1": 5120, 22 | "2": 4608, 23 | "3": 5120, 24 | "4": 4992, 25 | "5": 
4864, 26 | "6": 5120, 27 | "7": 4864, 28 | "8": 5120, 29 | "9": 4992, 30 | "10": 5120, 31 | "11": 5120, 32 | "12": 5120, 33 | "13": 5120, 34 | "14": 4992, 35 | "15": 4992, 36 | "16": 4864, 37 | "17": 5120, 38 | "18": 4992, 39 | "19": 4864, 40 | "20": 4992, 41 | "21": 5120, 42 | "22": 4864, 43 | "23": 5120, 44 | "24": 5120, 45 | "25": 4480, 46 | "26": 5120, 47 | "27": 5120, 48 | "28": 5120, 49 | "29": 5120, 50 | "30": 4096, 51 | "31": 5120, 52 | "32": 5120, 53 | "33": 5120, 54 | "34": 5120, 55 | "35": 4608, 56 | "36": 4992, 57 | "37": 4992, 58 | "38": 4864, 59 | "39": 4480 60 | }, 61 | "pruned_mlp_width": { 62 | "0": 8576, 63 | "1": 12672, 64 | "2": 9600, 65 | "3": 13696, 66 | "4": 12672, 67 | "5": 6528, 68 | "6": 13696, 69 | "7": 12672, 70 | "8": 12672, 71 | "9": 12672, 72 | "10": 12672, 73 | "11": 10624, 74 | "12": 12672, 75 | "13": 12672, 76 | "14": 9600, 77 | "15": 10624, 78 | "16": 12672, 79 | "17": 13696, 80 | "18": 12672, 81 | "19": 11648, 82 | "20": 11648, 83 | "21": 12672, 84 | "22": 13696, 85 | "23": 12672, 86 | "24": 9600, 87 | "25": 9600, 88 | "26": 6528, 89 | "27": 7552, 90 | "28": 6528, 91 | "29": 4480, 92 | "30": 13696, 93 | "31": 13696, 94 | "32": 4480, 95 | "33": 13696, 96 | "34": 13696, 97 | "35": 13696, 98 | "36": 13696, 99 | "37": 10624, 100 | "38": 13696, 101 | "39": 13696 102 | } 103 | } -------------------------------------------------------------------------------- /MultiPruner/results/Baichuan2-7B-Base/ratio_22/eval.res.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 7505973248, 3 | "pruned_params": 5824057344, 4 | "ratio": 22.407699154112414, 5 | "ppl_wikitext2": 12.37, 6 | "5cs_acc_avg": 57.599999999999994, 7 | "arc_challenge": 34.9, 8 | "arc_easy": 56.989999999999995, 9 | "hellaswag": 60.919999999999995, 10 | "winogrande": 64.72, 11 | "piqa": 70.46 12 | } -------------------------------------------------------------------------------- /MultiPruner/results/Baichuan2-7B-Base/ratio_22/pruning_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pruned_attn_idx": [ 3 | 4, 4 | 24, 5 | 26, 6 | 25, 7 | 21, 8 | 22, 9 | 27 10 | ], 11 | "pruned_mlp_idx": [ 12 | 4, 13 | 26 14 | ], 15 | "pruned_attn_width": { 16 | "0": 3840, 17 | "1": 4096, 18 | "2": 3840, 19 | "3": 2560, 20 | "4": 4096, 21 | "5": 3584, 22 | "6": 3968, 23 | "7": 3968, 24 | "8": 3840, 25 | "9": 4096, 26 | "10": 4096, 27 | "11": 4096, 28 | "12": 4096, 29 | "13": 4096, 30 | "14": 4096, 31 | "15": 3968, 32 | "16": 3840, 33 | "17": 4096, 34 | "18": 3968, 35 | "19": 4096, 36 | "20": 3968, 37 | "21": 4096, 38 | "22": 4096, 39 | "23": 4096, 40 | "24": 4096, 41 | "25": 4096, 42 | "26": 4096, 43 | "27": 4096, 44 | "28": 4096, 45 | "29": 4096, 46 | "30": 4096, 47 | "31": 4096 48 | }, 49 | "pruned_mlp_width": { 50 | "0": 11008, 51 | "1": 2816, 52 | "2": 11008, 53 | "3": 11008, 54 | "4": 11008, 55 | "5": 2816, 56 | "6": 11008, 57 | "7": 11008, 58 | "8": 11008, 59 | "9": 11008, 60 | "10": 11008, 61 | "11": 9984, 62 | "12": 11008, 63 | "13": 11008, 64 | "14": 11008, 65 | "15": 2816, 66 | "16": 11008, 67 | "17": 11008, 68 | "18": 11008, 69 | "19": 3840, 70 | "20": 11008, 71 | "21": 11008, 72 | "22": 1792, 73 | "23": 11008, 74 | "24": 768, 75 | "25": 11008, 76 | "26": 11008, 77 | "27": 11008, 78 | "28": 768, 79 | "29": 1792, 80 | "30": 11008, 81 | "31": 11008 82 | } 83 | } -------------------------------------------------------------------------------- /MultiPruner/results/Llama-2-13B/ratio_25/eval.res.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "total_params": 13015864320, 3 | "pruned_params": 9760035840, 4 | "ratio": 25.01430869248643, 5 | "ppl_wikitext2": 7.19, 6 | "5cs_acc_avg": 68.64, 7 | "arc_challenge": 46.5, 8 | "arc_easy": 71.38, 9 | "hellaswag": 75.62, 10 | "winogrande": 71.89999999999999, 11 | "piqa": 77.8 12 | } -------------------------------------------------------------------------------- /MultiPruner/results/Llama-2-13B/ratio_25/pruning_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pruned_attn_idx": [ 3 | 27, 4 | 32, 5 | 29, 6 | 33, 7 | 23, 8 | 30, 9 | 31, 10 | 34, 11 | 25, 12 | 28, 13 | 26, 14 | 35, 15 | 24 16 | ], 17 | "pruned_mlp_idx": [ 18 | 33 19 | ], 20 | "pruned_attn_width": { 21 | "0": 4864, 22 | "1": 3328, 23 | "2": 5120, 24 | "3": 5120, 25 | "4": 4480, 26 | "5": 4736, 27 | "6": 4992, 28 | "7": 5120, 29 | "8": 4736, 30 | "9": 5120, 31 | "10": 5120, 32 | "11": 5120, 33 | "12": 5120, 34 | "13": 5120, 35 | "14": 4992, 36 | "15": 5120, 37 | "16": 4864, 38 | "17": 4992, 39 | "18": 4992, 40 | "19": 4736, 41 | "20": 4864, 42 | "21": 4480, 43 | "22": 4608, 44 | "23": 5120, 45 | "24": 5120, 46 | "25": 5120, 47 | "26": 5120, 48 | "27": 5120, 49 | "28": 5120, 50 | "29": 5120, 51 | "30": 5120, 52 | "31": 5120, 53 | "32": 5120, 54 | "33": 5120, 55 | "34": 5120, 56 | "35": 5120, 57 | "36": 5120, 58 | "37": 5120, 59 | "38": 5120, 60 | "39": 5120 61 | }, 62 | "pruned_mlp_width": { 63 | "0": 13824, 64 | "1": 13824, 65 | "2": 12800, 66 | "3": 13824, 67 | "4": 10752, 68 | "5": 9728, 69 | "6": 13824, 70 | "7": 13824, 71 | "8": 13824, 72 | "9": 13824, 73 | "10": 13824, 74 | "11": 13824, 75 | "12": 13824, 76 | "13": 13824, 77 | "14": 7680, 78 | "15": 7680, 79 | "16": 13824, 80 | "17": 13824, 81 | "18": 13824, 82 | "19": 13824, 83 | "20": 13824, 84 | "21": 10752, 85 | "22": 4608, 86 | "23": 13824, 87 | "24": 13824, 88 | "25": 13824, 89 | "26": 2560, 90 | "27": 3584, 91 | "28": 13824, 92 | "29": 1536, 93 | "30": 13824, 94 | "31": 1536, 95 | "32": 13824, 96 | "33": 13824, 97 | "34": 2560, 98 | "35": 2560, 99 | "36": 13824, 100 | "37": 13824, 101 | "38": 13824, 102 | "39": 13824 103 | } 104 | } -------------------------------------------------------------------------------- /MultiPruner/results/Llama-2-7B/ratio_10/eval.res.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 6738415616, 3 | "pruned_params": 6063132672, 4 | "ratio": 10.02139052385812, 5 | "ppl_wikitext2": 6.55, 6 | "5cs_acc_avg": 67.02, 7 | "arc_challenge": 44.45, 8 | "arc_easy": 71.0, 9 | "hellaswag": 74.07000000000001, 10 | "winogrande": 68.19, 11 | "piqa": 77.37 12 | } -------------------------------------------------------------------------------- /MultiPruner/results/Llama-2-7B/ratio_10/pruning_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pruned_attn_idx": [ 3 | 25, 4 | 27, 5 | 21, 6 | 23, 7 | 24 8 | ], 9 | "pruned_mlp_idx": [], 10 | "pruned_attn_width": { 11 | "0": 4096, 12 | "1": 3840, 13 | "2": 3840, 14 | "3": 4096, 15 | "4": 4096, 16 | "5": 3968, 17 | "6": 4096, 18 | "7": 4096, 19 | "8": 3968, 20 | "9": 4096, 21 | "10": 4096, 22 | "11": 4096, 23 | "12": 4096, 24 | "13": 4096, 25 | "14": 4096, 26 | "15": 4096, 27 | "16": 4096, 28 | "17": 3968, 29 | "18": 4096, 30 | "19": 3968, 31 | "20": 3968, 32 | "21": 4096, 33 | "22": 3968, 34 | "23": 4096, 35 | "24": 4096, 36 | "25": 4096, 37 | "26": 4096, 
38 | "27": 4096, 39 | "28": 3968, 40 | "29": 4096, 41 | "30": 3968, 42 | "31": 4096 43 | }, 44 | "pruned_mlp_width": { 45 | "0": 11008, 46 | "1": 11008, 47 | "2": 5888, 48 | "3": 11008, 49 | "4": 11008, 50 | "5": 11008, 51 | "6": 11008, 52 | "7": 9984, 53 | "8": 11008, 54 | "9": 11008, 55 | "10": 11008, 56 | "11": 9984, 57 | "12": 11008, 58 | "13": 11008, 59 | "14": 11008, 60 | "15": 11008, 61 | "16": 11008, 62 | "17": 11008, 63 | "18": 11008, 64 | "19": 11008, 65 | "20": 11008, 66 | "21": 11008, 67 | "22": 11008, 68 | "23": 1792, 69 | "24": 11008, 70 | "25": 11008, 71 | "26": 11008, 72 | "27": 1792, 73 | "28": 11008, 74 | "29": 11008, 75 | "30": 11008, 76 | "31": 11008 77 | } 78 | } -------------------------------------------------------------------------------- /MultiPruner/results/Llama-2-7B/ratio_12/eval.res.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 6738415616, 3 | "pruned_params": 5931012096, 4 | "ratio": 11.982097365482536, 5 | "ppl_wikitext2": 7.1, 6 | "5cs_acc_avg": 66.47999999999999, 7 | "arc_challenge": 44.03, 8 | "arc_easy": 69.82000000000001, 9 | "hellaswag": 73.77, 10 | "winogrande": 68.43, 11 | "piqa": 76.33 12 | } -------------------------------------------------------------------------------- /MultiPruner/results/Llama-2-7B/ratio_12/pruning_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pruned_attn_idx": [ 3 | 25, 4 | 27, 5 | 21, 6 | 23, 7 | 24, 8 | 29 9 | ], 10 | "pruned_mlp_idx": [], 11 | "pruned_attn_width": { 12 | "0": 4096, 13 | "1": 4096, 14 | "2": 3840, 15 | "3": 3968, 16 | "4": 4096, 17 | "5": 4096, 18 | "6": 4096, 19 | "7": 4096, 20 | "8": 3968, 21 | "9": 4096, 22 | "10": 4096, 23 | "11": 4096, 24 | "12": 4096, 25 | "13": 4096, 26 | "14": 4096, 27 | "15": 4096, 28 | "16": 3968, 29 | "17": 3968, 30 | "18": 4096, 31 | "19": 3968, 32 | "20": 3968, 33 | "21": 4096, 34 | "22": 3968, 35 | "23": 4096, 36 | "24": 4096, 37 | "25": 4096, 38 | "26": 4096, 39 | "27": 4096, 40 | "28": 3712, 41 | "29": 4096, 42 | "30": 3968, 43 | "31": 4096 44 | }, 45 | "pruned_mlp_width": { 46 | "0": 11008, 47 | "1": 11008, 48 | "2": 5888, 49 | "3": 11008, 50 | "4": 11008, 51 | "5": 11008, 52 | "6": 11008, 53 | "7": 9984, 54 | "8": 11008, 55 | "9": 11008, 56 | "10": 11008, 57 | "11": 11008, 58 | "12": 11008, 59 | "13": 11008, 60 | "14": 11008, 61 | "15": 11008, 62 | "16": 11008, 63 | "17": 11008, 64 | "18": 11008, 65 | "19": 11008, 66 | "20": 11008, 67 | "21": 11008, 68 | "22": 11008, 69 | "23": 1792, 70 | "24": 11008, 71 | "25": 1792, 72 | "26": 11008, 73 | "27": 4864, 74 | "28": 11008, 75 | "29": 11008, 76 | "30": 11008, 77 | "31": 11008 78 | } 79 | } -------------------------------------------------------------------------------- /MultiPruner/results/Llama-2-7B/ratio_14/eval.res.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 6738415616, 3 | "pruned_params": 5796794368, 4 | "ratio": 13.973926537926385, 5 | "ppl_wikitext2": 7.56, 6 | "5cs_acc_avg": 65.93, 7 | "arc_challenge": 43.519999999999996, 8 | "arc_easy": 68.64, 9 | "hellaswag": 72.27, 10 | "winogrande": 67.96, 11 | "piqa": 77.25999999999999 12 | } -------------------------------------------------------------------------------- /MultiPruner/results/Llama-2-7B/ratio_14/pruning_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pruned_attn_idx": [ 3 | 25, 4 | 27, 5 | 21, 6 | 23, 7 | 24, 8 | 29, 
9 | 28 10 | ], 11 | "pruned_mlp_idx": [], 12 | "pruned_attn_width": { 13 | "0": 4096, 14 | "1": 3712, 15 | "2": 3840, 16 | "3": 3840, 17 | "4": 4096, 18 | "5": 4096, 19 | "6": 4096, 20 | "7": 4096, 21 | "8": 3968, 22 | "9": 4096, 23 | "10": 4096, 24 | "11": 4096, 25 | "12": 4096, 26 | "13": 4096, 27 | "14": 4096, 28 | "15": 4096, 29 | "16": 3968, 30 | "17": 3712, 31 | "18": 3968, 32 | "19": 3968, 33 | "20": 4096, 34 | "21": 4096, 35 | "22": 3968, 36 | "23": 4096, 37 | "24": 4096, 38 | "25": 4096, 39 | "26": 4096, 40 | "27": 4096, 41 | "28": 4096, 42 | "29": 4096, 43 | "30": 4096, 44 | "31": 4096 45 | }, 46 | "pruned_mlp_width": { 47 | "0": 11008, 48 | "1": 11008, 49 | "2": 5888, 50 | "3": 11008, 51 | "4": 11008, 52 | "5": 11008, 53 | "6": 11008, 54 | "7": 9984, 55 | "8": 11008, 56 | "9": 11008, 57 | "10": 11008, 58 | "11": 1792, 59 | "12": 11008, 60 | "13": 11008, 61 | "14": 11008, 62 | "15": 11008, 63 | "16": 11008, 64 | "17": 11008, 65 | "18": 11008, 66 | "19": 11008, 67 | "20": 11008, 68 | "21": 11008, 69 | "22": 11008, 70 | "23": 1792, 71 | "24": 11008, 72 | "25": 1792, 73 | "26": 11008, 74 | "27": 8960, 75 | "28": 11008, 76 | "29": 11008, 77 | "30": 11008, 78 | "31": 11008 79 | } 80 | } -------------------------------------------------------------------------------- /MultiPruner/results/Llama-2-7B/ratio_15/eval.res.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 6738415616, 3 | "pruned_params": 5727588352, 4 | "ratio": 15.000963454967753, 5 | "ppl_wikitext2": 7.66, 6 | "5cs_acc_avg": 65.25999999999999, 7 | "arc_challenge": 42.24, 8 | "arc_easy": 68.10000000000001, 9 | "hellaswag": 71.82, 10 | "winogrande": 67.4, 11 | "piqa": 76.77000000000001 12 | } -------------------------------------------------------------------------------- /MultiPruner/results/Llama-2-7B/ratio_15/pruning_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pruned_attn_idx": [ 3 | 25, 4 | 27, 5 | 21, 6 | 23, 7 | 24, 8 | 29, 9 | 28 10 | ], 11 | "pruned_mlp_idx": [], 12 | "pruned_attn_width": { 13 | "0": 4096, 14 | "1": 3712, 15 | "2": 3456, 16 | "3": 3840, 17 | "4": 4096, 18 | "5": 4096, 19 | "6": 4096, 20 | "7": 4096, 21 | "8": 3968, 22 | "9": 4096, 23 | "10": 4096, 24 | "11": 4096, 25 | "12": 4096, 26 | "13": 4096, 27 | "14": 4096, 28 | "15": 4096, 29 | "16": 3968, 30 | "17": 3712, 31 | "18": 3968, 32 | "19": 3968, 33 | "20": 4096, 34 | "21": 4096, 35 | "22": 3968, 36 | "23": 4096, 37 | "24": 4096, 38 | "25": 4096, 39 | "26": 4096, 40 | "27": 4096, 41 | "28": 4096, 42 | "29": 4096, 43 | "30": 4096, 44 | "31": 4096 45 | }, 46 | "pruned_mlp_width": { 47 | "0": 11008, 48 | "1": 11008, 49 | "2": 5888, 50 | "3": 11008, 51 | "4": 11008, 52 | "5": 11008, 53 | "6": 11008, 54 | "7": 9984, 55 | "8": 11008, 56 | "9": 11008, 57 | "10": 11008, 58 | "11": 1792, 59 | "12": 11008, 60 | "13": 11008, 61 | "14": 11008, 62 | "15": 11008, 63 | "16": 11008, 64 | "17": 11008, 65 | "18": 11008, 66 | "19": 11008, 67 | "20": 11008, 68 | "21": 11008, 69 | "22": 11008, 70 | "23": 1792, 71 | "24": 11008, 72 | "25": 1792, 73 | "26": 11008, 74 | "27": 3840, 75 | "28": 11008, 76 | "29": 11008, 77 | "30": 11008, 78 | "31": 11008 79 | } 80 | } -------------------------------------------------------------------------------- /MultiPruner/results/Llama-2-7B/ratio_18/eval.res.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 6738415616, 3 | "pruned_params": 5524164608, 4 | 
"ratio": 18.01982954445296, 5 | "ppl_wikitext2": 8.62, 6 | "5cs_acc_avg": 64.2, 7 | "arc_challenge": 41.89, 8 | "arc_easy": 65.11, 9 | "hellaswag": 71.45, 10 | "winogrande": 66.61, 11 | "piqa": 75.94999999999999 12 | } -------------------------------------------------------------------------------- /MultiPruner/results/Llama-2-7B/ratio_18/pruning_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pruned_attn_idx": [ 3 | 25, 4 | 27, 5 | 21, 6 | 23, 7 | 24, 8 | 29, 9 | 28, 10 | 18 11 | ], 12 | "pruned_mlp_idx": [], 13 | "pruned_attn_width": { 14 | "0": 4096, 15 | "1": 3712, 16 | "2": 3456, 17 | "3": 3712, 18 | "4": 4096, 19 | "5": 4096, 20 | "6": 4096, 21 | "7": 4096, 22 | "8": 3968, 23 | "9": 4096, 24 | "10": 4096, 25 | "11": 4096, 26 | "12": 3840, 27 | "13": 4096, 28 | "14": 4096, 29 | "15": 4096, 30 | "16": 3968, 31 | "17": 3712, 32 | "18": 4096, 33 | "19": 3840, 34 | "20": 3968, 35 | "21": 4096, 36 | "22": 3968, 37 | "23": 4096, 38 | "24": 4096, 39 | "25": 4096, 40 | "26": 4096, 41 | "27": 4096, 42 | "28": 4096, 43 | "29": 4096, 44 | "30": 3968, 45 | "31": 4096 46 | }, 47 | "pruned_mlp_width": { 48 | "0": 11008, 49 | "1": 11008, 50 | "2": 5888, 51 | "3": 11008, 52 | "4": 11008, 53 | "5": 11008, 54 | "6": 11008, 55 | "7": 9984, 56 | "8": 11008, 57 | "9": 11008, 58 | "10": 11008, 59 | "11": 768, 60 | "12": 11008, 61 | "13": 11008, 62 | "14": 11008, 63 | "15": 11008, 64 | "16": 11008, 65 | "17": 3840, 66 | "18": 11008, 67 | "19": 11008, 68 | "20": 11008, 69 | "21": 11008, 70 | "22": 11008, 71 | "23": 1792, 72 | "24": 11008, 73 | "25": 1792, 74 | "26": 11008, 75 | "27": 1792, 76 | "28": 11008, 77 | "29": 11008, 78 | "30": 11008, 79 | "31": 11008 80 | } 81 | } -------------------------------------------------------------------------------- /MultiPruner/results/Llama-2-7B/ratio_22/eval.res.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 6738415616, 3 | "pruned_params": 5258874880, 4 | "ratio": 21.956804393111508, 5 | "ppl_wikitext2": 9.33, 6 | "5cs_acc_avg": 62.83, 7 | "arc_challenge": 41.13, 8 | "arc_easy": 64.77000000000001, 9 | "hellaswag": 68.94, 10 | "winogrande": 64.64, 11 | "piqa": 74.65 12 | } -------------------------------------------------------------------------------- /MultiPruner/results/Llama-2-7B/ratio_22/pruning_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pruned_attn_idx": [ 3 | 25, 4 | 27, 5 | 24, 6 | 23, 7 | 21, 8 | 29, 9 | 28, 10 | 18, 11 | 8 12 | ], 13 | "pruned_mlp_idx": [ 14 | 8 15 | ], 16 | "pruned_attn_width": { 17 | "0": 4096, 18 | "1": 3328, 19 | "2": 3328, 20 | "3": 4096, 21 | "4": 4096, 22 | "5": 3968, 23 | "6": 4096, 24 | "7": 4096, 25 | "8": 4096, 26 | "9": 4096, 27 | "10": 4096, 28 | "11": 4096, 29 | "12": 4096, 30 | "13": 4096, 31 | "14": 4096, 32 | "15": 4096, 33 | "16": 3968, 34 | "17": 3584, 35 | "18": 4096, 36 | "19": 3840, 37 | "20": 3968, 38 | "21": 4096, 39 | "22": 3968, 40 | "23": 4096, 41 | "24": 4096, 42 | "25": 4096, 43 | "26": 4096, 44 | "27": 4096, 45 | "28": 4096, 46 | "29": 4096, 47 | "30": 3968, 48 | "31": 4096 49 | }, 50 | "pruned_mlp_width": { 51 | "0": 11008, 52 | "1": 11008, 53 | "2": 5888, 54 | "3": 11008, 55 | "4": 11008, 56 | "5": 11008, 57 | "6": 11008, 58 | "7": 6912, 59 | "8": 11008, 60 | "9": 11008, 61 | "10": 11008, 62 | "11": 11008, 63 | "12": 5888, 64 | "13": 11008, 65 | "14": 11008, 66 | "15": 11008, 67 | "16": 11008, 68 | "17": 3840, 69 | "18": 11008, 
70 | "19": 11008, 71 | "20": 11008, 72 | "21": 11008, 73 | "22": 11008, 74 | "23": 1792, 75 | "24": 11008, 76 | "25": 1792, 77 | "26": 3840, 78 | "27": 1792, 79 | "28": 11008, 80 | "29": 11008, 81 | "30": 11008, 82 | "31": 11008 83 | } 84 | } -------------------------------------------------------------------------------- /MultiPruner/results/Llama-2-7B/ratio_7/eval.res.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 6738415616, 3 | "pruned_params": 6268653568, 4 | "ratio": 6.971402103553482, 5 | "ppl_wikitext2": 6.33, 6 | "5cs_acc_avg": 67.94, 7 | "arc_challenge": 44.62, 8 | "arc_easy": 73.44000000000001, 9 | "hellaswag": 74.32, 10 | "winogrande": 69.46, 11 | "piqa": 77.86 12 | } -------------------------------------------------------------------------------- /MultiPruner/results/Llama-2-7B/ratio_7/pruning_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pruned_attn_idx": [ 3 | 25, 4 | 27, 5 | 21, 6 | 23 7 | ], 8 | "pruned_mlp_idx": [], 9 | "pruned_attn_width": { 10 | "0": 4096, 11 | "1": 4096, 12 | "2": 3968, 13 | "3": 3712, 14 | "4": 4096, 15 | "5": 4096, 16 | "6": 4096, 17 | "7": 4096, 18 | "8": 4096, 19 | "9": 4096, 20 | "10": 4096, 21 | "11": 4096, 22 | "12": 4096, 23 | "13": 4096, 24 | "14": 4096, 25 | "15": 4096, 26 | "16": 4096, 27 | "17": 4096, 28 | "18": 4096, 29 | "19": 4096, 30 | "20": 4096, 31 | "21": 4096, 32 | "22": 3968, 33 | "23": 4096, 34 | "24": 4096, 35 | "25": 4096, 36 | "26": 4096, 37 | "27": 4096, 38 | "28": 4096, 39 | "29": 4096, 40 | "30": 3968, 41 | "31": 4096 42 | }, 43 | "pruned_mlp_width": { 44 | "0": 11008, 45 | "1": 11008, 46 | "2": 8960, 47 | "3": 11008, 48 | "4": 11008, 49 | "5": 11008, 50 | "6": 11008, 51 | "7": 11008, 52 | "8": 11008, 53 | "9": 11008, 54 | "10": 11008, 55 | "11": 11008, 56 | "12": 11008, 57 | "13": 11008, 58 | "14": 11008, 59 | "15": 11008, 60 | "16": 11008, 61 | "17": 11008, 62 | "18": 11008, 63 | "19": 11008, 64 | "20": 11008, 65 | "21": 11008, 66 | "22": 11008, 67 | "23": 6912, 68 | "24": 11008, 69 | "25": 11008, 70 | "26": 11008, 71 | "27": 1792, 72 | "28": 11008, 73 | "29": 11008, 74 | "30": 11008, 75 | "31": 11008 76 | } 77 | } -------------------------------------------------------------------------------- /MultiPruner/results/Llama-3.1-8B/ratio_10/eval.res.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 8030261248, 3 | "pruned_model_params": 7224954880, 4 | "ppl_wikitext2": 8.93, 5 | "5cs_acc_avg": 69.27, 6 | "arc_challenge": 47.78, 7 | "arc_easy": 75.13, 8 | "hellaswag": 73.72999999999999, 9 | "winogrande": 71.27, 10 | "piqa": 78.45 11 | } -------------------------------------------------------------------------------- /MultiPruner/results/Llama-3.1-8B/ratio_10/pruning_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pruned_attn_idx": [ 3 | 25, 4 | 26, 5 | 20, 6 | 24, 7 | 23, 8 | 21 9 | ], 10 | "pruned_mlp_idx": [], 11 | "pruned_attn_width": { 12 | "0": 4096, 13 | "1": 4096, 14 | "2": 4096, 15 | "3": 4096, 16 | "4": 4096, 17 | "5": 4096, 18 | "6": 4096, 19 | "7": 4096, 20 | "8": 4096, 21 | "9": 4096, 22 | "10": 4096, 23 | "11": 4096, 24 | "12": 4096, 25 | "13": 4096, 26 | "14": 4096, 27 | "15": 4096, 28 | "16": 4096, 29 | "17": 4096, 30 | "18": 4096, 31 | "19": 4096, 32 | "20": 4096, 33 | "21": 4096, 34 | "22": 4096, 35 | "23": 4096, 36 | "24": 4096, 37 | "25": 4096, 38 | "26": 4096, 39 | 
"27": 4096, 40 | "28": 4096, 41 | "29": 4096, 42 | "30": 4096, 43 | "31": 4096 44 | }, 45 | "pruned_mlp_width": { 46 | "0": 14336, 47 | "1": 14336, 48 | "2": 14336, 49 | "3": 14336, 50 | "4": 14336, 51 | "5": 14336, 52 | "6": 14336, 53 | "7": 9216, 54 | "8": 14336, 55 | "9": 14336, 56 | "10": 14336, 57 | "11": 9216, 58 | "12": 14336, 59 | "13": 14336, 60 | "14": 14336, 61 | "15": 12288, 62 | "16": 14336, 63 | "17": 14336, 64 | "18": 3072, 65 | "19": 14336, 66 | "20": 14336, 67 | "21": 14336, 68 | "22": 14336, 69 | "23": 4096, 70 | "24": 3072, 71 | "25": 14336, 72 | "26": 14336, 73 | "27": 14336, 74 | "28": 14336, 75 | "29": 14336, 76 | "30": 14336, 77 | "31": 14336 78 | } 79 | } -------------------------------------------------------------------------------- /MultiPruner/results/Llama-3.1-8B/ratio_17/eval.res.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 8030261248, 3 | "pruned_model_params": 6654529536, 4 | "ppl_wikitext2": 11.64, 5 | "5cs_acc_avg": 65.81, 6 | "arc_challenge": 43.519999999999996, 7 | "arc_easy": 68.52000000000001, 8 | "hellaswag": 69.46, 9 | "winogrande": 71.27, 10 | "piqa": 76.28 11 | } -------------------------------------------------------------------------------- /MultiPruner/results/Llama-3.1-8B/ratio_17/pruning_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pruned_attn_idx": [ 3 | 25, 4 | 26, 5 | 20, 6 | 24 7 | ], 8 | "pruned_mlp_idx": [], 9 | "pruned_attn_width": { 10 | "0": 4096, 11 | "1": 4096, 12 | "2": 4096, 13 | "3": 4096, 14 | "4": 4096, 15 | "5": 4096, 16 | "6": 4096, 17 | "7": 4096, 18 | "8": 4096, 19 | "9": 4096, 20 | "10": 4096, 21 | "11": 4096, 22 | "12": 4096, 23 | "13": 4096, 24 | "14": 4096, 25 | "15": 4096, 26 | "16": 4096, 27 | "17": 4096, 28 | "18": 4096, 29 | "19": 4096, 30 | "20": 4096, 31 | "21": 4096, 32 | "22": 4096, 33 | "23": 4096, 34 | "24": 4096, 35 | "25": 4096, 36 | "26": 4096, 37 | "27": 4096, 38 | "28": 4096, 39 | "29": 4096, 40 | "30": 4096, 41 | "31": 4096 42 | }, 43 | "pruned_mlp_width": { 44 | "0": 14336, 45 | "1": 14336, 46 | "2": 14336, 47 | "3": 14336, 48 | "4": 14336, 49 | "5": 14336, 50 | "6": 14336, 51 | "7": 5120, 52 | "8": 14336, 53 | "9": 14336, 54 | "10": 14336, 55 | "11": 10240, 56 | "12": 14336, 57 | "13": 8192, 58 | "14": 14336, 59 | "15": 3072, 60 | "16": 13312, 61 | "17": 14336, 62 | "18": 6144, 63 | "19": 3072, 64 | "20": 14336, 65 | "21": 14336, 66 | "22": 14336, 67 | "23": 3072, 68 | "24": 3072, 69 | "25": 2048, 70 | "26": 2048, 71 | "27": 14336, 72 | "28": 14336, 73 | "29": 14336, 74 | "30": 14336, 75 | "31": 14336 76 | } 77 | } -------------------------------------------------------------------------------- /MultiPruner/results/Llama-3.1-8B/ratio_20/eval.res.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 8030261248, 3 | "pruned_model_params": 6423842816, 4 | "ppl_wikitext2": 13.86, 5 | "5cs_acc_avg": 63.07000000000001, 6 | "arc_challenge": 41.72, 7 | "arc_easy": 64.98, 8 | "hellaswag": 65.38000000000001, 9 | "winogrande": 68.97999999999999, 10 | "piqa": 74.27 11 | } -------------------------------------------------------------------------------- /MultiPruner/results/Llama-3.1-8B/ratio_20/pruning_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pruned_attn_idx": [ 3 | 25, 4 | 26, 5 | 20, 6 | 24, 7 | 23, 8 | 21, 9 | 27, 10 | 19 11 | ], 12 | "pruned_mlp_idx": [ 13 | 25 14 
| ], 15 | "pruned_attn_width": { 16 | "0": 4096, 17 | "1": 4096, 18 | "2": 4096, 19 | "3": 4096, 20 | "4": 4096, 21 | "5": 4096, 22 | "6": 4096, 23 | "7": 4096, 24 | "8": 4096, 25 | "9": 4096, 26 | "10": 4096, 27 | "11": 4096, 28 | "12": 4096, 29 | "13": 4096, 30 | "14": 4096, 31 | "15": 4096, 32 | "16": 4096, 33 | "17": 4096, 34 | "18": 4096, 35 | "19": 4096, 36 | "20": 4096, 37 | "21": 4096, 38 | "22": 4096, 39 | "23": 4096, 40 | "24": 4096, 41 | "25": 4096, 42 | "26": 4096, 43 | "27": 4096, 44 | "28": 4096, 45 | "29": 4096, 46 | "30": 4096, 47 | "31": 4096 48 | }, 49 | "pruned_mlp_width": { 50 | "0": 14336, 51 | "1": 14336, 52 | "2": 14336, 53 | "3": 14336, 54 | "4": 14336, 55 | "5": 14336, 56 | "6": 14336, 57 | "7": 5120, 58 | "8": 14336, 59 | "9": 14336, 60 | "10": 12288, 61 | "11": 1024, 62 | "12": 14336, 63 | "13": 14336, 64 | "14": 14336, 65 | "15": 4096, 66 | "16": 14336, 67 | "17": 14336, 68 | "18": 1024, 69 | "19": 3072, 70 | "20": 8192, 71 | "21": 14336, 72 | "22": 14336, 73 | "23": 3072, 74 | "24": 2048, 75 | "25": 14336, 76 | "26": 14336, 77 | "27": 14336, 78 | "28": 14336, 79 | "29": 14336, 80 | "30": 14336, 81 | "31": 14336 82 | } 83 | } -------------------------------------------------------------------------------- /MultiPruner/results/Llama-3.2-3B/ratio_9/eval.res.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 3212749824, 3 | "pruned_model_params": 2921769984, 4 | "ppl_wikitext2": 10.46, 5 | "5cs_acc_avg": 64.03999999999999, 6 | "arc_challenge": 43.09, 7 | "arc_easy": 66.96, 8 | "hellaswag": 68.5, 9 | "winogrande": 66.85, 10 | "piqa": 74.81 11 | } -------------------------------------------------------------------------------- /MultiPruner/results/Llama-3.2-3B/ratio_9/pruning_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pruned_attn_idx": [ 3 | 23, 4 | 24 5 | ], 6 | "pruned_mlp_idx": [], 7 | "pruned_attn_width": { 8 | "0": 3072, 9 | "1": 3072, 10 | "2": 3072, 11 | "3": 3072, 12 | "4": 3072, 13 | "5": 3072, 14 | "6": 3072, 15 | "7": 3072, 16 | "8": 3072, 17 | "9": 3072, 18 | "10": 3072, 19 | "11": 3072, 20 | "12": 3072, 21 | "13": 3072, 22 | "14": 3072, 23 | "15": 3072, 24 | "16": 3072, 25 | "17": 3072, 26 | "18": 3072, 27 | "19": 3072, 28 | "20": 3072, 29 | "21": 3072, 30 | "22": 3072, 31 | "23": 3072, 32 | "24": 3072, 33 | "25": 3072, 34 | "26": 3072, 35 | "27": 3072 36 | }, 37 | "pruned_mlp_width": { 38 | "0": 8192, 39 | "1": 8192, 40 | "2": 8192, 41 | "3": 4352, 42 | "4": 8192, 43 | "5": 8192, 44 | "6": 8192, 45 | "7": 8192, 46 | "8": 8192, 47 | "9": 8192, 48 | "10": 512, 49 | "11": 8192, 50 | "12": 8192, 51 | "13": 8192, 52 | "14": 8192, 53 | "15": 7424, 54 | "16": 8192, 55 | "17": 8192, 56 | "18": 8192, 57 | "19": 8192, 58 | "20": 8192, 59 | "21": 2816, 60 | "22": 2048, 61 | "23": 8192, 62 | "24": 8192, 63 | "25": 5888, 64 | "26": 8192, 65 | "27": 8192 66 | } 67 | } -------------------------------------------------------------------------------- /MultiPruner/results/Meta-Llama-3-8B/ratio_10/eval.res.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 8030261248, 3 | "pruned_model_params": 7220760576, 4 | "ppl_wikitext2": 8.19, 5 | "5cs_acc_avg": 69.03, 6 | "arc_challenge": 48.120000000000005, 7 | "arc_easy": 71.3, 8 | "hellaswag": 75.08, 9 | "winogrande": 71.67, 10 | "piqa": 79.0 11 | } -------------------------------------------------------------------------------- 
/MultiPruner/results/Meta-Llama-3-8B/ratio_10/pruning_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pruned_attn_idx": [ 3 | 26, 4 | 24, 5 | 23, 6 | 20 7 | ], 8 | "pruned_mlp_idx": [], 9 | "pruned_attn_width": { 10 | "0": 4096, 11 | "1": 4096, 12 | "2": 4096, 13 | "3": 4096, 14 | "4": 4096, 15 | "5": 4096, 16 | "6": 4096, 17 | "7": 4096, 18 | "8": 4096, 19 | "9": 4096, 20 | "10": 4096, 21 | "11": 4096, 22 | "12": 4096, 23 | "13": 4096, 24 | "14": 4096, 25 | "15": 4096, 26 | "16": 4096, 27 | "17": 4096, 28 | "18": 4096, 29 | "19": 4096, 30 | "20": 4096, 31 | "21": 4096, 32 | "22": 4096, 33 | "23": 4096, 34 | "24": 4096, 35 | "25": 4096, 36 | "26": 4096, 37 | "27": 4096, 38 | "28": 4096, 39 | "29": 4096, 40 | "30": 4096, 41 | "31": 4096 42 | }, 43 | "pruned_mlp_width": { 44 | "0": 14336, 45 | "1": 14336, 46 | "2": 14336, 47 | "3": 14336, 48 | "4": 14336, 49 | "5": 14336, 50 | "6": 14336, 51 | "7": 8192, 52 | "8": 14336, 53 | "9": 14336, 54 | "10": 14336, 55 | "11": 14336, 56 | "12": 14336, 57 | "13": 1024, 58 | "14": 14336, 59 | "15": 14336, 60 | "16": 14336, 61 | "17": 14336, 62 | "18": 12288, 63 | "19": 14336, 64 | "20": 14336, 65 | "21": 14336, 66 | "22": 4096, 67 | "23": 5120, 68 | "24": 14336, 69 | "25": 14336, 70 | "26": 14336, 71 | "27": 14336, 72 | "28": 3072, 73 | "29": 14336, 74 | "30": 14336, 75 | "31": 14336 76 | } 77 | } -------------------------------------------------------------------------------- /MultiPruner/results/Meta-Llama-3-8B/ratio_17/eval.res.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 8030261248, 3 | "pruned_model_params": 6654529536, 4 | "ppl_wikitext2": 11.11, 5 | "5cs_acc_avg": 64.4, 6 | "arc_challenge": 42.58, 7 | "arc_easy": 64.64999999999999, 8 | "hellaswag": 68.97, 9 | "winogrande": 69.53, 10 | "piqa": 76.28 11 | } -------------------------------------------------------------------------------- /MultiPruner/results/Meta-Llama-3-8B/ratio_17/pruning_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pruned_attn_idx": [ 3 | 26, 4 | 24, 5 | 23, 6 | 20 7 | ], 8 | "pruned_mlp_idx": [], 9 | "pruned_attn_width": { 10 | "0": 4096, 11 | "1": 4096, 12 | "2": 4096, 13 | "3": 4096, 14 | "4": 4096, 15 | "5": 4096, 16 | "6": 4096, 17 | "7": 4096, 18 | "8": 4096, 19 | "9": 4096, 20 | "10": 4096, 21 | "11": 4096, 22 | "12": 4096, 23 | "13": 4096, 24 | "14": 4096, 25 | "15": 4096, 26 | "16": 4096, 27 | "17": 4096, 28 | "18": 4096, 29 | "19": 4096, 30 | "20": 4096, 31 | "21": 4096, 32 | "22": 4096, 33 | "23": 4096, 34 | "24": 4096, 35 | "25": 4096, 36 | "26": 4096, 37 | "27": 4096, 38 | "28": 4096, 39 | "29": 4096, 40 | "30": 4096, 41 | "31": 4096 42 | }, 43 | "pruned_mlp_width": { 44 | "0": 14336, 45 | "1": 14336, 46 | "2": 14336, 47 | "3": 14336, 48 | "4": 14336, 49 | "5": 14336, 50 | "6": 14336, 51 | "7": 3072, 52 | "8": 14336, 53 | "9": 14336, 54 | "10": 14336, 55 | "11": 10240, 56 | "12": 14336, 57 | "13": 1024, 58 | "14": 14336, 59 | "15": 5120, 60 | "16": 13312, 61 | "17": 14336, 62 | "18": 3072, 63 | "19": 14336, 64 | "20": 14336, 65 | "21": 14336, 66 | "22": 3072, 67 | "23": 3072, 68 | "24": 14336, 69 | "25": 11264, 70 | "26": 3072, 71 | "27": 14336, 72 | "28": 3072, 73 | "29": 14336, 74 | "30": 14336, 75 | "31": 14336 76 | } 77 | } -------------------------------------------------------------------------------- /MultiPruner/results/Meta-Llama-3-8B/ratio_20/eval.res.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "total_params": 8030261248, 3 | "pruned_model_params": 6423842816, 4 | "ppl_wikitext2": 16.01, 5 | "5cs_acc_avg": 63.019999999999996, 6 | "arc_challenge": 41.21, 7 | "arc_easy": 63.09, 8 | "hellaswag": 67.42, 9 | "winogrande": 69.61, 10 | "piqa": 73.78 11 | } -------------------------------------------------------------------------------- /MultiPruner/results/Meta-Llama-3-8B/ratio_20/pruning_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pruned_attn_idx": [ 3 | 26, 4 | 24, 5 | 23, 6 | 20, 7 | 21, 8 | 27, 9 | 22, 10 | 29 11 | ], 12 | "pruned_mlp_idx": [], 13 | "pruned_attn_width": { 14 | "0": 4096, 15 | "1": 4096, 16 | "2": 4096, 17 | "3": 4096, 18 | "4": 4096, 19 | "5": 4096, 20 | "6": 4096, 21 | "7": 4096, 22 | "8": 4096, 23 | "9": 4096, 24 | "10": 4096, 25 | "11": 4096, 26 | "12": 4096, 27 | "13": 4096, 28 | "14": 4096, 29 | "15": 4096, 30 | "16": 4096, 31 | "17": 4096, 32 | "18": 4096, 33 | "19": 4096, 34 | "20": 4096, 35 | "21": 4096, 36 | "22": 4096, 37 | "23": 4096, 38 | "24": 4096, 39 | "25": 4096, 40 | "26": 4096, 41 | "27": 4096, 42 | "28": 4096, 43 | "29": 4096, 44 | "30": 4096, 45 | "31": 4096 46 | }, 47 | "pruned_mlp_width": { 48 | "0": 14336, 49 | "1": 14336, 50 | "2": 14336, 51 | "3": 14336, 52 | "4": 14336, 53 | "5": 13312, 54 | "6": 14336, 55 | "7": 3072, 56 | "8": 5120, 57 | "9": 14336, 58 | "10": 14336, 59 | "11": 14336, 60 | "12": 14336, 61 | "13": 1024, 62 | "14": 14336, 63 | "15": 14336, 64 | "16": 14336, 65 | "17": 14336, 66 | "18": 3072, 67 | "19": 1024, 68 | "20": 5120, 69 | "21": 14336, 70 | "22": 14336, 71 | "23": 3072, 72 | "24": 14336, 73 | "25": 3072, 74 | "26": 2048, 75 | "27": 14336, 76 | "28": 14336, 77 | "29": 14336, 78 | "30": 14336, 79 | "31": 14336 80 | } 81 | } -------------------------------------------------------------------------------- /MultiPruner/results/Qwen1.5-14B/ratio_24/eval.res.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 14167290880, 3 | "pruned_params": 10836096000, 4 | "ratio": 23.513280755057096, 5 | "ppl_wikitext2": 12.94, 6 | "5cs_acc_avg": 62.41, 7 | "arc_challenge": 41.21, 8 | "arc_easy": 69.19, 9 | "hellaswag": 63.260000000000005, 10 | "winogrande": 62.980000000000004, 11 | "piqa": 75.41 12 | } -------------------------------------------------------------------------------- /MultiPruner/results/Qwen1.5-14B/ratio_24/pruning_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pruned_attn_idx": [ 3 | 13, 4 | 26, 5 | 35, 6 | 32, 7 | 18, 8 | 8, 9 | 31, 10 | 36, 11 | 4 12 | ], 13 | "pruned_mlp_idx":[ 14 | 4, 15 | 7, 16 | 18 17 | ], 18 | "pruned_attn_width": { 19 | "0": 4992, 20 | "1": 4480, 21 | "2": 4736, 22 | "3": 4096, 23 | "4": 5120, 24 | "5": 4864, 25 | "6": 4992, 26 | "7": 5120, 27 | "8": 5120, 28 | "9": 5120, 29 | "10": 5120, 30 | "11": 5120, 31 | "12": 5120, 32 | "13": 5120, 33 | "14": 5120, 34 | "15": 5120, 35 | "16": 4992, 36 | "17": 5120, 37 | "18": 5120, 38 | "19": 5120, 39 | "20": 4992, 40 | "21": 5120, 41 | "22": 5120, 42 | "23": 5120, 43 | "24": 4992, 44 | "25": 5120, 45 | "26": 5120, 46 | "27": 5120, 47 | "28": 5120, 48 | "29": 4608, 49 | "30": 4608, 50 | "31": 5120, 51 | "32": 5120, 52 | "33": 4224, 53 | "34": 4096, 54 | "35": 5120, 55 | "36": 5120, 56 | "37": 5120, 57 | "38": 5120, 58 | "39": 5120 59 | }, 60 | "pruned_mlp_width": { 61 | "0": 13696, 62 | 
"1": 13696, 63 | "2": 13696, 64 | "3": 1408, 65 | "4": 13696, 66 | "5": 11648, 67 | "6": 13696, 68 | "7": 13696, 69 | "8": 3456, 70 | "9": 7552, 71 | "10": 13696, 72 | "11": 13696, 73 | "12": 13696, 74 | "13": 13696, 75 | "14": 13696, 76 | "15": 7552, 77 | "16": 13696, 78 | "17": 7552, 79 | "18": 13696, 80 | "19": 13696, 81 | "20": 11648, 82 | "21": 13696, 83 | "22": 13696, 84 | "23": 9600, 85 | "24": 13696, 86 | "25": 13696, 87 | "26": 13696, 88 | "27": 13696, 89 | "28": 3456, 90 | "29": 13696, 91 | "30": 1408, 92 | "31": 13696, 93 | "32": 1408, 94 | "33": 1408, 95 | "34": 13696, 96 | "35": 3456, 97 | "36": 13696, 98 | "37": 13696, 99 | "38": 13696, 100 | "39": 13696 101 | } 102 | } -------------------------------------------------------------------------------- /MultiPruner/results/Qwen1.5-7B/ratio_22/eval.res.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 7721324544, 3 | "pruned_params": 6037311488, 4 | "ratio": 21.809898630780832, 5 | "ppl_wikitext2": 18.22, 6 | "5cs_acc_avg": 57.379999999999995, 7 | "arc_challenge": 35.92, 8 | "arc_easy": 59.01, 9 | "hellaswag": 60.6, 10 | "winogrande": 59.589999999999996, 11 | "piqa": 71.76 12 | } -------------------------------------------------------------------------------- /MultiPruner/results/Qwen1.5-7B/ratio_22/pruning_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pruned_attn_idx": [ 3 | 19, 4 | 6, 5 | 27, 6 | 24, 7 | 26, 8 | 5, 9 | 22, 10 | 25 11 | ], 12 | "pruned_mlp_idx": [ 13 | 6, 14 | 5 15 | ], 16 | "pruned_attn_width": { 17 | "0": 3968, 18 | "1": 4096, 19 | "2": 3968, 20 | "3": 3968, 21 | "4": 4096, 22 | "5": 4096, 23 | "6": 4096, 24 | "7": 3968, 25 | "8": 4096, 26 | "9": 3712, 27 | "10": 3968, 28 | "11": 4096, 29 | "12": 4096, 30 | "13": 4096, 31 | "14": 4096, 32 | "15": 3968, 33 | "16": 4096, 34 | "17": 4096, 35 | "18": 3584, 36 | "19": 4096, 37 | "20": 4096, 38 | "21": 3968, 39 | "22": 4096, 40 | "23": 3968, 41 | "24": 4096, 42 | "25": 4096, 43 | "26": 4096, 44 | "27": 4096, 45 | "28": 4096, 46 | "29": 2560, 47 | "30": 4096, 48 | "31": 3968 49 | }, 50 | "pruned_mlp_width": { 51 | "32": 11008, 52 | "33": 11008, 53 | "34": 11008, 54 | "35": 1792, 55 | "36": 11008, 56 | "37": 11008, 57 | "38": 11008, 58 | "39": 11008, 59 | "40": 11008, 60 | "41": 11008, 61 | "42": 4864, 62 | "43": 11008, 63 | "44": 11008, 64 | "45": 2816, 65 | "46": 11008, 66 | "47": 11008, 67 | "48": 11008, 68 | "49": 11008, 69 | "50": 768, 70 | "51": 11008, 71 | "52": 11008, 72 | "53": 1792, 73 | "54": 11008, 74 | "55": 11008, 75 | "56": 11008, 76 | "57": 7936, 77 | "58": 768, 78 | "59": 768, 79 | "60": 11008, 80 | "61": 11008, 81 | "62": 11008, 82 | "63": 11008 83 | } 84 | } -------------------------------------------------------------------------------- /MultiPruner/results/Qwen2.5-7B/ratio_10/eval.res.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 7615616512, 3 | "pruned_model_params": 6852230144, 4 | "ppl_wikitext2": 9.15, 5 | "5cs_acc_avg": 69.71000000000001, 6 | "arc_challenge": 54.269999999999996, 7 | "arc_easy": 80.81, 8 | "hellaswag": 73.31, 9 | "winogrande": 62.43, 10 | "piqa": 77.75 11 | } -------------------------------------------------------------------------------- /MultiPruner/results/Qwen2.5-7B/ratio_10/pruning_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pruned_attn_idx": [ 3 | 13, 4 | 5, 5 | 24, 6 | 11, 7 
| 2 8 | ], 9 | "pruned_mlp_idx": [ 10 | 12, 11 | 13 12 | ], 13 | "pruned_attn_width": { 14 | "0": 3584, 15 | "1": 3584, 16 | "2": 3584, 17 | "3": 3584, 18 | "4": 3584, 19 | "5": 3584, 20 | "6": 3584, 21 | "7": 3584, 22 | "8": 3584, 23 | "9": 3584, 24 | "10": 3584, 25 | "11": 3584, 26 | "12": 3584, 27 | "13": 3584, 28 | "14": 3584, 29 | "15": 3584, 30 | "16": 3584, 31 | "17": 3584, 32 | "18": 3584, 33 | "19": 3584, 34 | "20": 3584, 35 | "21": 3584, 36 | "22": 3584, 37 | "23": 3584, 38 | "24": 3584, 39 | "25": 3584, 40 | "26": 3584, 41 | "27": 3584 42 | }, 43 | "pruned_mlp_width": { 44 | "0": 13824, 45 | "1": 11776, 46 | "2": 14848, 47 | "3": 18944, 48 | "4": 18944, 49 | "5": 18944, 50 | "6": 18944, 51 | "7": 18944, 52 | "8": 18944, 53 | "9": 18944, 54 | "10": 18944, 55 | "11": 18944, 56 | "12": 18944, 57 | "13": 18944, 58 | "14": 18944, 59 | "15": 18944, 60 | "16": 16896, 61 | "17": 18944, 62 | "18": 18944, 63 | "19": 17920, 64 | "20": 18944, 65 | "21": 18944, 66 | "22": 18944, 67 | "23": 18944, 68 | "24": 18944, 69 | "25": 18944, 70 | "26": 18944, 71 | "27": 18944 72 | } 73 | } -------------------------------------------------------------------------------- /MultiPruner/results/Qwen2.5-7B/ratio_20/eval.res.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_params": 7615616512, 3 | "pruned_model_params": 6090697216, 4 | "ppl_wikitext2": 13.37, 5 | "5cs_acc_avg": 62.82, 6 | "arc_challenge": 43.86, 7 | "arc_easy": 70.54, 8 | "hellaswag": 66.73, 9 | "winogrande": 58.8, 10 | "piqa": 74.16 11 | } -------------------------------------------------------------------------------- /MultiPruner/results/Qwen2.5-7B/ratio_20/pruning_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pruned_attn_idx": [ 3 | 13, 4 | 5, 5 | 24, 6 | 11, 7 | 2, 8 | 14 9 | ], 10 | "pruned_mlp_idx": [ 11 | 12, 12 | 13, 13 | 5 14 | ], 15 | "pruned_attn_width": { 16 | "0": 3584, 17 | "1": 3584, 18 | "2": 3584, 19 | "3": 3584, 20 | "4": 3584, 21 | "5": 3584, 22 | "6": 3584, 23 | "7": 3584, 24 | "8": 3584, 25 | "9": 3584, 26 | "10": 3584, 27 | "11": 3584, 28 | "12": 3584, 29 | "13": 3584, 30 | "14": 3584, 31 | "15": 3584, 32 | "16": 3584, 33 | "17": 3584, 34 | "18": 3584, 35 | "19": 3584, 36 | "20": 3584, 37 | "21": 3584, 38 | "22": 3584, 39 | "23": 3584, 40 | "24": 3584, 41 | "25": 3584, 42 | "26": 3584, 43 | "27": 3584 44 | }, 45 | "pruned_mlp_width": { 46 | "0": 14848, 47 | "1": 10752, 48 | "2": 14848, 49 | "3": 17920, 50 | "4": 18944, 51 | "5": 18944, 52 | "6": 15872, 53 | "7": 18944, 54 | "8": 18944, 55 | "9": 18944, 56 | "10": 18944, 57 | "11": 18944, 58 | "12": 18944, 59 | "13": 18944, 60 | "14": 9728, 61 | "15": 7680, 62 | "16": 9728, 63 | "17": 1536, 64 | "18": 18944, 65 | "19": 18944, 66 | "20": 18944, 67 | "21": 18944, 68 | "22": 17920, 69 | "23": 18944, 70 | "24": 18944, 71 | "25": 18944, 72 | "26": 18944, 73 | "27": 18944 74 | } 75 | } -------------------------------------------------------------------------------- /SQFT/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -x 4 | 5 | pip install 'numpy<2.0.0' setuptools==69.5.1 wheel 6 | pip install transformers==4.47.0 7 | 8 | # peft 9 | SQFT_PATH=$PWD 10 | mkdir third_party && cd third_party 11 | git clone https://github.com/huggingface/peft.git 12 | cd peft && git checkout v0.10.0 && git apply --ignore-space-change --ignore-whitespace ${SQFT_PATH}/patches/peft-v0.10.0.patch && pip 
install -e . && cd .. 13 | 14 | pip install datasets accelerate sentencepiece protobuf 15 | pip install optimum --no-deps 16 | pip install git+https://github.com/AutoGPTQ/AutoGPTQ@866b4c8 17 | 18 | # lm-eval-harness 19 | pip install lm-eval==0.4.2 20 | -------------------------------------------------------------------------------- /SQFT/legacy/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -x 4 | 5 | pip install 'numpy<2.0.0' setuptools==69.5.1 wheel 6 | 7 | SQFT_PATH=$PWD 8 | mkdir third_party && cd third_party 9 | 10 | # transformers 11 | git clone https://github.com/huggingface/transformers.git 12 | cd transformers && git checkout v4.44.2 && git apply --ignore-space-change --ignore-whitespace ${SQFT_PATH}/patches/transformers-v4.44.2.patch && pip install -e . && cd .. 13 | 14 | # peft 15 | git clone https://github.com/huggingface/peft.git 16 | cd peft && git checkout v0.10.0 && git apply --ignore-space-change --ignore-whitespace ${SQFT_PATH}/patches/peft-v0.10.0.patch && pip install -e . && cd .. 17 | 18 | pip install datasets accelerate sentencepiece protobuf 19 | pip install optimum==1.18.0 --no-deps 20 | pip install git+https://github.com/AutoGPTQ/AutoGPTQ@866b4c8 21 | 22 | # nncf 23 | git clone https://github.com/openvinotoolkit/nncf.git 24 | cd nncf && git checkout f143e1c && git apply --ignore-space-change --ignore-whitespace ${SQFT_PATH}/patches/nncf-f143e1c.patch && pip install -e . && cd .. 25 | 26 | # lm-eval-harness 27 | pip install lm-eval==0.4.2 28 | -------------------------------------------------------------------------------- /SQFT/legacy/install_inference.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -x 4 | 5 | pip install 'numpy<2.0.0' setuptools==69.5.1 wheel 6 | 7 | # transformers 8 | pip install transformers==4.44.2 9 | pip install datasets accelerate sentencepiece protobuf 10 | pip install optimum==1.18.0 --no-deps 11 | pip install git+https://github.com/AutoGPTQ/AutoGPTQ@866b4c8 12 | 13 | # peft 14 | SQFT_PATH=$PWD 15 | mkdir third_party_inference && cd third_party_inference 16 | git clone https://github.com/huggingface/peft.git 17 | cd peft && git checkout v0.10.0 && git apply --ignore-space-change --ignore-whitespace ${SQFT_PATH}/patches/peft-v0.10.0.patch && pip install -e . && cd .. 
18 | 19 | # lm-eval-harness (for evaluation) 20 | pip install lm-eval==0.4.2 21 | -------------------------------------------------------------------------------- /SQFT/legacy/opea/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # Use the same Python version as Ray 5 | FROM python:3.10.14 6 | 7 | ARG HF_TOKEN 8 | 9 | ENV HF_TOKEN=$HF_TOKEN 10 | 11 | RUN useradd -m -s /bin/bash user && \ 12 | mkdir -p /home/user && \ 13 | chown -R user /home/user/ 14 | 15 | COPY comps /home/user/comps 16 | 17 | RUN chown -R user /home/user/comps/finetuning 18 | 19 | USER user 20 | 21 | ENV PATH=$PATH:/home/user/.local/bin 22 | 23 | RUN python -m pip install --no-cache-dir --upgrade pip && \ 24 | python -m pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu && \ 25 | python -m pip install --no-cache-dir intel-extension-for-pytorch && \ 26 | python -m pip install --no-cache-dir oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ && \ 27 | python -m pip install --no-cache-dir -r /home/user/comps/finetuning/requirements.txt 28 | 29 | WORKDIR /home/user/comps/finetuning 30 | 31 | RUN git clone https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning.git && \ 32 | cp -r Hardware-Aware-Automated-Machine-Learning/SQFT/patches /home/user/comps/finetuning/patches && \ 33 | rm -rf Hardware-Aware-Automated-Machine-Learning && \ 34 | mkdir third_party 35 | 36 | # Clone and set up transformers 37 | RUN git clone https://github.com/huggingface/transformers.git third_party/transformers && \ 38 | cd third_party/transformers && \ 39 | git checkout v4.44.2 && \ 40 | git apply --ignore-space-change --ignore-whitespace /home/user/comps/finetuning/patches/transformers-v4.44.2.patch && \ 41 | pip install -e . 42 | 43 | # Clone and set up peft 44 | RUN git clone https://github.com/huggingface/peft.git third_party/peft && \ 45 | cd third_party/peft && \ 46 | git checkout v0.10.0 && \ 47 | git apply --ignore-space-change --ignore-whitespace /home/user/comps/finetuning/patches/peft-v0.10.0.patch && \ 48 | pip install -e . 49 | 50 | # Clone and set up nncf 51 | RUN git clone https://github.com/openvinotoolkit/nncf.git third_party/nncf && \ 52 | cd third_party/nncf && \ 53 | git checkout f143e1c && \ 54 | git apply --ignore-space-change --ignore-whitespace /home/user/comps/finetuning/patches/nncf-f143e1c.patch && \ 55 | pip install -e . 56 | 57 | ENV PYTHONPATH=$PYTHONPATH:/home/user 58 | 59 | RUN echo PKGPATH=$(python3 -c "import pkg_resources; print(pkg_resources.get_distribution('oneccl-bind-pt').location)") >> run.sh && \ 60 | echo 'export LD_LIBRARY_PATH=$PKGPATH/oneccl_bindings_for_pytorch/opt/mpi/lib/:$LD_LIBRARY_PATH' >> run.sh && \ 61 | echo 'source $PKGPATH/oneccl_bindings_for_pytorch/env/setvars.sh' >> run.sh && \ 62 | echo ray start --head --dashboard-host=0.0.0.0 >> run.sh && \ 63 | echo export RAY_ADDRESS=http://localhost:8265 >> run.sh && \ 64 | echo python finetuning_service.py >> run.sh 65 | 66 | CMD bash run.sh 67 | -------------------------------------------------------------------------------- /SQFT/legacy/opea/dataset/preprocess_arc.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datasets import load_dataset 3 | 4 | def process_arc_document(document): 5 | """Convert an ARC dataset document into an instruction-tuning entry.
6 | 7 | Args: 8 | document (dict): A dictionary containing the ARC dataset document. 9 | 10 | Returns: 11 | dict: An instruction-tuning entry with `instruction`, `input`, and `output` fields. 12 | """ 13 | 14 | instruction = document["question"] 15 | 16 | def _process_output(document): 17 | """Normalize the answer key and extract the choice texts and gold index.""" 18 | num_to_letter = {"1": "A", "2": "B", "3": "C", "4": "D", "5": "E"} 19 | document["answerKey"] = num_to_letter.get(document["answerKey"], document["answerKey"]) 20 | processed_output = { 21 | "choices": document["choices"]["text"], 22 | "gold": ["A", "B", "C", "D", "E"].index(document["answerKey"]), 23 | } 24 | return processed_output 25 | 26 | processed_output = _process_output(document) 27 | answer = processed_output["choices"][processed_output["gold"]] 28 | new_entry = { 29 | "instruction": instruction, 30 | "input": "", 31 | "output": answer 32 | } 33 | return new_entry 34 | 35 | 36 | dataset = load_dataset("ai2_arc", "ARC-Easy", split="train") 37 | new_data = [process_arc_document(doc) for doc in dataset] 38 | 39 | print(len(new_data)) 40 | 41 | with open("arce_train_instruct.json", "w", encoding="utf-8") as f: 42 | json.dump(new_data, f, ensure_ascii=False, indent=4) 43 | -------------------------------------------------------------------------------- /SQFT/legacy/patches/wanda-8e8fc87.patch: -------------------------------------------------------------------------------- 1 | diff --git a/lib/data.py b/lib/data.py 2 | index b6842c4..ce2b55c 100644 3 | --- a/lib/data.py 4 | +++ b/lib/data.py 5 | @@ -40,8 +40,8 @@ def get_wikitext2(nsamples, seed, seqlen, tokenizer): 6 | # Load and process c4 dataset 7 | def get_c4(nsamples, seed, seqlen, tokenizer): 8 | # Load train and validation datasets 9 | - traindata = load_dataset('allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train') 10 | - valdata = load_dataset('allenai/c4', 'allenai--c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation') 11 | + traindata = load_dataset('allenai/c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train') 12 | + valdata = load_dataset('allenai/c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation') 13 | 14 | # Generate samples from training set 15 | random.seed(seed) 16 | diff --git a/lib/prune.py b/lib/prune.py 17 | index 01d981c..b772908 100644 18 | --- a/lib/prune.py 19 | +++ b/lib/prune.py 20 | @@ -141,7 +141,11 @@ def prune_wanda(args, model, tokenizer, device=torch.device("cuda:0"), prune_n=0 21 | 22 | if f"model.layers.{i}" in model.hf_device_map: ## handle the case for llama-30B and llama-65B, when the device map has multiple GPUs; 23 | dev = model.hf_device_map[f"model.layers.{i}"] 24 | - inps, outs, attention_mask, position_ids = inps.to(dev), outs.to(dev), attention_mask.to(dev), position_ids.to(dev) 25 | + inps, outs = inps.to(dev), outs.to(dev) 26 | + if attention_mask is not None: 27 | + attention_mask = attention_mask.to(dev) 28 | + if position_ids is not None: 29 | + position_ids = position_ids.to(dev) 30 | 31 | wrapped_layers = {} 32 | for name in subset: 33 | diff --git a/main.py b/main.py 34 | index a94583c..2d5cbec 100644 35 | --- a/main.py 36 | +++ b/main.py 37 | @@ -22,7 +22,20 @@ def get_llm(model_name, cache_dir="llm_weights"): 38 | device_map="auto" 39 | ) 40 | 41 | - model.seqlen =
model.config.max_position_embeddings 42 | + if not hasattr(model.config, 'max_position_embeddings'): 43 | + raise AttributeError( 44 | + "model.config does not have `max_position_embeddings`, please check the attribute name for the maximum length. " 45 | + "You may need to modify the code accordingly." 46 | + ) 47 | + else: 48 | + if model.config.max_position_embeddings > 8192: 49 | + # such as mistralai/Mistral-7B-v0.3 ("max_position_embeddings": 32768) 50 | + model.seqlen = 8192 51 | + print( 52 | + "The maximum length supported by this model is large, setting the maximum length for calibration samples to 8192." 53 | + ) 54 | + else: 55 | + model.seqlen = model.config.max_position_embeddings 56 | return model 57 | 58 | def main(): 59 | -------------------------------------------------------------------------------- /SQFT/legacy/run_command/README.md: -------------------------------------------------------------------------------- 1 | ### Run command 2 | 3 | Prepare the datasets from [LLM-Adapters](https://github.com/AGI-Edgerunners/LLM-Adapters) for our math instruction-tuning setting. 4 | ```bash 5 | git clone https://github.com/AGI-Edgerunners/LLM-Adapters.git 6 | mv LLM-Adapters/dataset/ datasets/ 7 | mv LLM-Adapters/ft-training_set/* datasets/ 8 | ``` 9 | 10 | #### Llama-3 11 | 12 | ```bash 13 | bash run_command/llama-3-8b/sparse_quantization.sh $SPARSITY # e.g., SPARSITY=50 14 | bash run_command/llama-3-8b/run.sh $SPARSITY 15 | ``` 16 | 17 | #### Mistral-v0.3 18 | 19 | ```bash 20 | bash run_command/mistral-7b-v0.3/sparse_quantization.sh $SPARSITY 21 | bash run_command/mistral-7b-v0.3/run.sh $SPARSITY $TASK 22 | ``` 23 | Supported tasks: `gsm8k` and `math`. 24 | 25 | #### Phi-3 26 | 27 | ```bash 28 | bash run_command/phi-3-mini-4k-instruct/sparse_quantization.sh $SPARSITY 29 | bash run_command/phi-3-mini-4k-instruct/run.sh $SPARSITY $TASK 30 | ``` 31 | Supported tasks: `cs` and `math`. 32 | 33 | Note that the results presented in the paper were obtained using an older environment setup. 34 | Specifically, we used torch version `2.1.2`, transformers version `4.39.1`, and NNCF with commit ID `544d5141`. 35 | The training was conducted on a single Tesla V100-SXM2-32GB GPU.
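For concreteness, here is a minimal end-to-end invocation of the commands above with example values (50% sparsity and the `gsm8k` task), assuming the datasets have been prepared as described:

```bash
# Example values only; substitute your own sparsity level and task
bash run_command/mistral-7b-v0.3/sparse_quantization.sh 50
bash run_command/mistral-7b-v0.3/run.sh 50 gsm8k
```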
36 | -------------------------------------------------------------------------------- /SQFT/legacy/run_command/llama-3-8b/sparse_quantization.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | set -e 4 | 5 | SPARSITY=$1 6 | 7 | BASE_MODEL_PATH=meta-llama/Meta-Llama-3-8B 8 | SPARSE_BASE_MODEL_PATH=sqft-llama-3-8b-${SPARSITY}-base 9 | QUANT_BASE_MODEL_PATH=sqft-llama-3-8b-${SPARSITY}-base-gptq 10 | 11 | python wanda/main.py --model ${BASE_MODEL_PATH} --prune_method wanda --sparsity_ratio $(echo "scale=2; ${SPARSITY}/100" | bc) --sparsity_type unstructured --save wanda_out --save_model ${SPARSE_BASE_MODEL_PATH} 12 | python utils/quantization.py --base_model_path ${SPARSE_BASE_MODEL_PATH} --output_dir ${QUANT_BASE_MODEL_PATH} 13 | -------------------------------------------------------------------------------- /SQFT/legacy/run_command/mistral-7b-v0.3/sparse_quantization.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | set -e 4 | 5 | SPARSITY=$1 6 | 7 | BASE_MODEL_PATH=mistralai/Mistral-7B-v0.3 8 | SPARSE_BASE_MODEL_PATH=sqft-mistral-7b-v0.3-${SPARSITY}-base 9 | QUANT_BASE_MODEL_PATH=sqft-mistral-7b-v0.3-${SPARSITY}-base-gptq 10 | 11 | python wanda/main.py --model ${BASE_MODEL_PATH} --prune_method wanda --sparsity_ratio $(echo "scale=2; ${SPARSITY}/100" | bc) --sparsity_type unstructured --save wanda_out --save_model ${SPARSE_BASE_MODEL_PATH} 12 | python utils/quantization.py --base_model_path ${SPARSE_BASE_MODEL_PATH} --output_dir ${QUANT_BASE_MODEL_PATH} 13 | -------------------------------------------------------------------------------- /SQFT/legacy/run_command/phi-3-mini-4k-instruct/sparse_quantization.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | set -e 4 | 5 | SPARSITY=$1 6 | 7 | BASE_MODEL_PATH=microsoft/Phi-3-mini-4k-instruct 8 | SPARSE_BASE_MODEL_PATH=sqft-phi-3-mini-4k-${SPARSITY}-base 9 | QUANT_BASE_MODEL_PATH=sqft-phi-3-mini-4k-${SPARSITY}-base-gptq 10 | 11 | python wanda/main.py --model ${BASE_MODEL_PATH} --prune_method wanda --sparsity_ratio $(echo "scale=2; ${SPARSITY}/100" | bc) --sparsity_type unstructured --save wanda_out --save_model ${SPARSE_BASE_MODEL_PATH} 12 | python utils/quantization.py --base_model_path ${SPARSE_BASE_MODEL_PATH} --output_dir ${QUANT_BASE_MODEL_PATH} 13 | -------------------------------------------------------------------------------- /SQFT/legacy/utils/quantization.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig 3 | 4 | def main(): 5 | parser = argparse.ArgumentParser(description="Quantize and save a model.") 6 | parser.add_argument("--base_model_path", type=str, required=True, help="Path to the base model.") 7 | parser.add_argument("--tokenizer_path", type=str, default=None, help="Path to the tokenizer. 
Defaults to base model path if not provided.") 8 | parser.add_argument("--dtype", type=str, default="float16", help="Data type for model weights.") 9 | parser.add_argument("--block_name_to_quantize", type=str, default=None, help="Specific block name to quantize.") 10 | parser.add_argument("--output_dir", type=str, required=True, help="Directory to save the quantized model and tokenizer.") 11 | args = parser.parse_args() 12 | 13 | base_model_path = args.base_model_path 14 | tokenizer_path = args.tokenizer_path 15 | dtype = args.dtype 16 | block_name_to_quantize = args.block_name_to_quantize 17 | output_dir = args.output_dir 18 | 19 | tokenizer = AutoTokenizer.from_pretrained( 20 | base_model_path if tokenizer_path is None else tokenizer_path, 21 | trust_remote_code=True 22 | ) 23 | tokenizer.pad_token = tokenizer.eos_token 24 | 25 | if block_name_to_quantize is None: 26 | quantization_config = GPTQConfig(bits=4, dataset="c4", tokenizer=tokenizer, use_exllama=False) 27 | else: 28 | quantization_config = GPTQConfig( 29 | bits=4, dataset="c4", tokenizer=tokenizer, use_exllama=False, block_name_to_quantize=block_name_to_quantize 30 | ) 31 | 32 | quantized_model = AutoModelForCausalLM.from_pretrained( 33 | base_model_path, 34 | device_map="auto", 35 | torch_dtype=dtype, 36 | trust_remote_code=True, 37 | quantization_config=quantization_config 38 | ) 39 | 40 | quantized_model.config.quantization_config.use_exllama = False 41 | if block_name_to_quantize is not None: 42 | quantized_model.config.quantization_config.block_name_to_quantize = block_name_to_quantize 43 | 44 | quantized_model.save_pretrained(output_dir) 45 | tokenizer.save_pretrained(output_dir) 46 | 47 | # Uncomment the following lines to push the model and tokenizer to the hub 48 | # quantized_model.push_to_hub(output_dir, private=True) 49 | # tokenizer.push_to_hub(output_dir, private=True) 50 | 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /SQFT/patches/wanda-8e8fc87.patch: -------------------------------------------------------------------------------- 1 | diff --git a/lib/data.py b/lib/data.py 2 | index b6842c4..ce2b55c 100644 3 | --- a/lib/data.py 4 | +++ b/lib/data.py 5 | @@ -40,8 +40,8 @@ def get_wikitext2(nsamples, seed, seqlen, tokenizer): 6 | # Load and process c4 dataset 7 | def get_c4(nsamples, seed, seqlen, tokenizer): 8 | # Load train and validation datasets 9 | - traindata = load_dataset('allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train') 10 | - valdata = load_dataset('allenai/c4', 'allenai--c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation') 11 | + traindata = load_dataset('allenai/c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train') 12 | + valdata = load_dataset('allenai/c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation') 13 | 14 | # Generate samples from training set 15 | random.seed(seed) 16 | diff --git a/lib/prune.py b/lib/prune.py 17 | index 01d981c..b772908 100644 18 | --- a/lib/prune.py 19 | +++ b/lib/prune.py 20 | @@ -141,7 +141,11 @@ def prune_wanda(args, model, tokenizer, device=torch.device("cuda:0"), prune_n=0 21 | 22 | if f"model.layers.{i}" in model.hf_device_map: ## handle the case for llama-30B and llama-65B, when the device map has multiple GPUs; 23 | dev = model.hf_device_map[f"model.layers.{i}"] 24 | - inps, outs, attention_mask, position_ids = 
inps.to(dev), outs.to(dev), attention_mask.to(dev), position_ids.to(dev) 25 | + inps, outs = inps.to(dev), outs.to(dev) 26 | + if attention_mask is not None: 27 | + attention_mask = attention_mask.to(dev) 28 | + if position_ids is not None: 29 | + position_ids = position_ids.to(dev) 30 | 31 | wrapped_layers = {} 32 | for name in subset: 33 | diff --git a/main.py b/main.py 34 | index a94583c..2d5cbec 100644 35 | --- a/main.py 36 | +++ b/main.py 37 | @@ -22,7 +22,20 @@ def get_llm(model_name, cache_dir="llm_weights"): 38 | device_map="auto" 39 | ) 40 | 41 | - model.seqlen = model.config.max_position_embeddings 42 | + if not hasattr(model.config, 'max_position_embeddings'): 43 | + raise AttributeError( 44 | + "model.config does not have `max_position_embeddings`, please check the attribute name for the maximum length. " 45 | + "You may need to modify the code accordingly." 46 | + ) 47 | + else: 48 | + if model.config.max_position_embeddings > 8192: 49 | + # such as mistralai/Mistral-7B-v0.3 ("max_position_embeddings": 32768) 50 | + model.seqlen = 8192 51 | + print( 52 | + "The maximum length supported by this model is large, setting the maximum length for calibration samples to 8192." 53 | + ) 54 | + else: 55 | + model.seqlen = model.config.max_position_embeddings 56 | return model 57 | 58 | def main(): 59 | -------------------------------------------------------------------------------- /SQFT/utils/quantization.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig 3 | 4 | 5 | def main(): 6 | parser = argparse.ArgumentParser(description="Quantize and save a model.") 7 | parser.add_argument("--base_model_path", type=str, required=True, help="Path to the base model.") 8 | parser.add_argument("--tokenizer_path", type=str, default=None, help="Path to the tokenizer. 
Defaults to base model path if not provided.") 9 | parser.add_argument("--dtype", type=str, default="float16", help="Data type for model weights.") 10 | parser.add_argument("--block_name_to_quantize", type=str, default=None, help="Specific block name to quantize.") 11 | parser.add_argument("--output_dir", type=str, required=True, help="Directory to save the quantized model and tokenizer.") 12 | args = parser.parse_args() 13 | 14 | base_model_path = args.base_model_path 15 | tokenizer_path = args.tokenizer_path 16 | dtype = args.dtype 17 | block_name_to_quantize = args.block_name_to_quantize 18 | output_dir = args.output_dir 19 | 20 | tokenizer = AutoTokenizer.from_pretrained( 21 | base_model_path if tokenizer_path is None else tokenizer_path, 22 | trust_remote_code=True 23 | ) 24 | tokenizer.pad_token = tokenizer.eos_token 25 | 26 | if block_name_to_quantize is None: 27 | quantization_config = GPTQConfig(bits=4, dataset="c4", tokenizer=tokenizer, use_exllama=False) 28 | else: 29 | quantization_config = GPTQConfig( 30 | bits=4, dataset="c4", tokenizer=tokenizer, use_exllama=False, block_name_to_quantize=block_name_to_quantize 31 | ) 32 | 33 | quantized_model = AutoModelForCausalLM.from_pretrained( 34 | base_model_path, 35 | device_map="auto", 36 | torch_dtype=dtype, 37 | trust_remote_code=True, 38 | quantization_config=quantization_config 39 | ) 40 | 41 | quantized_model.config.quantization_config.use_exllama = False 42 | if block_name_to_quantize is not None: 43 | quantized_model.config.quantization_config.block_name_to_quantize = block_name_to_quantize 44 | 45 | quantized_model.save_pretrained(output_dir) 46 | tokenizer.save_pretrained(output_dir) 47 | 48 | # Uncomment the following lines to push the model and tokenizer to the hub 49 | # quantized_model.push_to_hub(output_dir, private=True) 50 | # tokenizer.push_to_hub(output_dir, private=True) 51 | 52 | if __name__ == "__main__": 53 | main() 54 | -------------------------------------------------------------------------------- /Shears/example_commonsense.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | from peft import PeftModel 6 | from transformers import AutoModelForCausalLM 7 | from transformers import AutoTokenizer 8 | 9 | 10 | def generate_prompt(instruction): 11 | return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request. 
12 | 13 | ### Instruction: 14 | {instruction} 15 | 16 | ### Response: 17 | """ 18 | 19 | 20 | def main(): 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument("--base_model_path", default="shears-llama-7b-50-base", type=str) 23 | parser.add_argument("--adapter_model_path", default="IntelLabs/shears-llama-7b-50-cs-heuristic-adapter", type=str) 24 | args = parser.parse_args() 25 | base_model_path = args.base_model_path 26 | adapter_model_path = args.adapter_model_path 27 | 28 | base_model = AutoModelForCausalLM.from_pretrained( 29 | base_model_path, 30 | torch_dtype=torch.float16, 31 | device_map="auto", 32 | trust_remote_code=True 33 | ) 34 | model = PeftModel.from_pretrained(base_model, adapter_model_path, torch_dtype=torch.float16, device_map="auto") 35 | model.eval() 36 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 37 | 38 | non_zero_params = sum([(param.data != 0).sum().item() for _, param in model.named_parameters()]) 39 | print(f"Number of all non-zero parameters: {non_zero_params}") 40 | 41 | instructions = [ 42 | "Please choose the correct answer to the question: A cactus stem is used to store\n\nAnswer1: fruit " 43 | "Answer2: liquid Answer3: food Answer4: spines\n\nAnswer format: answer1/answer2/answer3/answer4", 44 | 45 | "Please choose the correct solution to the question: Prevent bottles from rolling in fridge.\n\n" 46 | "Solution1: Put binder clip on fridge shelves to prevent sliding.\n\nSolution2: Put staple remover on " 47 | "fridge shelves to prevent sliding.\n\nAnswer format: solution1/solution2", 48 | 49 | "Please choose the correct answer to the question: Which characteristic describes the texture of a " 50 | "kitten's fur?\n\nAnswer1: gray Answer2: warm Answer3: long Answer4: soft\n\nAnswer format: answer1/" 51 | "answer2/answer3/answer4", 52 | ] 53 | 54 | for idx, instruction in enumerate(instructions): 55 | print(f"Example {idx}:") 56 | prompt = generate_prompt(instruction) 57 | inputs = tokenizer(prompt, return_tensors="pt") 58 | input_ids = inputs["input_ids"].to(model.device) 59 | with torch.no_grad(): 60 | generation_output = model.generate( 61 | input_ids=input_ids, 62 | return_dict_in_generate=True, 63 | output_scores=True, 64 | max_new_tokens=256, 65 | use_cache=True, 66 | num_beams=4, 67 | ) 68 | s = generation_output.sequences[0] 69 | output = tokenizer.decode(s) 70 | print(output) 71 | 72 | 73 | if __name__ == "__main__": 74 | main() 75 | -------------------------------------------------------------------------------- /Shears/example_math.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | from peft import PeftModel 6 | from transformers import AutoModelForCausalLM 7 | from transformers import AutoTokenizer 8 | 9 | 10 | def generate_prompt(instruction): 11 | return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request. 
12 | 13 | ### Instruction: 14 | {instruction} 15 | 16 | ### Response: 17 | """ 18 | 19 | 20 | def main(): 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument("--base_model_path", default="IntelLabs/shears-mpt-7b-50-base", type=str) 23 | parser.add_argument("--adapter_model_path", default="IntelLabs/shears-mpt-7b-50-gsm8k-heuristic-adapter", type=str) 24 | args = parser.parse_args() 25 | base_model_path = args.base_model_path 26 | adapter_model_path = args.adapter_model_path 27 | 28 | base_model = AutoModelForCausalLM.from_pretrained( 29 | base_model_path, 30 | torch_dtype=torch.float16, 31 | device_map="auto", 32 | trust_remote_code=True 33 | ) 34 | model = PeftModel.from_pretrained(base_model, adapter_model_path, torch_dtype=torch.float16, device_map="auto") 35 | model.eval() 36 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 37 | 38 | non_zero_params = sum([(param.data != 0).sum().item() for _, param in model.named_parameters()]) 39 | print(f"Number of all non-zero parameters: {non_zero_params}") 40 | 41 | instructions = [ 42 | "Jack had $100. Sophia gave him 1/5 of her $100. How many dollars does Jack have now?", 43 | "Edgar eats 18 pretzels a day. If his brother eats 1/2 as many, how many does his brother eat in a week?", 44 | "Trent is 5 years older than Jane, and Jane is 3 years younger than Quinn. If Quinn is 30, how old is Trent?", 45 | ] 46 | 47 | for idx, instruction in enumerate(instructions): 48 | print(f"Example {idx}:") 49 | prompt = generate_prompt(instruction) 50 | inputs = tokenizer(prompt, return_tensors="pt") 51 | input_ids = inputs["input_ids"].to(model.device) 52 | with torch.no_grad(): 53 | generation_output = model.generate( 54 | input_ids=input_ids, 55 | return_dict_in_generate=True, 56 | output_scores=True, 57 | max_new_tokens=256, 58 | use_cache=True, 59 | num_beams=4, 60 | ) 61 | s = generation_output.sequences[0] 62 | output = tokenizer.decode(s) 63 | print(output) 64 | 65 | 66 | if __name__ == "__main__": 67 | main() 68 | -------------------------------------------------------------------------------- /Shears/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -x 4 | 5 | SHEARS_PATH=$PWD 6 | mkdir third_party && cd third_party 7 | 8 | # transformers 9 | git clone https://github.com/huggingface/transformers.git 10 | cd transformers && git checkout v4.31.0 && git apply --ignore-space-change --ignore-whitespace $SHEARS_PATH/patches/transformers-v4.31.0.patch && pip install -e . && cd .. 11 | 12 | # peft 13 | git clone https://github.com/huggingface/peft.git 14 | cd peft && git checkout v0.5.0 && git apply --ignore-space-change --ignore-whitespace $SHEARS_PATH/patches/peft-v0.5.0.patch && git apply --ignore-space-change --ignore-whitespace $SHEARS_PATH/patches/peft-v0.5.0-inference.patch && pip install -e . && cd .. 15 | 16 | # nncf 17 | git clone https://github.com/openvinotoolkit/nncf.git 18 | cd nncf && git checkout 544d5141 && git apply --ignore-space-change --ignore-whitespace $SHEARS_PATH/patches/nncf-544d5141.patch && pip install -e . && cd .. 
19 | 20 | # others 21 | pip install datasets accelerate sentencepiece protobuf 22 | -------------------------------------------------------------------------------- /Shears/nncf_config/nncf_config.md: -------------------------------------------------------------------------------- 1 | ## NNCF Config of Shears 2 | 3 | To enable the elastic adapter in NLS training, we employ the [BootstrapNAS](https://github.com/openvinotoolkit/nncf/tree/develop/nncf/experimental/torch/nas/bootstrapNAS) feature within [OpenVINO™ NNCF](https://github.com/openvinotoolkit/nncf), which offers a range of compression algorithms for optimizing neural networks. 4 | The following walks through the NNCF configuration for Shears, clearing up potential doubts about the config and clarifying which parts are relevant to Shears. 5 | 6 | Some explanations: 7 | 8 | - `input_info` is used to create the NNCF network in Shears. 9 | - Shears employs the `progressive_shrinking` algorithm of BootstrapNAS; details can be found in [BootstrapNAS.md](https://github.com/openvinotoolkit/nncf/blob/develop/nncf/experimental/torch/nas/bootstrapNAS/BootstrapNAS.md). 10 | In fact, Shears adopts only the simplest `progressive_shrinking` feature, without its more intricate and advanced strategies such as multi-stage training. 11 | We will explore more complex training strategies of `progressive_shrinking` in the future. 12 | - `frozen_layers_allowed` should be set to `true`, because Shears freezes the base model. 13 | - `width` refers to the hidden size of the weight matrix; more precisely, it represents the low-rank size of the LoRA adapter in Shears. 14 | - `num_bn_adaptation_samples` should be set to 0 (default is 2000), as we don't need batch-norm adaptation. 15 | 16 | ### Elastic low-rank 17 | 18 | In the Shears solution, the design of the low-rank search space is crucial, including the allocation of dependency groups and the design of each group's search space.
19 | In our existing configurations, such as the LLaMA model, we adopt the grouping `[[Q, K, V], [Up], [Down]]` for each LLaMA layer, with each group's search space being `[32, 24, 16]`, i.e., 20 | 21 | - `[Q, K, V]`: `[32, 24, 16]` 22 | - `[Up]`: `[32, 24, 16]` 23 | - `[Down]`: `[32, 24, 16]` 24 | 25 | ```json 26 | "width": { 27 | "overwrite_groups": [ 28 | [ 29 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", 30 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", 31 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" 32 | ], 33 | [ 34 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" 35 | ], 36 | [ 37 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" 38 | ] 39 | ], 40 | "overwrite_groups_widths": [ 41 | [32, 24, 16], [32, 24, 16], [32, 24, 16] 42 | ] 43 | } 44 | ``` 45 | 46 | Note that the length of `overwrite_groups` must equal the length of `overwrite_groups_widths`, and that we only set the output hidden 47 | size search space of LoRA-A in the config, as the input hidden size of LoRA-B is automatically pruned to match LoRA-A. 48 | Feel free to try your own group design and search spaces; a minimal sketch of this width mechanism follows.
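To make the width mechanism concrete, here is a minimal PyTorch sketch (an illustration only, not the NNCF/BootstrapNAS code path; the tensor names and sizes are hypothetical) of what activating a given width means for one LoRA pair:

```python
import torch

# Illustration only: activating a smaller "width" keeps the first `width`
# rank dimensions of a LoRA pair. This mimics, but is not, the NNCF mechanism.
in_features, out_features, max_rank = 4096, 4096, 32
lora_A = torch.randn(max_rank, in_features)   # LoRA-A: input -> low-rank space
lora_B = torch.randn(out_features, max_rank)  # LoRA-B: low-rank space -> output

def adapter_delta(x: torch.Tensor, width: int) -> torch.Tensor:
    """Apply the LoRA update using only the first `width` rank dimensions."""
    assert width in (32, 24, 16), "use a value from the search space above"
    a = lora_A[:width, :]  # prune the output hidden size of LoRA-A
    b = lora_B[:, :width]  # the input side of LoRA-B is pruned to match
    return x @ a.T @ b.T   # (1, in) -> (1, width) -> (1, out)

x = torch.randn(1, in_features)
for w in (32, 24, 16):
    print(w, adapter_delta(x, w).shape)  # output stays (1, 4096) for every width
```

Because the adapter product contracts over the rank dimension, truncating LoRA-A's output rows forces a matching truncation of LoRA-B's input columns; this is exactly why only the LoRA-A side needs to appear in the config above.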
49 | -------------------------------------------------------------------------------- /Shears/nncf_config/nncf_shears_llama.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_info": [ 3 | { 4 | "sample_size": [1, 256], 5 | "type": "long", 6 | "keyword": "input_ids" 7 | }, 8 | { 9 | "sample_size": [1, 256], 10 | "type": "long", 11 | "keyword": "attention_mask" 12 | } 13 | ], 14 | "bootstrapNAS": { 15 | "training": { 16 | "algorithm": "progressive_shrinking", 17 | "frozen_layers_allowed": true, 18 | "progressivity_of_elasticity": ["width"], 19 | "batchnorm_adaptation": { 20 | "num_bn_adaptation_samples": 0 21 | }, 22 | "schedule": { 23 | "list_stage_descriptions": [ 24 | {"train_dims": ["width"], "epochs": -1, "depth_indicator": 1, "width_indicator": 5, "init_lr": -1, "epochs_lr": -1, "sample_rate": 1} 25 | ] 26 | }, 27 | "elasticity": { 28 | "available_elasticity_dims": ["width"], 29 | "width": { 30 | "overwrite_groups": [ 31 | [ 32 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", 33 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", 34 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" 35 | ], 36 | [ 37 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" 38 | ], 39 | [ 40 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" 41 | ] 42 | ], 43 | "overwrite_groups_widths": [ 44 | [32, 24, 16], [32, 24, 16], [32, 24, 16] 45 | ] 46 | } 47 | } 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /Shears/nncf_config/nncf_shears_llama_with_gate_proj.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_info": [ 3 | { 4 | "sample_size": [1, 256], 5 | "type": "long", 6 | "keyword": "input_ids" 7 | }, 8 | { 9 | "sample_size": [1, 256], 10 | "type": "long", 11 | "keyword": "attention_mask" 12 | } 13 | ], 14 | "bootstrapNAS": { 15 | "training": { 16 | "algorithm": "progressive_shrinking", 17 | "frozen_layers_allowed": true, 18 | "progressivity_of_elasticity": ["width"], 19 | "batchnorm_adaptation": { 20 | "num_bn_adaptation_samples": 0 21 | }, 22 | "schedule": { 23 | "list_stage_descriptions": [ 24 | {"train_dims": ["width"], "epochs": -1, "depth_indicator": 1, "width_indicator": 5, "init_lr": -1, "epochs_lr": -1, "sample_rate": 1} 25 | ] 26 | }, 27 | "elasticity": { 28 | "available_elasticity_dims": ["width"], 29 | "width": { 30 | "overwrite_groups": [ 31 | [ 32 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", 33 | 
"{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", 34 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" 35 | ], 36 | [ 37 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", 38 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaMLP[mlp]/Linear[gate_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" 39 | ], 40 | [ 41 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" 42 | ] 43 | ], 44 | "overwrite_groups_widths": [ 45 | [32, 24, 16], [32, 24, 16], [32, 24, 16] 46 | ] 47 | } 48 | } 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /Shears/nncf_config/nncf_shears_mpt.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_info": [ 3 | { 4 | "sample_size": [1, 256], 5 | "type": "long", 6 | "keyword": "input_ids" 7 | }, 8 | { 9 | "sample_size": [1, 256], 10 | "type": "long", 11 | "keyword": "attention_mask" 12 | } 13 | ], 14 | "bootstrapNAS": { 15 | "training": { 16 | "algorithm": "progressive_shrinking", 17 | "frozen_layers_allowed": true, 18 | "progressivity_of_elasticity": ["width"], 19 | "batchnorm_adaptation": { 20 | "num_bn_adaptation_samples": 0 21 | }, 22 | "schedule": { 23 | "list_stage_descriptions": [ 24 | {"train_dims": ["width"], "epochs": -1, "depth_indicator": 1, "width_indicator": 5, "init_lr": -1, "epochs_lr": -1, "sample_rate": 1} 25 | ] 26 | }, 27 | "elasticity": { 28 | "available_elasticity_dims": ["width"], 29 | "width": { 30 | "overwrite_groups": [ 31 | [ 32 | "{re}PeftModelForCausalLM/LoraModel[base_model]/MPTForCausalLM[model]/MPTModel[transformer]/ModuleList[blocks]/MPTBlock[{*}]/MultiheadAttention[attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", 33 | "{re}PeftModelForCausalLM/LoraModel[base_model]/MPTForCausalLM[model]/MPTModel[transformer]/ModuleList[blocks]/MPTBlock[{*}]/MultiheadAttention[attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", 34 | "{re}PeftModelForCausalLM/LoraModel[base_model]/MPTForCausalLM[model]/MPTModel[transformer]/ModuleList[blocks]/MPTBlock[{*}]/MultiheadAttention[attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", 35 | "{re}PeftModelForCausalLM/LoraModel[base_model]/MPTForCausalLM[model]/MPTModel[transformer]/ModuleList[blocks]/MPTBlock[{*}]/MultiheadAttention[attn]/Linear[out_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" 36 | ], 37 | [ 38 | "{re}PeftModelForCausalLM/LoraModel[base_model]/MPTForCausalLM[model]/MPTModel[transformer]/ModuleList[blocks]/MPTBlock[{*}]/MPTMLP[ffn]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", 39 | 
"{re}PeftModelForCausalLM/LoraModel[base_model]/MPTForCausalLM[model]/MPTModel[transformer]/ModuleList[blocks]/MPTBlock[{*}]/MPTMLP[ffn]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" 40 | ] 41 | ], 42 | "overwrite_groups_widths": [ 43 | [32, 24, 16], [32, 24, 16] 44 | ] 45 | } 46 | } 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /Shears/patches/nncf-544d5141.patch: -------------------------------------------------------------------------------- 1 | diff --git a/nncf/experimental/torch/nas/bootstrapNAS/elasticity/elasticity_builder.py b/nncf/experimental/torch/nas/bootstrapNAS/elasticity/elasticity_builder.py 2 | index bc6464b2..9f7a2f3d 100644 3 | --- a/nncf/experimental/torch/nas/bootstrapNAS/elasticity/elasticity_builder.py 4 | +++ b/nncf/experimental/torch/nas/bootstrapNAS/elasticity/elasticity_builder.py 5 | @@ -9,7 +9,7 @@ 6 | # See the License for the specific language governing permissions and 7 | # limitations under the License. 8 | from collections import OrderedDict 9 | -from typing import Any, Dict, List 10 | +from typing import Any, Dict, List, Tuple 11 | 12 | from nncf import NNCFConfig 13 | from nncf.experimental.torch.nas.bootstrapNAS.elasticity.base_handler import SingleElasticityBuilder 14 | @@ -152,3 +152,8 @@ class ElasticityBuilder(PTCompressionAlgorithmBuilder): 15 | 16 | # No conflict resolving with the related config options, parameters are overridden by compression state 17 | self._available_elasticity_dims = list(map(ElasticityDim, available_elasticity_dims_state)) 18 | + 19 | + def _are_frozen_layers_allowed(self) -> Tuple[bool, str]: 20 | + if self.config.get("bootstrapNAS", {}).get("training", {}).get("frozen_layers_allowed", False): 21 | + return True, "Frozen layers are allowed (set in NNCF config)" 22 | + return super()._are_frozen_layers_allowed() 23 | diff --git a/nncf/experimental/torch/nas/bootstrapNAS/training/progressive_shrinking_builder.py b/nncf/experimental/torch/nas/bootstrapNAS/training/progressive_shrinking_builder.py 24 | index 92609327..fefdc2f6 100644 25 | --- a/nncf/experimental/torch/nas/bootstrapNAS/training/progressive_shrinking_builder.py 26 | +++ b/nncf/experimental/torch/nas/bootstrapNAS/training/progressive_shrinking_builder.py 27 | @@ -8,7 +8,7 @@ 28 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 29 | # See the License for the specific language governing permissions and 30 | # limitations under the License. 
31 | -from typing import Any, Dict, List 32 | +from typing import Any, Dict, List, Tuple 33 | 34 | from nncf import NNCFConfig 35 | from nncf.common.initialization.batchnorm_adaptation import BatchnormAdaptationAlgorithm 36 | @@ -152,3 +152,8 @@ class ProgressiveShrinkingBuilder(PTCompressionAlgorithmBuilder): 37 | self._bn_adapt_params = state_without_name[self._state_names.BN_ADAPTATION_PARAMS] 38 | bn_adapt_algo_kwargs = get_bn_adapt_algo_kwargs(self.config, self._bn_adapt_params) 39 | self._bn_adaptation = BatchnormAdaptationAlgorithm(**bn_adapt_algo_kwargs) if bn_adapt_algo_kwargs else None 40 | + 41 | + def _are_frozen_layers_allowed(self) -> Tuple[bool, str]: 42 | + if self._algo_config.get("frozen_layers_allowed", False): 43 | + return True, "Frozen layers are allowed (set in NNCF config)" 44 | + return super()._are_frozen_layers_allowed() 45 | -------------------------------------------------------------------------------- /Shears/patches/peft-v0.5.0-inference.patch: -------------------------------------------------------------------------------- 1 | diff --git a/src/peft/utils/save_and_load.py b/src/peft/utils/save_and_load.py 2 | index 617287e..cb64cf7 100644 3 | --- a/src/peft/utils/save_and_load.py 4 | +++ b/src/peft/utils/save_and_load.py 5 | @@ -132,6 +132,33 @@ def set_peft_model_state_dict(model, peft_model_state_dict, adapter_name="defaul 6 | else: 7 | raise NotImplementedError 8 | 9 | + def module_reshape(state_dict): 10 | + for param_name, param in state_dict.items(): 11 | + tensor_name = param_name 12 | + splits = tensor_name.split(".") 13 | + if len(splits) > 1: 14 | + module = model 15 | + parent = None 16 | + for split in splits[:-1]: 17 | + new_module = getattr(module, split) 18 | + if new_module is None: 19 | + raise ValueError(f"{module} has no attribute {split}.") 20 | + parent = module 21 | + module = new_module 22 | + tensor_name = splits[-1] 23 | + old_value = getattr(module, tensor_name) 24 | + if old_value.shape != param.shape and isinstance(module, torch.nn.Linear): 25 | + new_module = torch.nn.Linear( 26 | + param.shape[1], 27 | + param.shape[0], 28 | + bias=module.bias is not None, 29 | + dtype=module.weight.dtype, 30 | + device=module.weight.device 31 | + ) 32 | + setattr(parent, splits[-2], new_module) 33 | + 34 | + module_reshape(peft_model_state_dict) 35 | + 36 | load_result = model.load_state_dict(peft_model_state_dict, strict=False) 37 | if config.is_prompt_learning: 38 | model.prompt_encoder[adapter_name].embedding.load_state_dict( 39 | -------------------------------------------------------------------------------- /Shears/preprocess/mpt_process/mpt-7b-modifications-for-shears-usage.patch: -------------------------------------------------------------------------------- 1 | diff --git a/attention.py b/attention.py 2 | index 5cc3be7..6c6e308 100644 3 | --- a/attention.py 4 | +++ b/attention.py 5 | @@ -242,9 +242,15 @@ class GroupedQueryAttention(nn.Module): 6 | fc_kwargs: dict[str, Any] = {'bias': bias} 7 | if fc_type != 'te': 8 | fc_kwargs['device'] = device 9 | - self.Wqkv = FC_CLASS_REGISTRY[fc_type](self.d_model, self.d_model + 2 * self.kv_n_heads * self.head_dim, **fc_kwargs) 10 | - fuse_splits = [i * self.head_dim for i in range(1, self.n_heads + 2 * self.kv_n_heads)] 11 | - self.Wqkv._fused = (0, fuse_splits) 12 | + 13 | + # Separating QKV brings more flexibility for pruning. 
14 | + # self.Wqkv = FC_CLASS_REGISTRY[fc_type](self.d_model, self.d_model + 2 * self.kv_n_heads * self.head_dim, **fc_kwargs) 15 | + # fuse_splits = [i * self.head_dim for i in range(1, self.n_heads + 2 * self.kv_n_heads)] 16 | + # self.Wqkv._fused = (0, fuse_splits) 17 | + self.q_proj = FC_CLASS_REGISTRY[fc_type](self.d_model, self.d_model, **fc_kwargs) 18 | + self.k_proj = FC_CLASS_REGISTRY[fc_type](self.d_model, self.kv_n_heads * self.head_dim, **fc_kwargs) 19 | + self.v_proj = FC_CLASS_REGISTRY[fc_type](self.d_model, self.kv_n_heads * self.head_dim, **fc_kwargs) 20 | + 21 | if self.qk_ln: 22 | norm_class = NORM_CLASS_REGISTRY[norm_type.lower()] 23 | self.q_ln = norm_class(self.d_model, device=device) 24 | @@ -261,10 +267,16 @@ class GroupedQueryAttention(nn.Module): 25 | self.out_proj._is_residual = True 26 | 27 | def forward(self, x: torch.Tensor, past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]]=None, attn_bias: Optional[torch.Tensor]=None, attention_mask: Optional[torch.Tensor]=None, is_causal: bool=True, needs_weights: bool=False) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor, torch.Tensor]]]: 28 | - qkv = self.Wqkv(x) 29 | + # qkv = self.Wqkv(x) 30 | + # if self.clip_qkv: 31 | + # qkv = qkv.clamp(min=-self.clip_qkv, max=self.clip_qkv) 32 | + # (query, key, value) = qkv.split([self.d_model, self.kv_n_heads * self.head_dim, self.kv_n_heads * self.head_dim], dim=2) 33 | + query, key, value = self.q_proj(x), self.k_proj(x), self.v_proj(x) 34 | if self.clip_qkv: 35 | - qkv = qkv.clamp(min=-self.clip_qkv, max=self.clip_qkv) 36 | - (query, key, value) = qkv.split([self.d_model, self.kv_n_heads * self.head_dim, self.kv_n_heads * self.head_dim], dim=2) 37 | + query = query.clamp(min=-self.clip_qkv, max=self.clip_qkv) 38 | + key = key.clamp(min=-self.clip_qkv, max=self.clip_qkv) 39 | + value = value.clamp(min=-self.clip_qkv, max=self.clip_qkv) 40 | + 41 | key_padding_mask = attention_mask 42 | if self.qk_ln: 43 | dtype = query.dtype 44 | -------------------------------------------------------------------------------- /Shears/preprocess/mpt_process/split_qkv_preprocess.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | 6 | Q_PROJ_RANGE = (0, 4096) 7 | K_PROJ_RANGE = (4096, 8192) 8 | V_PROJ_RANGE = (8192, 12288) 9 | 10 | 11 | def main(): 12 | parser = argparse.ArgumentParser() 13 | 14 | parser.add_argument("--base_model_name_or_path", default="mpt-7b", type=str, help="Path to mpt-7b model") 15 | 16 | args = parser.parse_args() 17 | base_model_name_or_path = args.base_model_name_or_path 18 | paths = [ 19 | f"{base_model_name_or_path}/pytorch_model-00001-of-00002.bin", 20 | f"{base_model_name_or_path}/pytorch_model-00002-of-00002.bin", 21 | ] 22 | new_state_dict = {} 23 | 24 | for path in paths: 25 | old_state_dict = torch.load(path) 26 | keys = list(old_state_dict.keys()) 27 | for key in keys: 28 | if key.endswith("Wqkv.weight"): 29 | prefix = ".".join(key.split(".")[:-2]) 30 | new_state_dict[prefix + ".q_proj.weight"] = old_state_dict[key][ 31 | Q_PROJ_RANGE[0] : Q_PROJ_RANGE[1] 32 | ].clone() 33 | new_state_dict[prefix + ".k_proj.weight"] = old_state_dict[key][ 34 | K_PROJ_RANGE[0] : K_PROJ_RANGE[1] 35 | ].clone() 36 | new_state_dict[prefix + ".v_proj.weight"] = old_state_dict[key][ 37 | V_PROJ_RANGE[0] : V_PROJ_RANGE[1] 38 | ].clone() 39 | else: 40 | new_state_dict[key] = old_state_dict[key].clone() 41 | 42 | os.system(f"rm 
{base_model_name_or_path}/pytorch_model*") 43 | torch.save(new_state_dict, f"{base_model_name_or_path}/pytorch_model.bin") 44 | 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /Shears/utils/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Some NNCF config preprocessing code for Shears. 3 | 4 | This module provides preprocessing functionality for NNCF (Neural Network Compression Framework) configuration files 5 | used in Shears. It includes utility functions for handling JSON files and preprocessing NNCF configurations. 6 | """ 7 | import json 8 | from pathlib import Path 9 | from nncf import NNCFConfig 10 | from nncf.common.utils.os import safe_open 11 | 12 | def parse_nncf_config(nncf_config_path, num=1): 13 | 14 | with safe_open(Path(nncf_config_path)) as f: 15 | loaded_json = json.load(f) 16 | 17 | base_overwrite_groups = loaded_json["bootstrapNAS"]["training"]["elasticity"]["width"]["overwrite_groups"] 18 | base_overwrite_groups_widths = loaded_json["bootstrapNAS"]["training"]["elasticity"]["width"][ 19 | "overwrite_groups_widths"] 20 | overwrite_groups, overwrite_groups_widths = [], [] 21 | for group, width in zip(base_overwrite_groups, base_overwrite_groups_widths): 22 | if group[0].startswith("{re}"): 23 | new_group = [[item.replace("{re}", "").replace("{*}", str(i)) for item in group] for i in range(num)] 24 | new_width = [width for _ in range(num)] 25 | else: 26 | new_group = [group] 27 | new_width = [width] 28 | overwrite_groups.extend(new_group) 29 | overwrite_groups_widths.extend(new_width) 30 | 31 | loaded_json["bootstrapNAS"]["training"]["elasticity"]["width"]["overwrite_groups"] = overwrite_groups 32 | loaded_json["bootstrapNAS"]["training"]["elasticity"]["width"][ 33 | "overwrite_groups_widths"] = overwrite_groups_widths 34 | return loaded_json 35 | 36 | def add_lr_epochs(nncf_config, lr=3e-4, epochs=3): 37 | stage_desc = nncf_config["bootstrapNAS"]["training"]["schedule"]["list_stage_descriptions"][0] 38 | if stage_desc["init_lr"] == -1: 39 | stage_desc["init_lr"] = lr 40 | if stage_desc["epochs"] == -1: 41 | stage_desc["epochs"] = epochs 42 | stage_desc["epochs_lr"] = epochs 43 | 44 | return nncf_config 45 | 46 | def load_nncf_config(nncf_config, lr=3e-4, epochs=3, num_hidden_layers=32): 47 | loaded_json = parse_nncf_config(nncf_config, num=num_hidden_layers) 48 | loaded_json = add_lr_epochs(loaded_json, lr=lr, epochs=epochs) 49 | nncf_config = NNCFConfig.from_dict(loaded_json) 50 | return nncf_config 51 | -------------------------------------------------------------------------------- /SparAMX/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | docs/_output/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # IPython 79 | profile_default/ 80 | ipython_config.py 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # celery beat schedule file 86 | celerybeat-schedule.* 87 | 88 | # SageMath parsed files 89 | *.sage.py 90 | 91 | # Environments 92 | .env 93 | .venv 94 | env/ 95 | venv/ 96 | ENV/ 97 | env.bak/ 98 | venv.bak/ 99 | 100 | # Spyder project settings 101 | .spyderproject 102 | .spyderproject/config/ 103 | 104 | # Rope project settings 105 | .ropeproject 106 | 107 | # mkdocs documentation 108 | /site 109 | 110 | # mypy 111 | .mypy_cache/ 112 | .dmypy.json 113 | dmypy.json 114 | 115 | # Pyre type checker 116 | .pyre/ 117 | 118 | # pytype static type analyzer 119 | .pytype/ 120 | 121 | # Cython debug symbols 122 | cython_debug/ 123 | 124 | # PyTorch 125 | *.pt 126 | *.pth 127 | *.onnx 128 | 129 | # PyCharm 130 | .idea/ 131 | 132 | # VS Code 133 | .vscode/ 134 | 135 | # Ignore data files 136 | data/ 137 | datasets/ 138 | 139 | # Ignore generated files 140 | logs/ 141 | results/ 142 | checkpoints/ 143 | 144 | # Ignore virtual environments 145 | venv/ 146 | venv.bak/ 147 | .env/ 148 | .venv/ 149 | ENV/ 150 | env.bak/ 151 | venv.bak/ 152 | -------------------------------------------------------------------------------- /SparAMX/README.md: -------------------------------------------------------------------------------- 1 | # SparAMX: Accelerating Compressed LLMs Token Generation on AMX-powered CPUs 2 | 3 | Official implementation of SparAMX: Accelerating Compressed LLMs Token Generation on AMX-powered CPUs. 4 | 5 | This repo contains the code for **SparAMX**, a set of open-source customized sparse kernels that can speed up any PyTorch model by automatically replacing all linear layers with our custom layer. Furthermore, we demonstrate for the first time the use of unstructured sparsity in the attention computation, achieving a **1.14×** speedup over current systems without compromising accuracy. 6 | 7 | | Stock PyTorch | SparAMX | 8 | |:-----------:|:-----------:| 9 | | ![Stock PyTorch](Videos/stock.gif) | ![SparAMX](Videos/sparamx.gif) | 10 | 11 | # torch-custom-linear 12 | Custom implementation of the linear layer through a torch extension 13 | 14 | ### Dependency 15 | ```pip install -r requirements.txt``` 16 | 17 | ### Build & Install Custom Kernel 18 | ``` 19 | python setup.py install 20 | ``` 21 | 22 | ### Run Experiments Example 23 | 24 | Please make sure you're logged in to HuggingFace through the CLI if you'll be using a private model. 25 | 26 | Define the experiments you want to run in `generate_experiments.py`, then run 27 | ``` 28 | python generate_experiments.py 29 | ``` 30 | 31 | A file `experiments.csv` is generated. Modify it if needed. After that, run 32 | ``` 33 | ./run_experiments.sh 34 | ``` 35 | 36 | Your results will be saved inside folder `experiment_results/YYYY-MM-DD_HH-MM-SS`.
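For example, to trim `experiments.csv` down to a subset of runs before launching them (a hypothetical post-processing step, not one of the repo's scripts; the column names follow the header written by `generate_experiments.py`):

```python
import pandas as pd

# Hypothetical filtering step: keep only stock-mode runs with short contexts.
df = pd.read_csv("experiments.csv")
df = df[(df["Mode"] == "stock") & (df["Context Length"] <= 2048)]
df.to_csv("experiments.csv", index=False)
```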
37 | 38 | ## Citation 39 | If you find our SparAMX code and paper helpful, please kindly cite: 40 | ```bibtex 41 | @misc{abouelhamayed2025sparamxacceleratingcompressedllms, 42 | title={SparAMX: Accelerating Compressed LLMs Token Generation on AMX-powered CPUs}, 43 | author={Ahmed F. AbouElhamayed and Jordan Dotzel and Yash Akhauri and Chi-Chih Chang and Sameh Gobriel and J. Pablo Muñoz and Vui Seng Chua and Nilesh Jain and Mohamed S. Abdelfattah}, 44 | year={2025}, 45 | eprint={2502.12444}, 46 | archivePrefix={arXiv}, 47 | primaryClass={cs.LG}, 48 | url={https://arxiv.org/abs/2502.12444}, 49 | } 50 | ``` 51 | 52 | -------------------------------------------------------------------------------- /SparAMX/Videos/sparamx.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/7549413d38677dd6eb92f918f7cc003dc65d1deb/SparAMX/Videos/sparamx.gif -------------------------------------------------------------------------------- /SparAMX/Videos/stock.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/7549413d38677dd6eb92f918f7cc003dc65d1deb/SparAMX/Videos/stock.gif -------------------------------------------------------------------------------- /SparAMX/benchmark_deepsparse.sh: -------------------------------------------------------------------------------- 1 | numactl --cpunodebind 0 --membind 0 --physcpubind=0-31 deepsparse.benchmark hf:neuralmagic/Llama2-7b-chat-pruned50-quant-ds -x benchmark -b 1 2 | -------------------------------------------------------------------------------- /SparAMX/compare_stock_vs_custom_linear.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import time 4 | 5 | from layer.dense_linear import DenseLinear 6 | from statistics import median 7 | 8 | torch.set_num_threads(1) 9 | d=4096 10 | o=1024 11 | 12 | one_layer_net_torch_stock = nn.Linear(in_features=d, out_features=o, bias=False) 13 | 14 | # Make sure all weights are representable in bfloat16.
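# (Rounding the fp32 weights through bfloat16 and back means the stock fp32
# layer and the custom AMX bf16 kernel operate on identical values, so any
# output mismatch comes from the kernel itself rather than input rounding.)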
15 | one_layer_net_torch_stock.weight.data = one_layer_net_torch_stock.weight.data.to(torch.bfloat16).to(torch.float) 16 | # one_layer_net_torch_stock.weight.data = torch.tensor([[1.,1.,2,1], [5.,1,1,1]]) 17 | print("\n[Info]: Creating one layer neural network with nn.Linear") 18 | print(f"one_layer_net_torch_stock:\n{one_layer_net_torch_stock}") 19 | 20 | one_layer_net_torch_extcpp = DenseLinear.from_linear(one_layer_net_torch_stock) 21 | print("\n[Info]: Creating one layer neural network with DenseLinear") 22 | print(f"one_layer_net_torch_extcpp:\n{one_layer_net_torch_extcpp}") 23 | 24 | N=1 25 | x = torch.randn(N, d, dtype=torch.bfloat16).to(torch.float) 26 | x_bf = x.to(torch.bfloat16) 27 | # x = torch.tensor([[1., 1, 1, 1]]) 28 | print(f"\n[Info]: Creating test input x ({N}, {d})") 29 | print(f"x: {x.shape}\n{x}") 30 | 31 | times_stock = [] 32 | times_amx = [] 33 | with torch.no_grad(): 34 | for i in range(5): # warmup 35 | x = torch.randn(N, d, dtype=torch.bfloat16).to(torch.float) 36 | o_torch_stock = one_layer_net_torch_stock(x) 37 | o_torch_extcpp = one_layer_net_torch_extcpp(x) 38 | for i in range(1000): # measurement 39 | x = torch.randn(N, d, dtype=torch.bfloat16).to(torch.float) 40 | start = time.time() 41 | o_torch_stock = one_layer_net_torch_stock(x) 42 | time_stock = time.time() - start 43 | times_stock.append(time_stock) 44 | start = time.time() 45 | o_torch_extcpp = one_layer_net_torch_extcpp(x) 46 | time_amx = time.time() - start 47 | times_amx.append(time_amx) 48 | 49 | print(f"\none_layer_net_torch_stock(x): {o_torch_stock.shape}\n{o_torch_stock}") 50 | print(f"\none_layer_net_torch_extcpp(x): {o_torch_extcpp.shape}\n{o_torch_extcpp}") 51 | 52 | # print(f"\none_layer_net_torch_stock(x) == one_layer_net_torch_extcpp(x) ???\n{torch.testing.assert_close(o_torch_extcpp, o_torch_stock)}") 53 | print(f"Time Stock: {median(times_stock)}, Time AMX: {median(times_amx)}") 54 | 55 | -------------------------------------------------------------------------------- /SparAMX/compare_stock_vs_onednn_linear.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import time 4 | 5 | from layer.onednn_linear import OneDnnLinear 6 | from statistics import median 7 | 8 | torch.set_num_threads(1) 9 | d=14336 10 | o=4096 11 | 12 | execute_custom_layer = True 13 | execute_stock_layer = False 14 | 15 | one_layer_net_torch_stock = nn.Linear(in_features=d, out_features=o, bias=False) 16 | 17 | # Make sure all weights are representable in bfloat16.
18 | one_layer_net_torch_stock.weight.data = one_layer_net_torch_stock.weight.data.to(torch.bfloat16).to(torch.float) 19 | # one_layer_net_torch_stock.weight.data = torch.tensor([[0.,0]]) 20 | # import pdb; pdb.set_trace() 21 | print("\n[Info]: Creating one layer neural network with nn.Linear") 22 | print(f"one_layer_net_torch_stock:\n{one_layer_net_torch_stock}") 23 | 24 | one_layer_net_torch_extcpp = OneDnnLinear.from_linear(one_layer_net_torch_stock) 25 | print("\n[Info]: Creating one layer neural network with OneDnnLinear") 26 | print(f"one_layer_net_torch_extcpp:\n{one_layer_net_torch_extcpp}") 27 | 28 | N=1 29 | x = torch.randn(N, d, dtype=torch.bfloat16).to(torch.float) 30 | # x_bf = x.to(torch.bfloat16) 31 | # x = torch.tensor([[1., 0]]) 32 | # import pdb; pdb.set_trace() 33 | print(f"\n[Info]: Creating test input x ({N}, {d})") 34 | print(f"x: {x.shape}\n{x}") 35 | 36 | times_stock = [] 37 | times_amx = [] 38 | with torch.no_grad(): 39 | for i in range(5): # warmup 40 | x = torch.randn(N, d, dtype=torch.bfloat16).to(torch.float) 41 | if execute_stock_layer: 42 | o_torch_stock = one_layer_net_torch_stock(x) 43 | if execute_custom_layer: 44 | o_torch_extcpp = one_layer_net_torch_extcpp(x) 45 | for i in range(1000): # measurement 46 | x = torch.randn(N, d, dtype=torch.bfloat16).to(torch.float) 47 | start = time.time() 48 | if execute_stock_layer: 49 | o_torch_stock = one_layer_net_torch_stock(x) 50 | time_stock = time.time() - start 51 | times_stock.append(time_stock) 52 | start = time.time() 53 | if execute_custom_layer: 54 | o_torch_extcpp = one_layer_net_torch_extcpp(x) 55 | time_amx = time.time() - start 56 | times_amx.append(time_amx) 57 | 58 | # print(f"\none_layer_net_torch_stock(x): {o_torch_stock.shape}\n{o_torch_stock}") 59 | # print(f"\none_layer_net_torch_extcpp(x): {o_torch_extcpp.shape}\n{o_torch_extcpp}") 60 | if execute_stock_layer and execute_custom_layer: 61 | torch.testing.assert_close(o_torch_extcpp, o_torch_stock) 62 | 63 | if execute_stock_layer: 64 | print(f"Time Stock: {median(times_stock)}\n") 65 | if execute_custom_layer: 66 | print(f"Time AMX: {median(times_amx)}\n") 67 | -------------------------------------------------------------------------------- /SparAMX/deepsparse_optimized_llama2.py: -------------------------------------------------------------------------------- 1 | # Example Run: numactl --cpunodebind 0 --membind 0 --physcpubind=0-31 python deepsparse_optimized_llama2.py --batch_size 32 --num_generation_tokens 128 --warmup_iterations 5 --num_iterations 5 2 | import time 3 | import numpy as np 4 | import argparse 5 | from deepsparse import TextGeneration 6 | 7 | # def benchmark(batch_size, num_generation_tokens, input_size, num_iterations, warmup_iterations): 8 | def benchmark(batch_size, num_generation_tokens, num_iterations, warmup_iterations): 9 | # Initialize the pipeline outside of timing 10 | pipeline = TextGeneration(model="hf:neuralmagic/Llama2-7b-chat-pruned50-quant-ds") 11 | 12 | # Prepare batch inputs (use realistic prompts of specified input size) 13 | batch_inputs = ["" for _ in range(batch_size)] 14 | 15 | # Warm-up phase 16 | for _ in range(warmup_iterations): 17 | _ = pipeline(batch_inputs, max_new_tokens=num_generation_tokens) 18 | 19 | # Benchmarking 20 | timings = [] 21 | # import pdb; pdb.set_trace()  # leftover breakpoint, disabled so the benchmark runs uninterrupted 22 | for _ in range(num_iterations): 23 | start_time = time.time() 24 | out = pipeline(batch_inputs, max_new_tokens=num_generation_tokens) 25 | end_time = time.time() 26 | timings.append(end_time - start_time) 27 | 28 | # Compute
statistics 29 | mean_time = np.mean(timings) 30 | std_time = np.std(timings) 31 | median_time = np.median(timings) 32 | throughput = (batch_size * num_generation_tokens) / mean_time # Tokens per second 33 | # Output results 34 | print(f"Mean inference time per batch: {mean_time:.4f} seconds") 35 | print(f"Standard deviation: {std_time:.4f} seconds") 36 | print(f"Median inference time per batch: {median_time:.4f} seconds") 37 | print(f"Throughput: {throughput:.2f} tokens per second") 38 | 39 | 40 | 41 | if __name__ == "__main__": 42 | parser = argparse.ArgumentParser(description="Benchmark DeepSparse Text Generation Pipeline") 43 | parser.add_argument("--batch_size", type=int, default=1, help="Batch size for the benchmark") 44 | parser.add_argument("--num_generation_tokens", type=int, default=1, help="Number of tokens to generate") 45 | # parser.add_argument("--input_size", type=int, default=1, help="Size of the input prompt (number of characters)") 46 | parser.add_argument("--num_iterations", type=int, default=100, help="Number of iterations for benchmarking") 47 | parser.add_argument("--warmup_iterations", type=int, default=10, help="Number of warm-up iterations") 48 | 49 | args = parser.parse_args() 50 | # Run the benchmark 51 | benchmark( 52 | batch_size=args.batch_size, 53 | num_generation_tokens=args.num_generation_tokens, 54 | # input_size=args.input_size, 55 | num_iterations=args.num_iterations, 56 | warmup_iterations=args.warmup_iterations 57 | ) 58 | -------------------------------------------------------------------------------- /SparAMX/generate_experiments.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | # Example arrays (you can replace these with your actual data) 4 | num_threads = [32] 5 | cores = [32] 6 | # modes = ['avx_sparse', 'sparse', 'stock'] 7 | modes = ['stock'] 8 | context_lengths = [512, 1024, 2048, 4096, 8192, 16384] 9 | generation_lengths = [2] 10 | batch_sizes = [1] 11 | model_ids = ['meta-llama/Meta-Llama-3-8B'] 12 | 13 | num_groups = [32] 14 | use_custom_attention = True 15 | use_custom_k = [True, False] 16 | use_custom_v = [True, False] 17 | k_pruning_percentages = [50, 60, 70, 80] 18 | v_pruning_percentages = [50, 60, 70, 80] 19 | 20 | # Define the CSV file path 21 | csv_file = "experiments.csv" 22 | 23 | # Create a CSV file and write the header 24 | with open(csv_file, mode="w", newline="") as file: 25 | writer = csv.writer(file) 26 | writer.writerow(["Model ID", "Saved Model Path", "Context Length", "Number of generated Tokens", "Mode", "Number of Column Groups", "Number of Threads", "Number of Cores", "Batch Size", "Use Custom Attention", "Use Custom K", "Use Custom V", "K Pruning Percentage", "V Pruning Percentage", "Prefill Latency", "Decode Latency", "Total Latency", "Layer Times"]) 27 | 28 | # Loop through the arrays and write the data 29 | for model_id in model_ids: 30 | for mode in modes: 31 | placeholder_saved_model_path = f"processed_models/{mode}/{model_id.split('/')[-1]}_num_threads_threads" 32 | for generation_length in generation_lengths: 33 | for context_length in context_lengths: 34 | for core in cores: 35 | for batch_size in batch_sizes: 36 | for use_custom_k_val in use_custom_k: 37 | for use_custom_v_val in use_custom_v: 38 | for num_thread in num_threads: 39 | saved_model_path = placeholder_saved_model_path.replace('_num_threads', f'_{num_thread}') 40 | for num_group in num_groups: 41 | original_model_path = f'{saved_model_path}' 42 | if mode == 'avx_sparse': 43 | 
saved_model_path = f'{saved_model_path}_{num_group}_groups' 44 | for k_pruning in k_pruning_percentages: 45 | for v_pruning in v_pruning_percentages: 46 | writer.writerow([model_id, saved_model_path, context_length, generation_length, mode, num_group, num_thread, core, batch_size, use_custom_attention, use_custom_k_val, use_custom_v_val, k_pruning, v_pruning]) 47 | if not(use_custom_k_val): 48 | break 49 | if not(use_custom_v_val): 50 | break 51 | saved_model_path = original_model_path 52 | if mode != 'avx_sparse': 53 | break 54 | if mode == 'stock': 55 | break 56 | 57 | print(f"CSV file '{csv_file}' has been generated with the data.") 58 | -------------------------------------------------------------------------------- /SparAMX/layer/onednn_linear.py: -------------------------------------------------------------------------------- 1 | import math 2 | from torch import Tensor, nn 3 | from torch.autograd import Function 4 | import torch 5 | 6 | import custom_onednn_linear 7 | 8 | 9 | class OneDnnLinear(nn.Linear): 10 | def __init__(self, in_features: int, out_features: int, bias: bool = True, device=None, dtype=None) -> None: 11 | super().__init__(in_features, out_features, bias, device, dtype) 12 | # import pdb; pdb.set_trace() 13 | self.onednn_primitive = custom_onednn_linear.get_onednn_descriptor(1,in_features, out_features) 14 | 15 | # we only override the forward to map to our custom kernel 16 | # backward will fallback to nn.Linear.backward() 17 | 18 | def forward(self, input: Tensor) -> Tensor: 19 | # print("[Info]: Entering custom linear implementation") 20 | # import pdb; pdb.set_trace() 21 | return custom_onednn_linear.forward(self.onednn_primitive, input, self.weight, self.bias) 22 | 23 | @classmethod 24 | def from_linear(cls, nn_linear_inst: nn.Linear, shallow=False): 25 | # TODO: not an efficient implementation, we will create another copy of parameters, imagine Billion scale llm! 26 | # see llm_pipeline.py for a "Wrapper" way 27 | new_inst = cls(nn_linear_inst.in_features, 28 | nn_linear_inst.out_features, 29 | nn_linear_inst.bias is not None, 30 | next(nn_linear_inst.parameters()).device, 31 | next(nn_linear_inst.parameters()).dtype) 32 | 33 | # new_inst.weight.data = nn_linear_inst.weight.data 34 | print(f"Before weight conversion: {nn_linear_inst.weight.data}") 35 | # Convert to AMX format: Group each 2 consecutive elements, then transpose then reshape to have the same original shape. 36 | new_inst.weight.data = nn_linear_inst.weight.data 37 | # ^ The above handles corner cases for the below. 
38 | # new_inst.weight.data = nn_linear_inst.weight.data.view(-1, 2, 2).transpose(0, 1).reshape(nn_linear_inst.weight.data.shape) 39 | print(f"After weight conversion: {new_inst.weight.data}") 40 | if new_inst.bias is not None: 41 | new_inst.bias.data = nn_linear_inst.bias.data 42 | return new_inst -------------------------------------------------------------------------------- /SparAMX/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.31.0 2 | aiohappyeyeballs==2.4.0 3 | aiohttp==3.10.11 4 | aiosignal==1.3.1 5 | asttokens==2.4.1 6 | attrs==24.2.0 7 | certifi==2024.7.4 8 | charset-normalizer==3.3.2 9 | coloredlogs==15.0.1 10 | comm==0.2.2 11 | contourpy==1.3.0 12 | cycler==0.12.1 13 | datasets==3.0.0 14 | debugpy==1.8.5 15 | decorator==5.1.1 16 | dill==0.3.8 17 | distlib==0.3.8 18 | evaluate==0.4.3 19 | executing==2.1.0 20 | filelock==3.15.4 21 | flatbuffers==24.3.25 22 | fonttools==4.53.1 23 | frozenlist==1.4.1 24 | fsspec==2024.6.1 25 | huggingface-hub==0.24.6 26 | humanfriendly==10.0 27 | idna==3.8 28 | iniconfig==2.0.0 29 | inquirerpy==0.3.4 30 | ipykernel==6.29.5 31 | ipython==8.27.0 32 | jedi==0.19.1 33 | Jinja2==3.1.5 34 | jupyter_client==8.6.2 35 | jupyter_core==5.7.2 36 | kiwisolver==1.4.7 37 | MarkupSafe==2.1.5 38 | matplotlib==3.9.2 39 | matplotlib-inline==0.1.7 40 | mpmath==1.3.0 41 | multidict==6.1.0 42 | multiprocess==0.70.16 43 | nest-asyncio==1.6.0 44 | networkx==3.3 45 | numpy==1.26.4 46 | nvidia-cublas-cu12==12.1.3.1 47 | nvidia-cuda-cupti-cu12==12.1.105 48 | nvidia-cuda-nvrtc-cu12==12.1.105 49 | nvidia-cuda-runtime-cu12==12.1.105 50 | nvidia-cudnn-cu12==8.9.2.26 51 | nvidia-cufft-cu12==11.0.2.54 52 | nvidia-curand-cu12==10.3.2.106 53 | nvidia-cusolver-cu12==11.4.5.107 54 | nvidia-cusparse-cu12==12.1.0.106 55 | nvidia-nccl-cu12==2.20.5 56 | nvidia-nvjitlink-cu12==12.6.20 57 | nvidia-nvtx-cu12==12.1.105 58 | onnx==1.17.0 59 | onnxconverter-common==1.14.0 60 | onnxruntime==1.19.2 61 | onnxruntime-tools==1.7.0 62 | optimum==1.22.0 63 | packaging==24.1 64 | pandas==2.2.2 65 | parso==0.8.4 66 | pexpect==4.9.0 67 | pfzy==0.3.4 68 | pillow==10.4.0 69 | platformdirs==4.3.2 70 | pluggy==1.5.0 71 | prompt_toolkit==3.0.47 72 | protobuf==3.20.2 73 | psutil==6.0.0 74 | ptyprocess==0.7.0 75 | pure_eval==0.2.3 76 | py-cpuinfo==9.0.0 77 | py3nvml==0.2.7 78 | pyarrow==17.0.0 79 | Pygments==2.18.0 80 | pyparsing==3.1.4 81 | pytest==8.3.2 82 | python-dateutil==2.9.0.post0 83 | pytz==2024.2 84 | PyYAML==6.0.2 85 | pyzmq==26.2.0 86 | regex==2024.7.24 87 | requests==2.32.3 88 | safetensors==0.4.4 89 | seaborn==0.13.2 90 | sentencepiece==0.2.0 91 | setuptools==73.0.1 92 | six==1.16.0 93 | sparse_linear==0.0.0 94 | stack-data==0.6.3 95 | sympy==1.13.2 96 | tf2onnx==1.16.1 97 | tokenizers==0.19.1 98 | torch==2.3.1 99 | tornado==6.4.2 100 | tqdm==4.66.5 101 | traitlets==5.14.3 102 | transformers==4.48.0 103 | typing_extensions==4.12.2 104 | tzdata==2024.1 105 | urllib3==2.2.2 106 | virtualenv==20.26.6 107 | wcwidth==0.2.13 108 | xmltodict==0.13.0 109 | xxhash==3.5.0 110 | yarl==1.11.1 111 | -------------------------------------------------------------------------------- /SparAMX/run_experiments.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Input and output CSV files 4 | input_csv="experiments.csv" 5 | time=$(date +"%Y-%m-%d_%H-%M-%S") 6 | dir="experiment_results/$time" 7 | mkdir -p $dir 8 | output_csv="$dir/results.csv" 9 | prefill_time_file="prefill_time.txt" 10 | 
decode_time_file="decode_time.txt" 11 | total_time_file="total_time.txt" 12 | layer_times_file="layer_times.txt" 13 | num_repeats=1 14 | # Read the header and write it to the output CSV 15 | header=$(head -n 1 "$input_csv") 16 | echo "$header" > "$output_csv" 17 | 18 | # Process the CSV file line by line, skipping the header 19 | tail -n +2 "$input_csv" | tr -d '\r' | while IFS=, read -r model_id saved_model_path context_length num_generated_tokens mode num_groups num_threads core batch_size enable_custom_attention use_custom_k use_custom_v k_pruning_percentage v_pruning_percentage; do 20 | 21 | for ((i=1; i<=$num_repeats; i++)); do 22 | rm -f $prefill_time_file 23 | rm -f $decode_time_file 24 | 25 | if [ "$mode" = "avx_sparse" ]; then 26 | sed -i '/# CHANGE BELOW FOR GROUP SIZE/!b;n;cNUM_GROUPS = '"$num_groups" layer/avx_sparse_linear.py 27 | sed -i '/\/\/ CHANGE BELOW FOR GROUP SIZE/!b;n;c#define NUM_COL_GROUPS '"$num_groups" csrc/avx_sparse_linear.cpp 28 | python setup.py install 29 | fi 30 | 31 | # construct new variable core_vals = "0-($core-1)" 32 | core_vals="0-$(($core-1))" 33 | 34 | enable_custom_attention_str="" 35 | if [ "$enable_custom_attention" = "True" ]; then 36 | enable_custom_attention_str="--enable_custom_attention" 37 | fi 38 | 39 | use_custom_k_str="" 40 | if [ "$use_custom_k" = "True" ]; then 41 | use_custom_k_str="--use_custom_k" 42 | fi 43 | 44 | use_custom_v_str="" 45 | if [ "$use_custom_v" = "True" ]; then 46 | use_custom_v_str="--use_custom_v" 47 | fi 48 | 49 | # Run the experiment 50 | numactl --cpunodebind 0 --membind 0 --physcpubind=$core_vals python llm_pipeline.py --model_id $model_id --saved_model_path $saved_model_path --context_length $context_length --num_generated_tokens $num_generated_tokens --mode $mode --num_threads $num_threads --batch_size $batch_size $enable_custom_attention_str $use_custom_k_str $use_custom_v_str --k_pruning $k_pruning_percentage --v_pruning $v_pruning_percentage 51 | 52 | # Read the result from the result file 53 | time_prefill=$(<$prefill_time_file) 54 | time_decode=$(<$decode_time_file) 55 | total_time=$(<$total_time_file) 56 | # layer_times=$(<$layer_times_file) 57 | 58 | # Construct the new CSV line with the result 59 | new_line="$model_id,$saved_model_path,$context_length,$num_generated_tokens,$mode,$num_groups,$num_threads,$core,$batch_size,$enable_custom_attention,$use_custom_k,$use_custom_v,$k_pruning_percentage,$v_pruning_percentage,$time_prefill,$time_decode,$total_time" 60 | # new_line="$model_id,$saved_model_path,$context_length,$num_generated_tokens,$mode,$num_groups,$num_threads,$core,$batch_size,$enable_custom_attention,$use_custom_k,$use_custom_v,$k_pruning_percentage,$v_pruning_percentage,$time_prefill,$time_decode,$total_time,$layer_times" 61 | # new_line="$model_id,$saved_model_path,$context_length,$num_generated_tokens,$mode,$num_groups,$num_threads,$core,$batch_size,$time_prefill,$time_decode" 62 | 63 | # Append the new line to the output CSV 64 | echo "$new_line" >> "$output_csv" 65 | done 66 | done 67 | -------------------------------------------------------------------------------- /SparAMX/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import shutil 4 | import torch 5 | from setuptools import setup, find_packages, Command 6 | from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDA_HOME 7 | from pathlib import Path 8 | 9 | # Get PyTorch library path 10 | TORCH_LIB_PATH = str(Path(torch.__file__).parent / 'lib') 
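# (The compiled extensions link against libtorch; this path is exported via
# LD_LIBRARY_PATH below and baked into each extension's rpath through the
# -Wl,-rpath linker flag, so the built .so files can locate PyTorch's shared
# libraries at import time.)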
11 | 12 | # Add torch lib path to environment 13 | if 'LD_LIBRARY_PATH' in os.environ: 14 | os.environ['LD_LIBRARY_PATH'] = f"{TORCH_LIB_PATH}:{os.environ['LD_LIBRARY_PATH']}" 15 | else: 16 | os.environ['LD_LIBRARY_PATH'] = TORCH_LIB_PATH 17 | 18 | class CustomCleanCommand(Command): 19 | """Custom clean command to tidy up the project root.""" 20 | user_options = [] 21 | 22 | def initialize_options(self): 23 | pass 24 | 25 | def finalize_options(self): 26 | pass 27 | 28 | def run(self): 29 | patterns_to_remove = [ 30 | 'sparamx.egg-info', 31 | 'sparamx/*.so', 32 | 'sparamx/*.pyd', 33 | ] 34 | 35 | build_dirs = ['./build', './dist'] 36 | for dir_path in build_dirs: 37 | if os.path.exists(dir_path): 38 | print(f'Removing directory: {dir_path}') 39 | shutil.rmtree(dir_path) 40 | 41 | for pattern in patterns_to_remove: 42 | for item in glob.glob(pattern): 43 | if os.path.isdir(item): 44 | print(f'Removing directory: {item}') 45 | shutil.rmtree(item) 46 | elif os.path.isfile(item): 47 | print(f'Removing file: {item}') 48 | os.remove(item) 49 | 50 | for root, dirs, files in os.walk('./sparamx'): 51 | if '__pycache__' in dirs: 52 | cache_dir = os.path.join(root, '__pycache__') 53 | print(f'Removing directory: {cache_dir}') 54 | shutil.rmtree(cache_dir) 55 | 56 | # Common compiler and linker arguments 57 | extra_compile_args = [ 58 | '-mamx-tile', 59 | '-mamx-int8', 60 | '-mamx-bf16', 61 | '-fopenmp', 62 | '-O3', 63 | '-DNDEBUG', 64 | '-march=sapphirerapids', 65 | '-mavx512f', 66 | '-mavx512dq' 67 | ] 68 | 69 | # Add PyTorch include paths 70 | include_dirs = [ 71 | os.path.join(torch.utils.cpp_extension.include_paths()[0], 'torch', 'csrc', 'api', 'include'), 72 | os.path.join(torch.utils.cpp_extension.include_paths()[0], 'torch', 'lib'), 73 | ] 74 | 75 | # Define extensions 76 | extension_specs = [ 77 | ("sparse_linear", "csrc/sparse_linear.cpp"), 78 | ("avx_sparse_linear", "csrc/avx_sparse_linear.cpp"), 79 | ("quantized_sparse_linear", "csrc/quantized_sparse_linear.cpp"), 80 | ("quantized_dense_linear", "csrc/quantized_dense_linear.cpp"), 81 | ("dense_linear", "csrc/dense_linear.cpp"), 82 | ] 83 | 84 | extensions = [] 85 | for name, source in extension_specs: 86 | source_path = os.path.abspath(source) 87 | print(f"Setting up extension {name} from source {source_path}") 88 | 89 | if not os.path.exists(source_path): 90 | print(f"WARNING: Source file not found: {source_path}") 91 | continue 92 | 93 | ext = CppExtension( 94 | name=f"sparamx.{name}", 95 | sources=[source_path], 96 | include_dirs=include_dirs, 97 | extra_compile_args=extra_compile_args, 98 | extra_link_args=['-lgomp', f'-Wl,-rpath,{TORCH_LIB_PATH}'] 99 | ) 100 | extensions.append(ext) 101 | 102 | setup( 103 | name="sparamx", 104 | version="0.1.0", 105 | packages=find_packages(), 106 | ext_modules=extensions, 107 | cmdclass={ 108 | 'build_ext': BuildExtension, 109 | 'clean': CustomCleanCommand, 110 | } 111 | ) -------------------------------------------------------------------------------- /security.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, impact, severity and mitigation. 3 | 4 | ## Reporting a Vulnerability 5 | Please report any security vulnerabilities in this project [utilizing the guidelines here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). 
6 | --------------------------------------------------------------------------------