├── .gitattributes
├── .gitignore
├── BootstrapNAS
├── README.md
├── architecture.png
├── examples
│ ├── .gitignore
│ ├── BootstrapNAS.ipynb
│ ├── README.md
│ ├── bootstrapnas_utils.py
│ ├── imgs
│ │ ├── architecture.png
│ │ └── search_progression.png
│ ├── imports_bnas.py
│ ├── third_party_search
│ │ ├── dynas-t_external_search_resnet50_supernet.ipynb
│ │ ├── dynast_bootstrapnas_resnet50_cifar10_example.png
│ │ └── sigopt_external_search_resnet50_supernet.ipynb
│ └── yolox-nano
│ │ ├── README.md
│ │ ├── class_list.txt
│ │ ├── nohup_wo_ignored_scope.out
│ │ ├── search_progression.png
│ │ └── yolox-bootstrapnas.patch
├── instructions
│ ├── Configuration.md
│ ├── Home.md
│ ├── Quickstart.md
│ └── Subnetwork_Search.md
├── models
│ ├── pretrained
│ │ └── resnet50.pt
│ └── supernets
│ │ └── cifar10
│ │ │ └── resnet50
│ │ │ │ ├── config.json
│ │ │ │ ├── elasticity.pth
│ │ │ │ ├── results.md
│ │ │ │ ├── search_progression.png
│ │ │ │ └── supernet_weights.pth
└── requirements.txt
├── EFTNAS
├── README.md
├── eftnas_configs
│ ├── nncf_base_config_for_bert_base.json
│ ├── nncf_eftnas_s1_bert_base_cola.json
│ ├── nncf_eftnas_s1_bert_base_mnli.json
│ ├── nncf_eftnas_s1_bert_base_mrpc.json
│ ├── nncf_eftnas_s1_bert_base_qnli.json
│ ├── nncf_eftnas_s1_bert_base_qqp.json
│ ├── nncf_eftnas_s1_bert_base_rte.json
│ ├── nncf_eftnas_s1_bert_base_squadv1.json
│ ├── nncf_eftnas_s1_bert_base_squadv2.json
│ ├── nncf_eftnas_s1_bert_base_sst2.json
│ ├── nncf_eftnas_s2_bert_medium_cola.json
│ ├── nncf_eftnas_s2_bert_medium_mnli.json
│ ├── nncf_eftnas_s2_bert_medium_mrpc.json
│ ├── nncf_eftnas_s2_bert_medium_qnli.json
│ ├── nncf_eftnas_s2_bert_medium_qqp.json
│ ├── nncf_eftnas_s2_bert_medium_rte.json
│ ├── nncf_eftnas_s2_bert_medium_squadv1.json
│ ├── nncf_eftnas_s2_bert_medium_squadv2.json
│ └── nncf_eftnas_s2_bert_medium_sst2.json
├── eftnas_search_space
│ └── generate_eftnas_search_space.py
├── figures
│ └── eftnas_pipeline.png
├── install.sh
├── patches
│ ├── nncf.patch
│ └── transformers.patch
└── running_commands
│ ├── cola.sh
│ ├── mnli.sh
│ ├── mrpc.sh
│ ├── qnli.sh
│ ├── qqp.sh
│ ├── rte.sh
│ ├── squadv1.sh
│ ├── squadv2.sh
│ └── sst2.sh
├── EZNAS
├── README.md
├── dataset_utils.py
├── evol_config.yaml
├── evol_utilities.py
├── gp_func_defs.py
├── nasspace.py
├── novel_search.py
├── reproduce.sh
├── reproduce
│ └── all_tests.csv
├── runjob.sh
├── setup_script.sh
└── verify_scores.py
├── LICENSE
├── LoNAS
├── README.md
├── install.sh
├── nncf_config
│ ├── glue
│ │ ├── nncf_lonas_bert_base_cola.json
│ │ ├── nncf_lonas_bert_base_mnli.json
│ │ ├── nncf_lonas_bert_base_mrpc.json
│ │ ├── nncf_lonas_bert_base_qnli.json
│ │ ├── nncf_lonas_bert_base_qqp.json
│ │ ├── nncf_lonas_bert_base_rte.json
│ │ ├── nncf_lonas_bert_base_sst2.json
│ │ └── nncf_lonas_bert_base_stsb.json
│ ├── unified_commonsense
│ │ └── nncf_lonas_llama_7b.json
│ └── unified_math
│ │ ├── nncf_lonas_bloomz_7b.json
│ │ ├── nncf_lonas_llama_13b.json
│ │ └── nncf_lonas_llama_7b.json
├── patches
│ ├── nncf-544d5141.patch
│ ├── peft-v0.5.0.patch
│ └── transformers-v4.31.0.patch
├── run_commonsense.py
├── run_glue.py
├── run_math.py
└── running_commands
├── Mamba-Shedder
├── README.md
├── eval.py
├── extract
│ ├── README.md
│ └── extract_mamba.py
├── hybrid
│ ├── Hymba-Pruning
│ │ ├── README.md
│ │ ├── eval.py
│ │ ├── extract
│ │ │ ├── README.md
│ │ │ └── extract_hymba.py
│ │ ├── patches
│ │ │ ├── hymba-e1b7ee9.patch
│ │ │ └── transformers-v4.47.0.patch
│ │ ├── prune.py
│ │ ├── recovery
│ │ │ ├── README.md
│ │ │ ├── finetune_hymba.py
│ │ │ └── merge.py
│ │ └── results
│ │ │ ├── README.md
│ │ │ └── hymba-1.5b-base
│ │ │ │ ├── eval.res.config.hymba_block.5.json
│ │ │ │ ├── eval.res.config.hymba_block.6.json
│ │ │ │ ├── eval.res.config.hymba_block.7.json
│ │ │ │ └── pruning_config.json
│ └── Zamba2-Pruning
│ │ ├── README.md
│ │ ├── eval.py
│ │ ├── extract
│ │ │ └── extract_zamba2.py
│ │ ├── install.sh
│ │ ├── patches
│ │ │ └── zamba2-7593823.patch
│ │ ├── preprocess.py
│ │ ├── prune.py
│ │ ├── prune_hybrid.py
│ │ ├── recovery
│ │ │ └── finetune_zamba2.py
│ │ └── results
│ │ │ ├── README.md
│ │ │ └── zamba2-2.7b
│ │ │ │ ├── ratio_10
│ │ │ │ │ ├── eval.res.config.ssm.40.json
│ │ │ │ │ └── pruning_config.json
│ │ │ │ └── ratio_15
│ │ │ │ │ ├── eval.res.config.ssm.45.json
│ │ │ │ │ └── pruning_config.json
├── install.sh
├── patches
│ └── mamba-62db608.patch
├── prune.py
├── recovery
│ └── finetune_mamba.py
├── results
│ ├── README.md
│ ├── mamba-2.8b
│ │ ├── eval.res.config.mamba_block.13.json
│ │ ├── eval.res.config.mamba_block.6.json
│ │ └── pruning_config.json
│ └── mamba2-2.7b
│ │ ├── eval.res.config.ssm.15.json
│ │ ├── eval.res.config.ssm.19.json
│ │ ├── eval.res.config.ssm.21.json
│ │ ├── eval.res.config.ssm.23.json
│ │ └── pruning_config.json
└── utils.py
├── MultiPruner
├── README.md
├── eval.py
├── extract
│ ├── README.md
│ └── extract_model.py
├── install.sh
├── patches
│ └── transformers-v4.45.0.patch
├── recovery
│ ├── README.md
│ ├── finetune.py
│ └── merge.py
├── requirements.txt
├── results
│ ├── Baichuan2-13B-Base
│ │ └── ratio_24
│ │ │ ├── eval.res.json
│ │ │ └── pruning_config.json
│ ├── Baichuan2-7B-Base
│ │ └── ratio_22
│ │ │ ├── eval.res.json
│ │ │ └── pruning_config.json
│ ├── Llama-2-13B
│ │ └── ratio_25
│ │ │ ├── eval.res.json
│ │ │ └── pruning_config.json
│ ├── Llama-2-7B
│ │ ├── ratio_10
│ │ │ ├── eval.res.json
│ │ │ └── pruning_config.json
│ │ ├── ratio_12
│ │ │ ├── eval.res.json
│ │ │ └── pruning_config.json
│ │ ├── ratio_14
│ │ │ ├── eval.res.json
│ │ │ └── pruning_config.json
│ │ ├── ratio_15
│ │ │ ├── eval.res.json
│ │ │ └── pruning_config.json
│ │ ├── ratio_18
│ │ │ ├── eval.res.json
│ │ │ └── pruning_config.json
│ │ ├── ratio_22
│ │ │ ├── eval.res.json
│ │ │ └── pruning_config.json
│ │ └── ratio_7
│ │ │ ├── eval.res.json
│ │ │ └── pruning_config.json
│ ├── Llama-3.1-8B
│ │ ├── ratio_10
│ │ │ ├── eval.res.json
│ │ │ └── pruning_config.json
│ │ ├── ratio_17
│ │ │ ├── eval.res.json
│ │ │ └── pruning_config.json
│ │ └── ratio_20
│ │ │ ├── eval.res.json
│ │ │ └── pruning_config.json
│ ├── Llama-3.2-3B
│ │ └── ratio_9
│ │ │ ├── eval.res.json
│ │ │ └── pruning_config.json
│ ├── Meta-Llama-3-8B
│ │ ├── ratio_10
│ │ │ ├── eval.res.json
│ │ │ └── pruning_config.json
│ │ ├── ratio_17
│ │ │ ├── eval.res.json
│ │ │ └── pruning_config.json
│ │ └── ratio_20
│ │ │ ├── eval.res.json
│ │ │ └── pruning_config.json
│ ├── Qwen1.5-14B
│ │ └── ratio_24
│ │ │ ├── eval.res.json
│ │ │ └── pruning_config.json
│ ├── Qwen1.5-7B
│ │ └── ratio_22
│ │ │ ├── eval.res.json
│ │ │ └── pruning_config.json
│ ├── Qwen2.5-7B
│ │ ├── ratio_10
│ │ │ ├── eval.res.json
│ │ │ └── pruning_config.json
│ │ └── ratio_20
│ │ │ ├── eval.res.json
│ │ │ └── pruning_config.json
│ └── README.md
├── run_multipruner.py
└── utils.py
├── README.md
├── SQFT
├── README.md
├── install.sh
├── legacy
│ ├── README.md
│ ├── eval
│ │ └── evaluate_math.py
│ ├── install.sh
│ ├── install_inference.sh
│ ├── modules
│ │ └── sqft_linear.py
│ ├── notebooks
│ │ ├── sqft_lora.ipynb
│ │ ├── sqft_nls.ipynb
│ │ ├── sqft_qa_sparsepeft_lora.ipynb
│ │ ├── sqft_qa_sparsepeft_nls.ipynb
│ │ ├── sqft_sparsepeft_lora.ipynb
│ │ └── sqft_sparsepeft_nls.ipynb
│ ├── opea
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ ├── dataset
│ │ │ ├── arce_train_instruct.json
│ │ │ └── preprocess_arc.py
│ │ ├── example_nncf_config
│ │ │ └── nncf_config.json
│ │ └── search.py
│ ├── patches
│ │ ├── nncf-f143e1c.patch
│ │ ├── peft-v0.10.0.patch
│ │ ├── transformers-v4.44.2.patch
│ │ └── wanda-8e8fc87.patch
│ ├── run_command
│ │ ├── README.md
│ │ ├── llama-3-8b
│ │ │ ├── run.sh
│ │ │ └── sparse_quantization.sh
│ │ ├── mistral-7b-v0.3
│ │ │ ├── run.sh
│ │ │ └── sparse_quantization.sh
│ │ └── phi-3-mini-4k-instruct
│ │ │ ├── run.sh
│ │ │ └── sparse_quantization.sh
│ ├── run_instruction_tuning.py
│ ├── run_standard_tuning.py
│ └── utils
│ │ ├── check_sparsity.py
│ │ ├── create_sqft_nncf_config.py
│ │ ├── extract_sub_adapter.py
│ │ ├── load_dataset.py
│ │ ├── merge.py
│ │ └── quantization.py
├── modules
│ ├── elastic_lora_linear.py
│ └── sqft_qa_linear.py
├── patches
│ ├── peft-v0.10.0.patch
│ └── wanda-8e8fc87.patch
├── run_sqft.py
└── utils
│ ├── check_sparsity.py
│ ├── extract_sub_adapter.py
│ ├── load_dataset.py
│ ├── merge.py
│ └── quantization.py
├── Shears
├── README.md
├── example_commonsense.py
├── example_math.py
├── install.sh
├── nncf_config
│ ├── nncf_config.md
│ ├── nncf_shears_llama.json
│ ├── nncf_shears_llama_with_gate_proj.json
│ └── nncf_shears_mpt.json
├── patches
│ ├── nncf-544d5141.patch
│ ├── peft-v0.5.0-inference.patch
│ ├── peft-v0.5.0.patch
│ └── transformers-v4.31.0.patch
├── preprocess
│ └── mpt_process
│ │ ├── mpt-7b-modifications-for-shears-usage.patch
│ │ ├── split_qkv_preprocess.py
│ │ └── wanda
│ │ │ ├── main_mpt.py
│ │ │ └── prune_mpt.py
├── run_commonsense.py
├── run_gsm8k.py
├── run_math.py
├── running_commands
├── search
│ ├── load_and_explore_supernet.ipynb
│ └── supernet.py
└── utils
│ └── utils.py
├── SparAMX
├── .gitignore
├── README.md
├── Videos
│ ├── sparamx.gif
│ └── stock.gif
├── benchmark_deepsparse.sh
├── compare_stock_vs_custom_linear.py
├── compare_stock_vs_onednn_linear.py
├── compare_stock_vs_sparse_linear.py
├── csrc
│ ├── avx_sparse_linear.cpp
│ ├── dense_linear.cpp
│ ├── example_utils.hpp
│ ├── onednn_linear.cpp
│ ├── quantized_dense_linear.cpp
│ ├── quantized_sparse_linear.cpp
│ ├── sparse_linear.cpp
│ └── sparse_linear_temp.cpp
├── custom_llama_attention.py
├── deepsparse_optimized_llama2.py
├── generate_attention_experiments.py
├── generate_experiments.py
├── layer
│ ├── avx_sparse_linear.py
│ ├── dense_linear.py
│ ├── onednn_linear.py
│ ├── quantized_dense_linear.py
│ ├── quantized_sparse_linear.py
│ └── sparse_linear.py
├── llm_pipeline.py
├── openvino
│ └── run_llama_2_8b.py
├── requirements.txt
├── run_experiments.sh
├── run_experiments_attention.sh
├── setup.py
├── test_avx_sparse_attention.py
├── test_avx_sparse_layer.py
├── test_dense_attention.py
├── test_dense_layer.py
├── test_onednn_layer.py
├── test_quantized_dense_layer.py
├── test_quantized_sparse_layer.py
├── test_sparse_attention.py
└── test_sparse_layer.py
└── security.md
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.pt filter=lfs diff=lfs merge=lfs -text
2 | *.pth filter=lfs diff=lfs merge=lfs -text
3 | *.bin filter=lfs diff=lfs merge=lfs -text
4 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | # Mac
132 | .DS_Store
133 |
134 | # Pycharm
135 | .idea
--------------------------------------------------------------------------------
/BootstrapNAS/README.md:
--------------------------------------------------------------------------------
1 | # BootstrapNAS Jupyter Notebooks
2 |
3 | ---
4 |
5 | ![BootstrapNAS architecture](architecture.png)
6 |
7 |
8 |
9 | BootstrapNAS (1) takes a pre-trained model as input. (2) It uses this model to generate a weight-sharing super-network. (3) BootstrapNAS then applies a training strategy, and once the super-network has been trained, (4) it searches for efficient sub-networks that satisfy the user's requirements. (5) The configuration of the discovered sub-network(s) is returned to the user.
10 |
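The whole flow can be driven from Python using the API surface imported in `examples/imports_bnas.py`. The sketch below is illustrative only: `NNCFConfig`, `create_nncf_network`, `EpochBasedTrainingAlgorithm`, and `SearchAlgorithm` are the real NNCF entry points (see the imports file), but the method names and argument orders shown here are assumptions, and `pretrained_model`, the dataloaders, and the train/validate callbacks are placeholders; `examples/BootstrapNAS.ipynb` shows the authoritative usage.

```python
# Illustrative sketch of steps (1)-(5); signatures are assumptions, see the notebook.
from nncf import NNCFConfig
from nncf.experimental.torch.nas.bootstrapNAS import EpochBasedTrainingAlgorithm, SearchAlgorithm
from nncf.torch.model_creation import create_nncf_network

nncf_config = NNCFConfig.from_json("models/supernets/cifar10/resnet50/config.json")

# (1)-(2): wrap the pre-trained model and generate the weight-sharing super-network.
nncf_network = create_nncf_network(pretrained_model, nncf_config)
training = EpochBasedTrainingAlgorithm.from_config(nncf_network, nncf_config)

# (3): apply the training strategy (progressive shrinking, per the config above).
elasticity_ctrl, supernet = training.run(train_epoch_fn, train_loader, val_loader, validate_fn)

# (4)-(5): search for efficient sub-networks and return the best configuration found.
search = SearchAlgorithm.from_config(supernet, elasticity_ctrl, nncf_config)
elasticity_ctrl, best_subnet_config, performance = search.run(validate_fn, val_loader, "output")
```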
11 | ## Quickstart
12 |
13 | Please follow the instructions [here](https://github.com/jpablomch/bootstrapnas/wiki/Quickstart).
14 |
15 | If you already have a super-network trained with BootstrapNAS, please follow the instructions [here](https://github.com/jpablomch/bootstrapnas/wiki/Subnetwork_Search) to search for sub-networks.
16 |
17 | More information about BootstrapNAS is available in our papers:
18 |
19 | [Automated Super-Network Generation for Scalable Neural Architecture Search](https://openreview.net/pdf?id=HK-zmbTB8gq).
20 |
21 | ```bibtex
22 | @inproceedings{
23 | munoz2022automated,
24 | title={Automated Super-Network Generation for Scalable Neural Architecture Search},
25 | author={Muñoz, J. Pablo and Lyalyushkin, Nikolay and Lacewell, Chaunte and Senina, Anastasia and Cummings, Daniel and Sarah, Anthony and Kozlov, Alexander and Jain, Nilesh},
26 | booktitle={First Conference on Automated Machine Learning (Main Track)},
27 | year={2022},
28 | url={https://openreview.net/pdf?id=HK-zmbTB8gq}
29 | }
30 | ```
31 | [Enabling NAS with Automated Super-Network Generation](https://arxiv.org/abs/2112.10878)
32 |
33 | ```bibtex
34 | @article{
35 | bootstrapNAS,
36 | author = {Muñoz, J. Pablo and Lyalyushkin, Nikolay and Akhauri, Yash and Senina, Anastasia and Kozlov, Alexander and Jain, Nilesh},
37 | title = {Enabling NAS with Automated Super-Network Generation},
38 | journal = {1st International Workshop on Practical
39 | Deep Learning in the Wild at AAAI},
40 | year = {2022},
41 | url = {https://practical-dl.github.io/2022/short_paper/21.pdf},
42 | }
43 | ```
44 |
45 | ## Contributing to BootstrapNAS
46 | Please follow the contribution guidelines in [NNCF](https://github.com/openvinotoolkit/nncf).
47 |
48 |
--------------------------------------------------------------------------------
/BootstrapNAS/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/7549413d38677dd6eb92f918f7cc003dc65d1deb/BootstrapNAS/architecture.png
--------------------------------------------------------------------------------
/BootstrapNAS/examples/.gitignore:
--------------------------------------------------------------------------------
1 | data
2 | model
3 | output
4 | .DS_Store
--------------------------------------------------------------------------------
/BootstrapNAS/examples/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Automated Neural Architecture Search with BootstrapNAS
3 |
4 |
5 | This notebook demonstrates how to use [BootstrapNAS](https://arxiv.org/abs/2112.10878), a capability in NNCF to generate weight-sharing super-networks from pre-trained models. Once the super-network has been generated, BootstrapNAS can train it and search for efficient sub-networks.
6 |
7 | ## Examples of using third-party solutions to search for sub-networks
8 |
9 | - [SigOpt](third_party_search/sigopt_external_search_resnet50_supernet.ipynb)
10 | - [DyNAS-T](third_party_search/dynas-t_external_search_resnet50_supernet.ipynb)
--------------------------------------------------------------------------------
/BootstrapNAS/examples/imgs/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/7549413d38677dd6eb92f918f7cc003dc65d1deb/BootstrapNAS/examples/imgs/architecture.png
--------------------------------------------------------------------------------
/BootstrapNAS/examples/imgs/search_progression.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/7549413d38677dd6eb92f918f7cc003dc65d1deb/BootstrapNAS/examples/imgs/search_progression.png
--------------------------------------------------------------------------------
/BootstrapNAS/examples/imports_bnas.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Intel Corporation
2 | # SPDX-License-Identifier: MIT
3 |
4 | import sys
5 | import time
6 | import zipfile
7 | from pathlib import Path
8 | import logging
9 | import warnings # to disable warnings on export to ONNX
10 | warnings.filterwarnings("ignore")
11 | warnings.simplefilter('ignore')
12 |
13 | import torch
14 | import torch.nn as nn
15 | import torch.nn.parallel
16 | import torch.optim
17 | import torch.utils.data
18 | import torch.utils.data.distributed
19 | import torchvision.datasets as datasets
20 | import torchvision.models as models
21 | import torchvision.transforms as transforms
22 |
23 | import nncf # Important - should be imported directly after torch
24 | from nncf.common.utils.logger import set_log_level
25 | set_log_level(logging.ERROR) # Disables all NNCF info and warning messages
26 | from nncf import NNCFConfig
27 | from nncf.config.structures import BNAdaptationInitArgs
28 | from nncf.experimental.torch.nas.bootstrapNAS import EpochBasedTrainingAlgorithm
29 | from nncf.experimental.torch.nas.bootstrapNAS import SearchAlgorithm
30 | from nncf.torch import create_compressed_model, register_default_init_args
31 | from nncf.torch.initialization import wrap_dataloader_for_init
32 | from nncf.torch.model_creation import create_nncf_network
33 |
34 | from bootstrapnas_utils import resnet50_cifar10, validate, train_epoch, create_folders_demo, create_cifar10_dataloader, download_file
35 |
36 | torch.manual_seed(0)
37 |
38 | print("Imported PyTorch and NNCF")
--------------------------------------------------------------------------------
/BootstrapNAS/examples/third_party_search/dynast_bootstrapnas_resnet50_cifar10_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/7549413d38677dd6eb92f918f7cc003dc65d1deb/BootstrapNAS/examples/third_party_search/dynast_bootstrapnas_resnet50_cifar10_example.png
--------------------------------------------------------------------------------
/BootstrapNAS/examples/yolox-nano/README.md:
--------------------------------------------------------------------------------
1 | # YoloX-NAS
2 |
3 | ----
4 | ### Prepare Dataset
5 | ```bash
6 | cd /data/dataset/
7 | wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
8 | tar xf VOCtrainval_06-Nov-2007.tar
9 | mkdir voc2007_coco
10 | cd voc2007_coco
11 |
12 | wget https://raw.githubusercontent.com/yukkyo/voc2coco/master/voc2coco.py
13 | mkdir annotations
14 | ln -s ../VOCdevkit/VOC2007/Annotations .
15 | ln -s ../VOCdevkit/VOC2007/ImageSets .
16 | ```
17 |
18 | Add class_list.txt to this directory.
19 |
20 | ```bash
21 | python3 voc2coco.py --ann_dir Annotations/ --ann_ids ImageSets/Main/train.txt --labels class_list.txt --output annotations/instances_train.json --ext xml --extract_num_from_imgid
22 | python3 voc2coco.py --ann_dir Annotations/ --ann_ids ImageSets/Main/val.txt --labels class_list.txt --output annotations/instances_val.json --ext xml --extract_num_from_imgid
23 |
24 | ln -s ../VOCdevkit/VOC2007/JPEGImages/ train2017
25 | ln -s ../VOCdevkit/VOC2007/JPEGImages/ val2017
26 |
27 | # Change to your working directory.
28 |
29 | git clone https://github.com/Megvii-BaseDetection/YOLOX.git && cd YOLOX
30 | git checkout -b bootstrapnas bb9185c095dfd7a8015a1b82f3e9a065090860b8
31 | git apply < /path/to/yolox-bootstrapnas.patch
32 |
33 | cd datasets && ln -s ../../VOCdevkit/VOC2007 && cd -
34 | cd datasets && ln -s /data/dataset/voc2007_coco/ VOC2007 && cd -
35 |
36 | poetry config --local virtualenvs.in-project true
37 | poetry install
38 | poetry shell
39 |
40 |
41 | # Train without BootstrapNAS to get the pretrained weights
42 | wget https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_nano.pth
43 | C=yolox_nano.pth
44 | PYTHONPATH=. nohup 2>&1 python tools/train.py -f exps/default/yolox_nano_voc-e50.py -d 1 -b 6 -o -c $C --cache
45 |
46 | # Train with BootstrapNAS using the pretrained initial weights
47 | C=YOLOX_outputs/yolox_nano_voc-e50/best_ckpt.pth
48 | PYTHONPATH=. nohup 2>&1 python tools/train.py -f exps/default/first_try.py -d 1 -b 6 -o -c $C --cache --nncf_config_path nncf_config_yolox_bootstrapNAS.json
49 | ```
50 |
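For reference, the `bootstrapNAS` section of the NNCF config passed via `--nncf_config_path` follows the same schema as the bundled ResNet-50 super-network config (`models/supernets/cifar10/resnet50/config.json`). A trimmed sketch as a Python dict; every value below is illustrative, not the actual YOLOX-Nano configuration:

```python
# Illustrative schema only -- not the contents of nncf_config_yolox_bootstrapNAS.json.
nncf_config = {
    "input_info": {"sample_size": [1, 3, 416, 416]},  # assumed input shape
    "bootstrapNAS": {
        "training": {
            "algorithm": "progressive_shrinking",
            "progressivity_of_elasticity": ["depth", "width"],
            "elasticity": {
                "available_elasticity_dims": ["width", "depth"],
                "width": {"width_multipliers": [1, 0.8, 0.6, 0.5]},
            },
        },
        "search": {"algorithm": "NSGA2", "num_evals": 3000, "population": 50},
    },
}
```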
--------------------------------------------------------------------------------
/BootstrapNAS/examples/yolox-nano/class_list.txt:
--------------------------------------------------------------------------------
1 | aeroplane
2 | bicycle
3 | bird
4 | boat
5 | bottle
6 | bus
7 | car
8 | cat
9 | chair
10 | cow
11 | diningtable
12 | dog
13 | horse
14 | motorbike
15 | person
16 | pottedplant
17 | sheep
18 | sofa
19 | train
20 | tvmonitor
--------------------------------------------------------------------------------
/BootstrapNAS/examples/yolox-nano/search_progression.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/7549413d38677dd6eb92f918f7cc003dc65d1deb/BootstrapNAS/examples/yolox-nano/search_progression.png
--------------------------------------------------------------------------------
/BootstrapNAS/instructions/Home.md:
--------------------------------------------------------------------------------
1 | ### BootstrapNAS: Automated Super-Network Generation for Scalable Neural Architecture Search
2 |
3 | ### [Quickstart](https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/tree/main/BootstrapNAS/instructions/Quickstart.md)
4 |
5 | ### [Sub-network Search](https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/tree/main/BootstrapNAS/instructions/Subnetwork_Search.md)
6 |
7 | ### [Configuration](https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/tree/main/BootstrapNAS/instructions/Configuration.md)
8 |
9 |
--------------------------------------------------------------------------------
/BootstrapNAS/instructions/Subnetwork_Search.md:
--------------------------------------------------------------------------------
1 | ### Search on an existing super-network
2 |
3 | If you have a trained super-network, you can start the search stage directly using the ```bootstrap_nas_search.py``` script located [here](https://github.com/openvinotoolkit/nncf/blob/develop/examples/experimental/torch/classification/bootstrap_nas_search.py).
4 |
5 | You must pass the path where the weights and elasticity information have been stored, which is your log directory by default.
6 |
7 | ```shell
8 | python bootstrap_nas_search.py -m train \
9 | --config <path to the config used when training the super-network> \
10 | --log-dir <path to log directory> \
11 | --dataset <dataset name> \
12 | --data <path to dataset> \
13 | --elasticity-state-path <path to elasticity.pth> \
14 | --supernet-weights <path to supernet_weights.pth> \
15 | --search-mode
16 | ```
20 |
21 | #### Hardware-aware search
22 |
23 | BootstrapNAS can be made hardware-aware when searching for efficient sub-networks. To accomplish this, you can pass your own `efficiency evaluator` for the target hardware to the search component.
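The exact interface expected by the search component is version-specific, so the snippet below is only a sketch of the idea: an efficiency evaluator is a callable that scores a candidate sub-network on the target hardware (here, a hypothetical wall-clock latency measurement) and takes the place of a proxy metric such as MACs. The function name and signature are illustrative, not an NNCF API.

```python
import statistics
import time

import torch


def latency_evaluator(subnet: torch.nn.Module, sample: torch.Tensor,
                      warmup: int = 5, iters: int = 20) -> float:
    """Hypothetical efficiency evaluator: median forward-pass latency in ms."""
    subnet.eval()
    with torch.no_grad():
        for _ in range(warmup):  # warm up before timing
            subnet(sample)
        times_ms = []
        for _ in range(iters):
            start = time.perf_counter()
            subnet(sample)
            times_ms.append((time.perf_counter() - start) * 1e3)
    return statistics.median(times_ms)
```

See the Configuration instructions and the search script for how to hook an evaluator into your NNCF version.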
--------------------------------------------------------------------------------
/BootstrapNAS/models/pretrained/resnet50.pt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:af4a5794552d80db4554b1f6fe260bdd3357da77b0707d00a39cb206c2811b90
3 | size 94403311
4 |
--------------------------------------------------------------------------------
/BootstrapNAS/models/supernets/cifar10/resnet50/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "model": "resnet50_cifar10",
3 | "num_classes": 10,
4 | "dataset": "cifar10",
5 | "input_info": {
6 | "sample_size": [1, 3, 32, 32]
7 | },
8 | "batch_size": 64,
9 | "batch_size_val": 2000,
10 | "multiprocessing_distributed": false,
11 | "optimizer": {
12 | "type": "sgd",
13 | "momentum": 0.9,
14 | "nesterov": true,
15 | "weight_decay": 3e-7,
16 | "base_lr": 2.5e-4,
17 | "label_smoothing": 0.1,
18 | "no_decay_keys": "bn#bias"
19 | },
20 | "bootstrapNAS": {
21 | "training": {
22 | "algorithm": "progressive_shrinking",
23 | "progressivity_of_elasticity": ["depth", "width"],
24 | "batchnorm_adaptation": {
25 | "num_bn_adaptation_samples": 1500
26 | },
27 | "schedule": {
28 | "list_stage_descriptions": [
29 | {"train_dims": ["depth", "width"], "epochs": 125, "depth_indicator": 2, "width_indicator": 4, "init_lr": 2.5e-4, "epochs_lr": 125, "reorg_weights": true}
30 | ]
31 | },
32 | "elasticity": {
33 | "available_elasticity_dims": ["width", "depth"],
34 | "width": {
35 | "max_num_widths": 4,
36 | "min_out_channels": 32,
37 | "width_step": 32,
38 | "width_multipliers": [1, 0.80, 0.60, 0.50]
39 | },
40 | "depth": {
41 | "mode": "manual",
42 | "skipped_blocks": [
43 | ["ResNet/Sequential[layer1]/Bottleneck[1]/ReLU[relu]/relu__2", "ResNet/Sequential[layer1]/Bottleneck[2]/ReLU[relu]/relu__2"],
44 | ["ResNet/Sequential[layer2]/Bottleneck[1]/ReLU[relu]/relu__2", "ResNet/Sequential[layer2]/Bottleneck[2]/ReLU[relu]/relu__2"],
45 | ["ResNet/Sequential[layer2]/Bottleneck[2]/ReLU[relu]/relu__2", "ResNet/Sequential[layer2]/Bottleneck[3]/ReLU[relu]/relu__2"],
46 | ["ResNet/Sequential[layer3]/Bottleneck[3]/ReLU[relu]/relu__2", "ResNet/Sequential[layer3]/Bottleneck[4]/ReLU[relu]/relu__2"],
47 | ["ResNet/Sequential[layer3]/Bottleneck[4]/ReLU[relu]/relu__2", "ResNet/Sequential[layer3]/Bottleneck[5]/ReLU[relu]/relu__2"],
48 | ["ResNet/Sequential[layer4]/Bottleneck[1]/ReLU[relu]/relu__2", "ResNet/Sequential[layer4]/Bottleneck[2]/ReLU[relu]/relu__2"]
49 | ]
50 | }
51 | }
52 | },
53 | "search": {
54 | "algorithm": "NSGA2",
55 | "batchnorm_adaptation": {
56 | "num_bn_adaptation_samples": 6000
57 | },
58 | "num_evals": 3000,
59 | "population": 50,
60 | "ref_acc": 93.65
61 | }
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/BootstrapNAS/models/supernets/cifar10/resnet50/elasticity.pth:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:106473f6f954a1bd0d93339cc10454e6f9ab9b5aff3b872067414cbdd0d9f08c
3 | size 6645
4 |
--------------------------------------------------------------------------------
/BootstrapNAS/models/supernets/cifar10/resnet50/results.md:
--------------------------------------------------------------------------------
1 | # ResNet50-CIFAR10
2 |
3 | ## 1. Current Results
4 |
5 |
6 |
7 | | Architecture | MACs | Acc@1 |
8 | |--------------|------|-------|
9 | | [Pretrained](train_results_others/pretrained.pt) | 325.8M | 93.65 |
10 | | [SuperNet](supernet_weights.pth) | 325.8M | 94.09 |
11 | | Minimum SubNet | 64.1M | 92.61 |
12 | | [Best Found SubNet](search_results_others/subnetwork_best.pth) | 68.3M | 92.96 |
13 |
14 |
15 |
16 |
17 | ## 2. Search Progression
18 |
19 | ![Search progression](search_progression.png)
20 |
--------------------------------------------------------------------------------
/BootstrapNAS/models/supernets/cifar10/resnet50/search_progression.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/7549413d38677dd6eb92f918f7cc003dc65d1deb/BootstrapNAS/models/supernets/cifar10/resnet50/search_progression.png
--------------------------------------------------------------------------------
/BootstrapNAS/models/supernets/cifar10/resnet50/supernet_weights.pth:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6dbb4aaf604f5a27f6031603e3cc52bca41213dc6839210540f5c26e71d97085
3 | size 94455127
4 |
--------------------------------------------------------------------------------
/BootstrapNAS/requirements.txt:
--------------------------------------------------------------------------------
1 | nncf[torch]
2 | ipywidgets
3 |
--------------------------------------------------------------------------------
/EFTNAS/figures/eftnas_pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/7549413d38677dd6eb92f918f7cc003dc65d1deb/EFTNAS/figures/eftnas_pipeline.png
--------------------------------------------------------------------------------
/EFTNAS/install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | set -x
4 |
5 | EFTNAS_PATH=$PWD
6 |
7 | git clone https://github.com/openvinotoolkit/nncf.git nncf_eftnas
8 | cd nncf_eftnas
9 | git checkout 415c3c4d
10 | # Apply Patch
11 | git apply $EFTNAS_PATH/patches/nncf.patch
12 | pip install -e .
13 | cd ..
14 |
15 |
16 | git clone https://github.com/huggingface/transformers.git transformers_eftnas
17 | cd transformers_eftnas
18 | git checkout v4.29.1
19 | git apply $EFTNAS_PATH/patches/transformers.patch
20 | pip install -e .
21 | pip install -r examples/pytorch/text-classification/requirements.txt
22 | cd ..
23 |
--------------------------------------------------------------------------------
/EFTNAS/running_commands/cola.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | EFTNAS_PATH=$PWD
4 | DEVICES=0
5 |
6 | # Model: BERT-base
7 | # Dataset: cola
8 |
9 | cd transformers_eftnas
10 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/text-classification/run_glue.py \
11 | --model_name_or_path bert-base-uncased \
12 | --task_name cola \
13 | --nncf_config ${EFTNAS_PATH}/eftnas_configs/nncf_eftnas_s1_bert_base_cola.json \
14 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-cola/movement_sparsity \
15 | --do_train \
16 | --do_eval \
17 | --max_seq_length 128 \
18 | --per_device_train_batch_size 32 \
19 | --per_device_eval_batch_size 128 \
20 | --learning_rate 2e-5 \
21 | --num_train_epochs 30 \
22 | --evaluation_strategy epoch \
23 | --save_strategy epoch \
24 | --save_total_limit 1 \
25 | --seed 42 \
26 | --fp16 \
27 | --only_generate_importance_weight True
28 | cd ..
29 |
30 | CUDA_VISIBLE_DEVICES=${DEVICES} python eftnas_search_space/generate_eftnas_search_space.py \
31 | --source_config eftnas_configs/nncf_eftnas_s1_bert_base_cola.json \
32 | --model_name_or_path bert-base-uncased \
33 | --importance_weight_dir trained_models/eftnas-bert-base-cola/movement_sparsity \
34 | --target_config results/generated_configs/nncf_eftnas_s1_bert_base_cola.json
35 |
36 | cd transformers_eftnas
37 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/text-classification/run_glue.py \
38 | --model_name_or_path bert-base-uncased \
39 | --kd_teacher_model ModelTC/bert-base-uncased-cola \
40 | --reorg_cache_model ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-cola/movement_sparsity/pytorch_model.bin \
41 | --task_name cola \
42 | --nncf_config ${EFTNAS_PATH}/results/generated_configs/nncf_eftnas_s1_bert_base_cola.json \
43 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-cola \
44 | --do_train \
45 | --do_eval \
46 | --do_search \
47 | --max_seq_length 128 \
48 | --per_device_train_batch_size 32 \
49 | --per_device_eval_batch_size 128 \
50 | --evaluation_strategy epoch \
51 | --save_strategy epoch \
52 | --save_total_limit 1 \
53 | --seed 42 \
54 | --fp16
55 |
--------------------------------------------------------------------------------
/EFTNAS/running_commands/mnli.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | EFTNAS_PATH=$PWD
4 | DEVICES=0
5 |
6 | # Model: BERT-base
7 | # Dataset: mnli
8 |
9 | cd transformers_eftnas
10 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/text-classification/run_glue.py \
11 | --model_name_or_path bert-base-uncased \
12 | --task_name mnli \
13 | --nncf_config ${EFTNAS_PATH}/eftnas_configs/nncf_eftnas_s1_bert_base_mnli.json \
14 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-mnli/movement_sparsity \
15 | --do_train \
16 | --do_eval \
17 | --max_seq_length 128 \
18 | --per_device_train_batch_size 32 \
19 | --per_device_eval_batch_size 128 \
20 | --learning_rate 2e-5 \
21 | --num_train_epochs 6 \
22 | --evaluation_strategy epoch \
23 | --save_strategy epoch \
24 | --save_total_limit 1 \
25 | --seed 42 \
26 | --fp16 \
27 | --only_generate_importance_weight True
28 | cd ..
29 |
30 | CUDA_VISIBLE_DEVICES=${DEVICES} python eftnas_search_space/generate_eftnas_search_space.py \
31 | --source_config eftnas_configs/nncf_eftnas_s1_bert_base_mnli.json \
32 | --model_name_or_path bert-base-uncased \
33 | --importance_weight_dir trained_models/eftnas-bert-base-mnli/movement_sparsity \
34 | --target_config results/generated_configs/nncf_eftnas_s1_bert_base_mnli.json
35 |
36 | cd transformers_eftnas
37 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/text-classification/run_glue.py \
38 | --model_name_or_path bert-base-uncased \
39 | --kd_teacher_model JeremiahZ/bert-base-uncased-mnli \
40 | --reorg_cache_model ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-mnli/movement_sparsity/pytorch_model.bin \
41 | --task_name mnli \
42 | --nncf_config ${EFTNAS_PATH}/results/generated_configs/nncf_eftnas_s1_bert_base_mnli.json \
43 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-mnli \
44 | --do_train \
45 | --do_eval \
46 | --do_search \
47 | --max_seq_length 128 \
48 | --per_device_train_batch_size 32 \
49 | --per_device_eval_batch_size 128 \
50 | --evaluation_strategy epoch \
51 | --save_strategy epoch \
52 | --save_total_limit 1 \
53 | --seed 42 \
54 | --fp16
55 |
--------------------------------------------------------------------------------
/EFTNAS/running_commands/mrpc.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | EFTNAS_PATH=$PWD
4 | DEVICES=0
5 |
6 | # Model: BERT-base
7 | # Dataset: mrpc
8 |
9 | cd transformers_eftnas
10 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/text-classification/run_glue.py \
11 | --model_name_or_path bert-base-uncased \
12 | --task_name mrpc \
13 | --nncf_config ${EFTNAS_PATH}/eftnas_configs/nncf_eftnas_s1_bert_base_mrpc.json \
14 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-mrpc/movement_sparsity \
15 | --do_train \
16 | --do_eval \
17 | --max_seq_length 128 \
18 | --per_device_train_batch_size 32 \
19 | --per_device_eval_batch_size 128 \
20 | --learning_rate 2e-5 \
21 | --num_train_epochs 30 \
22 | --evaluation_strategy epoch \
23 | --save_strategy epoch \
24 | --save_total_limit 1 \
25 | --seed 42 \
26 | --fp16 \
27 | --only_generate_importance_weight True
28 | cd ..
29 |
30 | CUDA_VISIBLE_DEVICES=${DEVICES} python eftnas_search_space/generate_eftnas_search_space.py \
31 | --source_config eftnas_configs/nncf_eftnas_s1_bert_base_mrpc.json \
32 | --model_name_or_path bert-base-uncased \
33 | --importance_weight_dir trained_models/eftnas-bert-base-mrpc/movement_sparsity \
34 | --target_config results/generated_configs/nncf_eftnas_s1_bert_base_mrpc.json
35 |
36 | cd transformers_eftnas
37 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/text-classification/run_glue.py \
38 | --model_name_or_path bert-base-uncased \
39 | --kd_teacher_model Intel/bert-base-uncased-mrpc \
40 | --reorg_cache_model ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-mrpc/movement_sparsity/pytorch_model.bin \
41 | --task_name mrpc \
42 | --nncf_config ${EFTNAS_PATH}/results/generated_configs/nncf_eftnas_s1_bert_base_mrpc.json \
43 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-mrpc \
44 | --do_train \
45 | --do_eval \
46 | --do_search \
47 | --max_seq_length 128 \
48 | --per_device_train_batch_size 32 \
49 | --per_device_eval_batch_size 128 \
50 | --evaluation_strategy epoch \
51 | --save_strategy epoch \
52 | --save_total_limit 1 \
53 | --seed 42 \
54 | --fp16
55 |
--------------------------------------------------------------------------------
/EFTNAS/running_commands/qnli.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | EFTNAS_PATH=$PWD
4 | DEVICES=0
5 |
6 | # Model: BERT-base
7 | # Dataset: qnli
8 |
9 | cd transformers_eftnas
10 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/text-classification/run_glue.py \
11 | --model_name_or_path bert-base-uncased \
12 | --task_name qnli \
13 | --nncf_config ${EFTNAS_PATH}/eftnas_configs/nncf_eftnas_s1_bert_base_qnli.json \
14 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-qnli/movement_sparsity \
15 | --do_train \
16 | --do_eval \
17 | --max_seq_length 128 \
18 | --per_device_train_batch_size 32 \
19 | --per_device_eval_batch_size 128 \
20 | --learning_rate 2e-5 \
21 | --num_train_epochs 6 \
22 | --evaluation_strategy epoch \
23 | --save_strategy epoch \
24 | --save_total_limit 1 \
25 | --seed 42 \
26 | --fp16 \
27 | --only_generate_importance_weight True
28 | cd ..
29 |
30 | CUDA_VISIBLE_DEVICES=${DEVICES} python eftnas_search_space/generate_eftnas_search_space.py \
31 | --source_config eftnas_configs/nncf_eftnas_s1_bert_base_qnli.json \
32 | --model_name_or_path bert-base-uncased \
33 | --importance_weight_dir trained_models/eftnas-bert-base-qnli/movement_sparsity \
34 | --target_config results/generated_configs/nncf_eftnas_s1_bert_base_qnli.json
35 |
36 | cd transformers_eftnas
37 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/text-classification/run_glue.py \
38 | --model_name_or_path bert-base-uncased \
39 | --kd_teacher_model ModelTC/bert-base-uncased-qnli \
40 | --reorg_cache_model ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-qnli/movement_sparsity/pytorch_model.bin \
41 | --task_name qnli \
42 | --nncf_config ${EFTNAS_PATH}/results/generated_configs/nncf_eftnas_s1_bert_base_qnli.json \
43 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-qnli \
44 | --do_train \
45 | --do_eval \
46 | --do_search \
47 | --max_seq_length 128 \
48 | --per_device_train_batch_size 32 \
49 | --per_device_eval_batch_size 128 \
50 | --evaluation_strategy epoch \
51 | --save_strategy epoch \
52 | --save_total_limit 1 \
53 | --seed 42 \
54 | --fp16
55 |
--------------------------------------------------------------------------------
/EFTNAS/running_commands/qqp.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | EFTNAS_PATH=$PWD
4 | DEVICES=0
5 |
6 | # Model: BERT-base
7 | # Dataset: qqp
8 |
9 | cd transformers_eftnas
10 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/text-classification/run_glue.py \
11 | --model_name_or_path bert-base-uncased \
12 | --task_name qqp \
13 | --nncf_config ${EFTNAS_PATH}/eftnas_configs/nncf_eftnas_s1_bert_base_qqp.json \
14 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-qqp/movement_sparsity \
15 | --do_train \
16 | --do_eval \
17 | --max_seq_length 128 \
18 | --per_device_train_batch_size 32 \
19 | --per_device_eval_batch_size 128 \
20 | --learning_rate 2e-5 \
21 | --num_train_epochs 6 \
22 | --evaluation_strategy epoch \
23 | --save_strategy epoch \
24 | --save_total_limit 1 \
25 | --seed 42 \
26 | --fp16 \
27 | --only_generate_importance_weight True
28 | cd ..
29 |
30 | CUDA_VISIBLE_DEVICES=${DEVICES} python eftnas_search_space/generate_eftnas_search_space.py \
31 | --source_config eftnas_configs/nncf_eftnas_s1_bert_base_qqp.json \
32 | --model_name_or_path bert-base-uncased \
33 | --importance_weight_dir trained_models/eftnas-bert-base-qqp/movement_sparsity \
34 | --target_config results/generated_configs/nncf_eftnas_s1_bert_base_qqp.json
35 |
36 | cd transformers_eftnas
37 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/text-classification/run_glue.py \
38 | --model_name_or_path bert-base-uncased \
39 | --kd_teacher_model JeremiahZ/bert-base-uncased-qqp \
40 | --reorg_cache_model ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-qqp/movement_sparsity/pytorch_model.bin \
41 | --task_name qqp \
42 | --nncf_config ${EFTNAS_PATH}/results/generated_configs/nncf_eftnas_s1_bert_base_qqp.json \
43 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-qqp \
44 | --do_train \
45 | --do_eval \
46 | --do_search \
47 | --max_seq_length 128 \
48 | --per_device_train_batch_size 32 \
49 | --per_device_eval_batch_size 128 \
50 | --evaluation_strategy epoch \
51 | --save_strategy epoch \
52 | --save_total_limit 1 \
53 | --seed 42 \
54 | --fp16
55 |
--------------------------------------------------------------------------------
/EFTNAS/running_commands/rte.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | EFTNAS_PATH=$PWD
4 | DEVICES=0
5 |
6 | # Model: BERT-base
7 | # Dataset: rte
8 |
9 | cd transformers_eftnas
10 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/text-classification/run_glue.py \
11 | --model_name_or_path bert-base-uncased \
12 | --task_name rte \
13 | --nncf_config ${EFTNAS_PATH}/eftnas_configs/nncf_eftnas_s1_bert_base_rte.json \
14 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-rte/movement_sparsity \
15 | --do_train \
16 | --do_eval \
17 | --max_seq_length 128 \
18 | --per_device_train_batch_size 32 \
19 | --per_device_eval_batch_size 128 \
20 | --learning_rate 2e-5 \
21 | --num_train_epochs 30 \
22 | --evaluation_strategy epoch \
23 | --save_strategy epoch \
24 | --save_total_limit 1 \
25 | --seed 42 \
26 | --fp16 \
27 | --only_generate_importance_weight True
28 | cd ..
29 |
30 | CUDA_VISIBLE_DEVICES=${DEVICES} python eftnas_search_space/generate_eftnas_search_space.py \
31 | --source_config eftnas_configs/nncf_eftnas_s1_bert_base_rte.json \
32 | --model_name_or_path bert-base-uncased \
33 | --importance_weight_dir trained_models/eftnas-bert-base-rte/movement_sparsity \
34 | --target_config results/generated_configs/nncf_eftnas_s1_bert_base_rte.json
35 |
36 | cd transformers_eftnas
37 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/text-classification/run_glue.py \
38 | --model_name_or_path bert-base-uncased \
39 | --kd_teacher_model textattack/bert-base-uncased-RTE \
40 | --reorg_cache_model ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-rte/movement_sparsity/pytorch_model.bin \
41 | --task_name rte \
42 | --nncf_config ${EFTNAS_PATH}/results/generated_configs/nncf_eftnas_s1_bert_base_rte.json \
43 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-rte \
44 | --do_train \
45 | --do_eval \
46 | --do_search \
47 | --max_seq_length 128 \
48 | --per_device_train_batch_size 32 \
49 | --per_device_eval_batch_size 128 \
50 | --evaluation_strategy epoch \
51 | --save_strategy epoch \
52 | --save_total_limit 1 \
53 | --seed 42 \
54 | --fp16
55 |
--------------------------------------------------------------------------------
/EFTNAS/running_commands/squadv1.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | EFTNAS_PATH=$PWD
4 | DEVICES=0
5 |
6 | # Model: BERT-base
7 | # Dataset: squadv1
8 |
9 | cd transformers_eftnas
10 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/question-answering/run_qa.py \
11 | --model_name_or_path bert-base-uncased \
12 | --do_train \
13 | --do_eval \
14 | --dataset_name squad \
15 | --learning_rate 2e-5 \
16 | --per_gpu_train_batch_size 16 \
17 | --per_gpu_eval_batch_size 128 \
18 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-squadv1/movement_sparsity \
19 | --max_seq_length 384 \
20 | --doc_stride 128 \
21 | --nncf_config ${EFTNAS_PATH}/eftnas_configs/nncf_eftnas_s1_bert_base_squadv1.json \
22 | --evaluation_strategy epoch \
23 | --save_strategy epoch \
24 | --metric_for_best_model f1 \
25 | --overwrite_output_dir \
26 | --save_total_limit 1 \
27 | --num_train_epochs 8 \
28 | --fp16 \
29 | --only_generate_importance_weight True
30 | cd ..
31 | CUDA_VISIBLE_DEVICES=${DEVICES} python eftnas_search_space/generate_eftnas_search_space.py \
32 | --source_config eftnas_configs/nncf_eftnas_s1_bert_base_squadv1.json \
33 | --model_name_or_path bert-base-uncased \
34 | --importance_weight_dir trained_models/eftnas-bert-base-squadv1/movement_sparsity \
35 | --target_config results/generated_configs/nncf_eftnas_s1_bert_base_squadv1.json
36 |
37 | cd transformers_eftnas
38 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/question-answering/run_qa.py \
39 | --model_name_or_path bert-base-uncased \
40 | --kd_teacher_model csarron/bert-base-uncased-squad-v1 \
41 | --reorg_cache_model ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-squadv1/movement_sparsity/pytorch_model.bin \
42 | --do_train \
43 | --do_eval \
44 | --do_search \
45 | --dataset_name squad \
46 | --learning_rate 3e-5 \
47 | --per_gpu_train_batch_size 16 \
48 | --per_gpu_eval_batch_size 128 \
49 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-squadv1 \
50 | --max_seq_length 384 \
51 | --doc_stride 128 \
52 | --nncf_config ${EFTNAS_PATH}/results/generated_configs/nncf_eftnas_s1_bert_base_squadv1.json \
53 | --evaluation_strategy epoch \
54 | --save_strategy epoch \
55 | --metric_for_best_model f1 \
56 | --save_total_limit 1 \
57 | --num_train_epochs 8 \
58 | --fp16
59 |
--------------------------------------------------------------------------------
/EFTNAS/running_commands/squadv2.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | EFTNAS_PATH=$PWD
4 | DEVICES=0
5 |
6 | # Model: BERT-base
7 | # Dataset: squadv2
8 |
9 | cd transformers_eftnas
10 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/question-answering/run_qa.py \
11 | --model_name_or_path bert-base-uncased \
12 | --do_train \
13 | --do_eval \
14 | --dataset_name squad_v2 \
15 | --learning_rate 2e-5 \
16 | --per_gpu_train_batch_size 16 \
17 | --per_gpu_eval_batch_size 128 \
18 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-squadv2/movement_sparsity \
19 | --max_seq_length 384 \
20 | --doc_stride 128 \
21 | --nncf_config ${EFTNAS_PATH}/eftnas_configs/nncf_eftnas_s1_bert_base_squadv2.json \
22 | --evaluation_strategy epoch \
23 | --save_strategy epoch \
24 | --metric_for_best_model f1 \
25 | --overwrite_output_dir \
26 | --save_total_limit 1 \
27 | --num_train_epochs 8 \
28 | --fp16 \
29 | --only_generate_importance_weight True
30 | cd ..
31 | CUDA_VISIBLE_DEVICES=${DEVICES} python eftnas_search_space/generate_eftnas_search_space.py \
32 | --source_config eftnas_configs/nncf_eftnas_s1_bert_base_squadv2.json \
33 | --model_name_or_path bert-base-uncased \
34 | --importance_weight_dir trained_models/eftnas-bert-base-squadv2/movement_sparsity \
35 | --target_config results/generated_configs/nncf_eftnas_s1_bert_base_squadv2.json
36 |
37 | cd transformers_eftnas
38 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/question-answering/run_qa.py \
39 | --model_name_or_path bert-base-uncased \
40 | --kd_teacher_model deepset/bert-base-uncased-squad2 \
41 | --reorg_cache_model ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-squadv2/movement_sparsity/pytorch_model.bin \
42 | --do_train \
43 | --do_eval \
44 | --do_search \
45 | --dataset_name squad_v2 \
46 | --learning_rate 3e-5 \
47 | --per_gpu_train_batch_size 16 \
48 | --per_gpu_eval_batch_size 128 \
49 | --version_2_with_negative \
50 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-squadv2 \
51 | --max_seq_length 384 \
52 | --doc_stride 128 \
53 | --nncf_config ${EFTNAS_PATH}/results/generated_configs/nncf_eftnas_s1_bert_base_squadv2.json \
54 | --evaluation_strategy epoch \
55 | --save_strategy epoch \
56 | --metric_for_best_model f1 \
57 | --ddp_find_unused_parameters True \
58 | --save_total_limit 1 \
59 | --num_train_epochs 8 \
60 | --fp16
61 |
--------------------------------------------------------------------------------
/EFTNAS/running_commands/sst2.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | EFTNAS_PATH=$PWD
4 | DEVICES=0
5 |
6 | # Model: BERT-base
7 | # Dataset: sst2
8 |
9 | cd transformers_eftnas
10 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/text-classification/run_glue.py \
11 | --model_name_or_path bert-base-uncased \
12 | --task_name sst2 \
13 | --nncf_config ${EFTNAS_PATH}/eftnas_configs/nncf_eftnas_s1_bert_base_sst2.json \
14 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-sst2/movement_sparsity \
15 | --do_train \
16 | --do_eval \
17 | --max_seq_length 128 \
18 | --per_device_train_batch_size 32 \
19 | --per_device_eval_batch_size 128 \
20 | --learning_rate 2e-5 \
21 | --num_train_epochs 6 \
22 | --evaluation_strategy epoch \
23 | --save_strategy epoch \
24 | --save_total_limit 1 \
25 | --seed 42 \
26 | --fp16 \
27 | --only_generate_importance_weight True
28 | cd ..
29 |
30 | CUDA_VISIBLE_DEVICES=${DEVICES} python eftnas_search_space/generate_eftnas_search_space.py \
31 | --source_config eftnas_configs/nncf_eftnas_s1_bert_base_sst2.json \
32 | --model_name_or_path bert-base-uncased \
33 | --importance_weight_dir trained_models/eftnas-bert-base-sst2/movement_sparsity \
34 | --target_config results/generated_configs/nncf_eftnas_s1_bert_base_sst2.json
35 |
36 | cd transformers_eftnas
37 | CUDA_VISIBLE_DEVICES=${DEVICES} python examples/pytorch/text-classification/run_glue.py \
38 | --model_name_or_path bert-base-uncased \
39 | --kd_teacher_model JeremiahZ/bert-base-uncased-sst2 \
40 | --reorg_cache_model ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-sst2/movement_sparsity/pytorch_model.bin \
41 | --task_name sst2 \
42 | --nncf_config ${EFTNAS_PATH}/results/generated_configs/nncf_eftnas_s1_bert_base_sst2.json \
43 | --output_dir ${EFTNAS_PATH}/results/trained_models/eftnas-bert-base-sst2 \
44 | --do_train \
45 | --do_eval \
46 | --do_search \
47 | --max_seq_length 128 \
48 | --per_device_train_batch_size 32 \
49 | --per_device_eval_batch_size 128 \
50 | --evaluation_strategy epoch \
51 | --save_strategy epoch \
52 | --save_total_limit 1 \
53 | --seed 42 \
54 | --fp16
55 |
--------------------------------------------------------------------------------
/EZNAS/README.md:
--------------------------------------------------------------------------------
1 | # EZNAS: Evolving Zero-Cost Proxies For Neural Architecture Scoring
2 |
3 | EZNAS is a genetic programming-driven methodology for automatically discovering Zero-Cost Neural Architecture Scoring Metrics (ZC-NASMs). It aims to provide an interpretable, generalizable, and efficient approach to ranking neural networks without expensive training routines, significantly reducing the carbon footprint of Neural Architecture Search (NAS).
4 |
5 | ## Installation
6 |
7 | Follow these steps to set up and run EZNAS:
8 |
9 | ### Step 1: Base Set-up
10 | Run the provided setup_script.sh to install all necessary packages and dependencies.
11 |
12 | ```bash
13 | bash setup_script.sh
14 | ```
15 |
16 | This script should handle:
17 |
18 | 1. Installation of required Python packages.
19 | 2. Cloning of external GitHub repositories.
20 | 3. Setting up datasets and additional files necessary for running the project.
21 |
22 | ### Step 2: Set Environment Variable
23 |
24 | Set the PROJ_HOME environment variable to the path of your project:
25 |
26 | ```bash
27 | export PROJ_HOME=""
28 | ```
29 |
30 | ### Step 3: Run evaluation
31 |
32 | For SLURM-based execution, modify runjob.sh to match your server's specifications.
33 |
34 | To reproduce the results for a specific dataset, run the corresponding quoted command from the reproduce.sh file; for example:
35 |
36 | ```bash
37 | python verify_scores.py --batch_size 16 --search_space NASBench201 --dataset cifar10 --nds_space ''
38 | ```
39 |
40 | ### Results
41 |
42 | | Search Space | Kendall τ | Spearman ρ |
43 | |------------------------|--------------|--------------|
44 | | NASBench-201 CIFAR-10 | 0.6195383854 | 0.8084988792 |
45 | | NASBench-201 CIFAR-100 | 0.6168760649 | 0.7983379022 |
46 | | NATSBench-SSS | 0.7073727282 | 0.8873359833 |
47 | | NDS DARTS | 0.5466290384 | 0.7364709542 |
48 | | NDS Amoeba | 0.4130041903 | 0.5775007582 |
49 | | NDS ENAS | 0.5111310224 | 0.6932549307 |
50 | | NDS PNAS | 0.4781835008 | 0.656343803 |
51 | | NDS NASNet | 0.4312498051 | 0.6050820615 |
52 |
53 |
54 | Note that the above table is for a batch size of 16. Higher batch sizes generally give better results; for instance, for NATSBench-SSS at a batch size of 64, the Spearman ρ is 0.91.
55 |
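The numbers in this table come straight from `reproduce/all_tests.csv`. Below is a quick cross-check script; the column layout is inferred from the file itself (the last three fields appear to be Kendall τ, an additional correlation metric not reported above, and Spearman ρ):

```python
import csv

# Print the correlation scores recorded in reproduce/all_tests.csv.
# Assumed layout: ..., batch_size, dataset, search_space, nds_space, kendall, <other>, spearman
with open("reproduce/all_tests.csv") as f:
    for row in csv.reader(f):
        dataset, space, nds = row[3], row[4], row[5]
        kendall, spearman = float(row[6]), float(row[8])
        label = nds if nds else f"{space} {dataset}"
        print(f"{label:22s} Kendall tau = {kendall:.4f}   Spearman rho = {spearman:.4f}")
```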
56 | ## Citation
57 |
58 | If you use the code or data in your research, please use the following BibTex entry:
59 |
60 | ```bibtex
61 | @inproceedings{
62 | akhauri2022eznas,
63 | title={{EZNAS}: Evolving Zero-Cost Proxies For Neural Architecture Scoring},
64 | author={Yash Akhauri and Juan Pablo Munoz and Nilesh Jain and Ravishankar Iyer},
65 | booktitle={Advances in Neural Information Processing Systems},
66 | editor={Alice H. Oh and Alekh Agarwal and Danielle Belgrave and Kyunghyun Cho},
67 | year={2022},
68 | url={https://openreview.net/forum?id=lSqaDG4dvdt}
69 | }
70 | ```
--------------------------------------------------------------------------------
/EZNAS/evol_config.yaml:
--------------------------------------------------------------------------------
1 | NUM_MATH_OPS: 28
2 | STATIC_ADDRS: 22
3 | LEN_IND_ADDR: 3
4 | NUM_DYNAMIC_ADDR_SPACES: 5
5 | NEW_INST_MIN_LEN: 8
6 | NEW_INST_MAX_LEN: 24
7 | NGEN: 10
8 | POPSIZE: 50
9 | TOURSIZE: 4
10 | MU: 25
11 | lambda_ : 50
12 | CXPR: 0.4
13 | MUTPR: 0.4
14 | nproc: 16
15 | NUM_NETS: 500
16 | SUBSAMPLE_NETS: 20
17 | NUM_SAMPLING_EVAL: 4
18 | rangemix: True
19 | MIN_TREE_DEPTH: 2
20 | MAX_TREE_DEPTH: 6
21 | data_folder: "data2"
--------------------------------------------------------------------------------
/EZNAS/reproduce.sh:
--------------------------------------------------------------------------------
1 | bash runjob.sh "python verify_scores.py --batch_size 16 " "nb2_cf10"
2 | bash runjob.sh "python verify_scores.py --batch_size 16 --dataset cifar100" "nb2_cf100"
3 | bash runjob.sh "python verify_scores.py --batch_size 16 --dataset ImageNet16-120" "nb2_in"
4 | bash runjob.sh "python verify_scores.py --batch_size 16 --nds_space nds_amoeba --search_space NDS" "amoeba"
5 | bash runjob.sh "python verify_scores.py --batch_size 16 --nds_space nds_darts --search_space NDS" "darts"
6 | bash runjob.sh "python verify_scores.py --batch_size 16 --nds_space nds_pnas --search_space NDS" "pnas"
7 | bash runjob.sh "python verify_scores.py --batch_size 16 --nds_space nds_nasnet --search_space NDS" "nasnet"
8 | bash runjob.sh "python verify_scores.py --batch_size 16 --nds_space nds_enas --search_space NDS" "enas"
9 | bash runjob.sh "python verify_scores.py --batch_size 64 --search_space NATSBench" "nats"
10 |
11 |
--------------------------------------------------------------------------------
/EZNAS/reproduce/all_tests.csv:
--------------------------------------------------------------------------------
1 | 1,-1,16,cifar10,NASBench201,,0.6195383853731455,0.4902176532853689,0.8084988792039854
2 | 1,-1,16,cifar100,NASBench201,,0.6168760649367213,0.5991489645246415,0.7983379021533304
3 | 1,-1,16,cifar10,NATSBench,,0.7073727281824621,0.8655292939139214,0.8873359832758938
4 | 1,-1,16,cifar10,NDS,nds_darts,0.5466290384387654,0.2101314187641226,0.7364709541852409
5 | 1,-1,16,cifar10,NDS,nds_amoeba,0.41300419025409685,0.13047597007496348,0.5775007581755685
6 | 1,-1,16,cifar10,NDS,nds_enas,0.5111310223594074,0.12183248307551318,0.6932549307473483
7 | 1,-1,16,cifar10,NDS,nds_pnas,0.47818350082450095,0.0657925903642663,0.6563438030327016
8 | 1,-1,16,cifar10,NDS,nds_nasnet,0.43124980513279043,0.16231595543650768,0.6050820615327234
--------------------------------------------------------------------------------
/EZNAS/runjob.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Check if two arguments are provided
4 | if [ "$#" -ne 2 ]; then
5 | echo "Usage: $0 [command] [job-name]"
6 | exit 1
7 | fi
8 |
9 | COMMAND=$1
10 | JOB_NAME=$2
11 |
12 | # Create a temporary SLURM script
13 | TMP_SLURM_SCRIPT="tmp_${JOB_NAME}.slurm"
14 |
15 | cat > $TMP_SLURM_SCRIPT << EOF
--------------------------------------------------------------------------------
/Mamba-Shedder/extract/README.md:
--------------------------------------------------------------------------------
1 | ## Extract the Compressed Model from Mamba-Shedder
2 |
3 | The final compressed model can be extracted based on the optimal pruning configuration obtained from Mamba-Shedder.
4 |
5 | ```bash
6 | # Mamba-1 (Mamba Block Pruning)
7 | python extract/extract_mamba.py \
8 | --model_path state-spaces/mamba-2.8b \
9 | --output_path <output_path> \
10 | --pruned_model_config_file <pruning_result_path>/pruning_config.json # Or specify the config file of a pruning step from the `pruned_model_configs` folder, e.g., <pruning_result_path>/pruned_model_configs/config.mamba_block.${eval_step}.json
11 |
12 | # Mamba-2 (SSM Pruning)
13 | python extract/extract_mamba.py \
14 | --model_path state-spaces/mamba2-2.7b \
15 | --output_path <output_path> \
16 | --pruned_model_config_file <pruning_result_path>/pruning_config.json # Or specify the config file of a pruning step from the `pruned_model_configs` folder, e.g., <pruning_result_path>/pruned_model_configs/config.ssm.${eval_step}.json
17 | ```
18 |
19 | - `model_path`: Path to the pre-trained model.
20 | - `pruned_model_config_file`: JSON file for the pruned model configuration.
21 | - `output_path`: Directory to save the compressed model.
22 |
--------------------------------------------------------------------------------
/Mamba-Shedder/extract/extract_mamba.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import logging
4 | import os
5 | import torch
6 |
7 | from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel
8 | from transformers import AutoTokenizer
9 |
10 |
11 | MAMBA_MODULES = [
12 | "backbone.layers.*.mixer.dt_bias",
13 | "backbone.layers.*.mixer.A_log",
14 | "backbone.layers.*.mixer.D",
15 | "backbone.layers.*.mixer.in_proj.weight",
16 | "backbone.layers.*.mixer.conv1d.weight",
17 | "backbone.layers.*.mixer.conv1d.bias",
18 | "backbone.layers.*.mixer.norm.weight",
19 | "backbone.layers.*.mixer.out_proj.weight",
20 | "backbone.layers.*.mixer.dt_proj.weight", # Mamba-1
21 | "backbone.layers.*.mixer.dt_proj.bias", # Mamba-1
22 | "backbone.layers.*.mixer.x_proj.weight", # Mamba-1
23 | "backbone.layers.*.norm.weight",
24 | ]
25 |
26 | # only for Mamba-2
27 | SSM_MODULES = [
28 | "backbone.layers.*.mixer.D",
29 | "backbone.layers.*.mixer.dt_bias",
30 | ]
31 |
32 |
33 | def main():
34 | parser = argparse.ArgumentParser()
35 | parser.add_argument(
36 | "--model_path",
37 | type=str,
38 | help="Path to the Mamba model."
39 | )
40 | parser.add_argument(
41 | "--output_path",
42 | type=str,
43 | help="Directory to save the compressed model."
44 | )
45 | parser.add_argument(
46 | "--pruned_model_config_file",
47 | type=str,
48 | help="Path to the pruned model configuration file."
49 | )
50 |
51 | args = parser.parse_args()
52 | model_path = args.model_path
53 | output_path = args.output_path
54 | # Create output directory if it doesn't exist
55 | os.makedirs(output_path, exist_ok=True)
56 | pruned_model_config_file = args.pruned_model_config_file
57 |
58 | # Load model and tokenizer
59 | tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
60 | model = MambaLMHeadModel.from_pretrained(model_path, device="cuda", dtype=torch.float16)
61 |
62 | # Load pruning results
63 | with open(pruned_model_config_file, "r") as f:
64 | pruned_config = json.load(f)
65 | logging.info(f"Detected a pruned model config: {pruned_config}")
66 | state_dict = model.state_dict()
67 |
68 | def prune_modules(state_dict, idx, module_names):
69 | for module_name in module_names:
70 | module_name = module_name.replace("*", str(idx))
71 | if module_name in state_dict:
72 | del state_dict[module_name]
73 |
74 | if pruned_config.get("pruned_mamba_block_idx"):
75 | pruned_mamba_block_idx = pruned_config["pruned_mamba_block_idx"]
76 | for idx in pruned_mamba_block_idx:
77 | prune_modules(state_dict, idx, MAMBA_MODULES)
78 | if pruned_config.get("pruned_ssm_idx"):
79 | pruned_ssm_idx = pruned_config["pruned_ssm_idx"]
80 | for idx in pruned_ssm_idx:
81 | prune_modules(state_dict, idx, SSM_MODULES)
82 |
83 | model.save_pretrained(output_path, state_dict=state_dict)
84 | tokenizer.save_pretrained(output_path)
85 |
86 |
87 | if __name__ == "__main__":
88 | main()
89 |
--------------------------------------------------------------------------------
/Mamba-Shedder/hybrid/Hymba-Pruning/eval.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import logging
4 |
5 | from transformers import AutoModelForCausalLM, AutoTokenizer
6 |
7 | from lm_eval import evaluator
8 | from lm_eval.models.huggingface import HFLM
9 |
10 | TASKS = ["arc_easy", "arc_challenge", "piqa", "winogrande", "hellaswag"]
11 |
12 |
13 | def main():
14 | parser = argparse.ArgumentParser()
15 | parser.add_argument(
16 | "--model_path",
17 | type=str,
18 | )
19 | args = parser.parse_args()
20 | model_path = args.model_path
21 |
22 | model = AutoModelForCausalLM.from_pretrained(
23 | model_path,
24 | device_map="cuda",
25 | torch_dtype="float16",
26 | trust_remote_code=True
27 | )
28 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
29 | lm = HFLM(pretrained=model, tokenizer=tokenizer, batch_size=64)
30 |
31 | # Evaluate on selected tasks
32 | logging.info(f"Selected Tasks: {TASKS}")
33 | results = evaluator.simple_evaluate(lm, tasks=TASKS, num_fewshot=0, batch_size=64, log_samples=False)['results']
34 |
35 | metric_vals = {}
36 | for task, result in results.items():
37 | res = result['acc,none']
38 | metric_vals[task] = round(res * 100, 1)
39 |
40 | logging.info(json.dumps(metric_vals, indent=4))
41 |
42 |
43 | if __name__ == "__main__":
44 | main()
45 |
--------------------------------------------------------------------------------
/Mamba-Shedder/hybrid/Hymba-Pruning/extract/README.md:
--------------------------------------------------------------------------------
1 | ## Extract the Compressed Model from Mamba-Shedder
2 |
3 | The final compressed model can be extracted based on the optimal pruning configuration obtained from Mamba-Shedder.
4 |
5 | ```bash
6 | # Hymba
7 | python extract/extract_hymba.py \
8 | --model_path Hymba-1.5B-Base \
9 | --weight_reorder \
10 | --output_path . \
11 | --pruned_model_config_file <pruning_result_path>/pruning_config.json # Or specify the config file of a pruning step from the `pruned_model_configs` folder, e.g., <pruning_result_path>/pruned_model_configs/config.mlp_width.${eval_step}.json
12 |
13 | ```
14 |
15 | - `model_path`: Path to the pre-trained model.
16 | - `weight_reorder`: Flag to indicate whether to perform weight reorder in MLP.
17 | - `pruned_model_config_file`: JSON file for the pruned model configuration.
18 | - `output_path`: Directory to save the compressed model.
19 |
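20 | For intuition, weight reordering before width pruning typically permutes the MLP's intermediate channels so the most important ones come first, letting a contiguous truncation keep the right channels. A minimal sketch of that idea (hypothetical helper, not the actual `extract_hymba.py` implementation):
21 |
22 | ```python
23 | import torch
24 |
25 | def reorder_mlp_channels(up_w, gate_w, down_w, importance):
26 |     # up_w/gate_w: [intermediate, hidden]; down_w: [hidden, intermediate]
27 |     # Sort intermediate channels by descending importance so keeping the
28 |     # first k rows/columns keeps the k most important channels.
29 |     order = torch.argsort(importance, descending=True)
30 |     return up_w[order, :], gate_w[order, :], down_w[:, order]
31 | ```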
--------------------------------------------------------------------------------
/Mamba-Shedder/hybrid/Hymba-Pruning/recovery/README.md:
--------------------------------------------------------------------------------
1 | ### Recovery Fine-tuning after Pruning
2 |
3 | After obtaining the pruned model ([extract](../extract)), we can finetune it to recover accuracy.
4 | The dataset used for finetuning is [Alpaca](https://huggingface.co/datasets/yahma/alpaca-cleaned).
5 | Here is an example command:
6 |
7 | ```bash
8 | # Finetune the compressed Hymba
9 | python finetune_hymba.py \
10 | --model_path <compressed_model_path> \
11 | --do_train \
12 | --batch_size 4 \
13 | --gradient_accumulation_steps 8 \
14 | --num_train_epochs 3 \
15 | --learning_rate 3e-4 \
16 | --lora \
17 | --lora_r 16 \
18 | --lora_alpha 32 \
19 | --lora_target_modules in_proj,out_proj,down_proj,up_proj \
20 | --output_path <output_path> \
21 | --do_eval
22 |
23 | # After fine-tuning, merge the adapter into the compressed model
24 | python merge.py \
25 | --base_model_path <compressed_model_path> \
26 | --adapter_model_path <adapter_model_path> \
27 | --output_path <output_path>
28 |
29 | ```
30 |
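31 | Conceptually, the merge step (PEFT's `merge_and_unload`) folds each LoRA adapter into its base weight as W <- W + (alpha / r) * B @ A. A toy sketch of that update (illustrative only, not the PEFT internals):
32 |
33 | ```python
34 | import torch
35 |
36 | def merge_lora(W, A, B, alpha, r):
37 |     # W: [out, in] base weight; A: [r, in] and B: [out, r] adapter factors.
38 |     return W + (alpha / r) * (B @ A)
39 |
40 | W = torch.randn(8, 4)
41 | A, B = torch.randn(16, 4), torch.randn(8, 16)
42 | merged = merge_lora(W, A, B, alpha=32, r=16)  # scaling as in the command above
43 | ```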
--------------------------------------------------------------------------------
/Mamba-Shedder/hybrid/Hymba-Pruning/recovery/merge.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from peft import PeftModel
4 | from transformers import AutoModelForCausalLM, AutoTokenizer
5 |
6 |
7 | def main():
8 | parser = argparse.ArgumentParser()
9 | parser.add_argument("--base_model_path", type=str)
10 | parser.add_argument("--adapter_model_path", type=str)
11 | parser.add_argument("--output_path", type=str)
12 | args = parser.parse_args()
13 | base_model_path = args.base_model_path
14 | adapter_model_path = args.adapter_model_path
15 | output_path = args.output_path
16 |
17 | base_model, loading_info = AutoModelForCausalLM.from_pretrained(
18 | base_model_path,
19 | device_map={"": 0},
20 | trust_remote_code=True,
21 | torch_dtype="float16",
22 | output_loading_info=True,
23 | )
24 | model = PeftModel.from_pretrained(base_model, adapter_model_path, device_map={"": 0})
25 | model.eval()
26 | merged_model = model.merge_and_unload()
27 | merged_model.train(False)
28 |
29 | sd = merged_model.state_dict()
30 | base_model.save_pretrained(output_path, state_dict=sd)
31 | tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True)
32 | tokenizer.save_pretrained(output_path)
33 |
34 |
35 | if __name__ == "__main__":
36 | main()
37 |
--------------------------------------------------------------------------------
/Mamba-Shedder/hybrid/Hymba-Pruning/results/README.md:
--------------------------------------------------------------------------------
1 | ## Run Command (Hymba)
2 |
3 | Here are the commands to reproduce the main results of the paper.
4 |
5 | ### Hymba-1.5B
6 |
7 | ```bash
8 | pruning_result_path=results/hymba-1.5b-base
9 |
10 | python prune.py \
11 | --model_path Hymba-1.5B-Base \
12 | --do_prune \
13 | --output_path ${pruning_result_path} \
14 | --num_block_pruning_steps 8 \
15 | --block_pruning_targets hymba_block \
16 | --importance_metric ppl \
17 | --calibration_dataset alpaca \
18 | --num_calibration_samples 256
19 |
20 | for eval_step in 5 6 7; do
21 | python prune.py \
22 | --model_path Hymba-1.5B-Base \
23 | --output_path ${pruning_result_path} \
24 | --do_eval \
25 | --pruned_model_config_file ${pruning_result_path}/pruned_model_configs/config.hymba_block.${eval_step}.json
26 | done
27 | ```
28 |
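29 | Each `config.hymba_block.${eval_step}.json` records the blocks removed up to that step. Assuming the per-step files follow the same schema as the final `pruning_config.json`, they can be inspected directly:
30 |
31 | ```python
32 | import json
33 |
34 | # List which Hymba blocks were removed after pruning step 5 (path from the commands above).
35 | with open("results/hymba-1.5b-base/pruned_model_configs/config.hymba_block.5.json") as f:
36 |     print(json.load(f)["pruned_hymba_block_idx"])
37 | ```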
--------------------------------------------------------------------------------
/Mamba-Shedder/hybrid/Hymba-Pruning/results/hymba-1.5b-base/eval.res.config.hymba_block.5.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 1522797824,
3 | "5cs_acc_avg": 62.3,
4 | "arc_challenge": 44.9,
5 | "arc_easy": 76.0,
6 | "hellaswag": 50.5,
7 | "piqa": 75.8,
8 | "winogrande": 64.1
9 | }
--------------------------------------------------------------------------------
/Mamba-Shedder/hybrid/Hymba-Pruning/results/hymba-1.5b-base/eval.res.config.hymba_block.6.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 1522797824,
3 | "5cs_acc_avg": 61.7,
4 | "arc_challenge": 43.9,
5 | "arc_easy": 74.8,
6 | "hellaswag": 49.9,
7 | "piqa": 74.9,
8 | "winogrande": 64.9
9 | }
--------------------------------------------------------------------------------
/Mamba-Shedder/hybrid/Hymba-Pruning/results/hymba-1.5b-base/eval.res.config.hymba_block.7.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 1522797824,
3 | "5cs_acc_avg": 60.5,
4 | "arc_challenge": 43.2,
5 | "arc_easy": 74.2,
6 | "hellaswag": 49.2,
7 | "piqa": 74.3,
8 | "winogrande": 61.5
9 | }
--------------------------------------------------------------------------------
/Mamba-Shedder/hybrid/Hymba-Pruning/results/hymba-1.5b-base/pruning_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "pruned_hymba_block_idx": [
3 | 2,
4 | 22,
5 | 15,
6 | 28,
7 | 10,
8 | 24,
9 | 26
10 | ]
11 | }
--------------------------------------------------------------------------------
/Mamba-Shedder/hybrid/Zamba2-Pruning/eval.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import logging
4 |
5 | from transformers import AutoModelForCausalLM, AutoTokenizer
6 |
7 | from lm_eval import evaluator
8 | from lm_eval.models.huggingface import HFLM
9 |
10 | TASKS = ["lambada_openai", "hellaswag", "piqa", "arc_easy", "arc_challenge", "winogrande", "openbookqa"]
11 |
12 |
13 | def main():
14 | parser = argparse.ArgumentParser()
15 | parser.add_argument(
16 | "--model_path",
17 | type=str,
18 | )
19 | args = parser.parse_args()
20 | model_path = args.model_path
21 |
22 | model = AutoModelForCausalLM.from_pretrained(
23 | model_path,
24 | device_map="cuda",
25 | torch_dtype="float16",
26 | trust_remote_code=True
27 | )
28 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
29 | lm = HFLM(pretrained=model, tokenizer=tokenizer, batch_size=64)
30 |
31 | # Evaluate on selected tasks
32 | logging.info(f"Selected Tasks: {TASKS}")
33 | results = evaluator.simple_evaluate(lm, tasks=TASKS, log_samples=False)['results']
34 |
35 | metric_vals = {}
36 | for task, result in results.items():
37 | # TODO: fix (all are `acc_norm,none`)
38 | res = result['acc,none'] if task == 'arc_easy' else result.get('acc_norm,none', result['acc,none'])
39 | metric_vals[task] = round(res * 100, 1)
40 | if task == "lambada_openai":
41 | metric_vals[task + "_ppl"] = result['perplexity,none']
42 |
43 | logging.info(json.dumps(metric_vals, indent=4))
44 |
45 |
46 | if __name__ == "__main__":
47 | main()
--------------------------------------------------------------------------------
/Mamba-Shedder/hybrid/Zamba2-Pruning/install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | set -x
4 |
5 | MAMBA_SHEDDER_ZAMBA2_PATH=$PWD
6 |
7 | cp ../../utils.py .
8 | cp ../../patches/mamba-62db608.patch ./patches
9 |
10 | pip install virtualenv
11 | virtualenv venv
12 | source venv/bin/activate
13 |
14 | pip install --upgrade pip setuptools wheel
15 | pip install torch==2.4.0
16 | pip install "causal-conv1d>=1.4.0"
17 |
18 | mkdir -pv ${MAMBA_SHEDDER_ZAMBA2_PATH}/third_party
19 |
20 | pushd ${MAMBA_SHEDDER_ZAMBA2_PATH}/third_party
21 | git clone https://github.com/state-spaces/mamba.git
22 | pushd mamba
23 | git checkout 62db608
24 | git apply --ignore-space-change --ignore-whitespace ${MAMBA_SHEDDER_ZAMBA2_PATH}/patches/mamba-62db608.patch
25 | pip install .
26 | pip install lm-eval==0.4.2
27 |
28 | pushd ${MAMBA_SHEDDER_ZAMBA2_PATH}/third_party
29 | git clone https://github.com/Zyphra/transformers_zamba2.git
30 | cd transformers_zamba2
31 | git checkout 7593823
32 | git apply --ignore-space-change --ignore-whitespace ${MAMBA_SHEDDER_ZAMBA2_PATH}/patches/zamba2-7593823.patch
33 | pip install -e . --no-deps
34 | pip install tokenizers==0.19.0 numpy==1.26.4 accelerate
35 | pushd ${MAMBA_SHEDDER_ZAMBA2_PATH}
36 |
37 | echo "Environment ready. Execute 'source venv/bin/activate' to run."
38 |
--------------------------------------------------------------------------------
/Mamba-Shedder/hybrid/Zamba2-Pruning/preprocess.py:
--------------------------------------------------------------------------------
1 | import json
2 | import torch
3 | from safetensors.torch import save_file
4 | from safetensors import safe_open
5 |
6 | NUM_G_LAYERS = 9
7 | NUM_MEM_BLOCKS = 2
8 |
9 |
10 | def preprocess_func(data):
11 | new_data = {}
12 | transformer_weight_data = {}
13 | for key, weight_tensor in data.items():
14 | if "model.blocks" in key:
15 | if "linear_fc1" in key and "lora_A" not in key:
16 | up_proj_weight_tensor, gate_proj_weight_tensor = weight_tensor.chunk(2, dim=0)
17 | transformer_weight_data[key.replace("linear_fc1", "linear_fc1_up")] = up_proj_weight_tensor.clone()
18 | transformer_weight_data[key.replace("linear_fc1", "linear_fc1_gate")] = gate_proj_weight_tensor.clone()
19 | elif "lora_A" in key:
20 | transformer_weight_data[key.replace("linear_fc1", "linear_fc1_up")] = weight_tensor.clone()
21 | transformer_weight_data[key.replace("linear_fc1", "linear_fc1_gate")] = weight_tensor.clone()
22 | else:
23 | transformer_weight_data[key] = weight_tensor.clone()
24 | else:
25 | new_data[key] = weight_tensor.clone()
26 |
27 | for num_layer in range(NUM_G_LAYERS):
28 | if num_layer % NUM_MEM_BLOCKS == 0:
29 | block_id = 0
30 | else:
31 | block_id = 1
32 | cur_data = {k: v for k, v in transformer_weight_data.items() if f"model.blocks.{block_id}" in k}
33 | for k, v in cur_data.items():
34 | if "lora" not in k:
35 | new_data[k.replace(f"model.blocks.{block_id}", f"model.blocks.{num_layer}")] = v.clone()
36 | elif f"_lora_B_list.{num_layer}" in k: # lora
37 | lora_B = v.clone()
38 | lora_A = cur_data[k.replace("lora_B", "lora_A")]
39 | lora_BA = torch.matmul(lora_B, lora_A)
40 | new_data[k.replace(f"model.blocks.{block_id}", f"model.blocks.{num_layer}").replace(f"_lora_B_list.{num_layer}", "")] += lora_BA.clone()
41 | return new_data
42 |
43 | def load_safetensors(filename):
44 | tensors = {}
45 | with safe_open(filename, framework="pt", device=0) as f:
46 | metadata = f.metadata()
47 | for k in f.keys():
48 | tensors[k] = f.get_tensor(k)
49 | return tensors, metadata
50 |
51 | def save_safetensors(data, metadata, filename):
52 | save_file(data, filename, metadata=metadata)
53 |
54 | new_weight_map = {}
55 |
56 | data, metadata = load_safetensors("./Zamba2-2.7B/model-00001-of-00002.safetensors")
57 | new_data = preprocess_func(data)
58 | save_safetensors(new_data, metadata, "./Zamba2-2.7B/model-00001-of-00002.safetensors")
59 | for key in new_data.keys():
60 | new_weight_map[key] = "model-00001-of-00002.safetensors"
61 |
62 | data, metadata = load_safetensors("./Zamba2-2.7B/model-00002-of-00002.safetensors")
63 | new_data = preprocess_func(data)
64 | save_safetensors(new_data, metadata, "./Zamba2-2.7B/model-00002-of-00002.safetensors")
65 | for key in new_data.keys():
66 | new_weight_map[key] = "model-00002-of-00002.safetensors"
67 |
68 | with open("./Zamba2-2.7B/model.safetensors.index.json", "r") as f:
69 | data = json.load(f)
70 | data["weight_map"] = new_weight_map
71 | with open("./Zamba2-2.7B/model.safetensors.index.json", "w") as f:
72 | json.dump(data, f, indent=4)
73 |
--------------------------------------------------------------------------------
/Mamba-Shedder/hybrid/Zamba2-Pruning/results/README.md:
--------------------------------------------------------------------------------
1 | ## Run Command (Zamba2)
2 |
3 | Here are the commands to reproduce the main results of the paper.
4 |
5 | ### Zamba2-2.7B
6 |
7 | #### Pruning Ratio: 10%
8 |
9 | ```bash
10 | pruning_result_path=results/zamba2-2.7b
11 |
12 | # Multi-granularity Pruning
13 | python prune_hybrid.py \
14 | --model_path Zamba2-2.7B \
15 | --output_path ${pruning_result_path} \
16 | --do_prune \
17 | --target_block_pruning_steps 3 \
18 | --target_width_pruning_steps 20 \
19 | --target_ssm_pruning_steps 18 \
20 | --mlp_channel_group_size 1024 \
21 | --importance_metric ppl \
22 | --calibration_dataset alpaca \
23 | --num_calibration_samples_block 256 \
24 | --num_calibration_samples_width 256 \
25 | --num_calibration_samples_ssm 256
26 |
27 | # Evaluation: w/o SSM Pruning
28 | python prune_hybrid.py \
29 | --model_path Zamba2-2.7B \
30 | --output_path ${pruning_result_path} \
31 | --do_eval \
32 | --pruned_model_config_file ${pruning_result_path}/pruned_model_configs/config.mlp_width.22.json
33 |
34 | # Evaluation: w/ SSM Pruning
35 | python prune_hybrid.py \
36 | --model_path Zamba2-2.7B \
37 | --output_path ${pruning_result_path} \
38 | --do_eval \
39 | --pruned_model_config_file ${pruning_result_path}/pruned_model_configs/config.ssm.40.json
40 | ```
41 |
42 | #### Pruning Ratio: 15%
43 |
44 | ```bash
45 | pruning_result_path=results/zamba2-2.7b
46 |
47 | # Multi-granularity Pruning
48 | python prune_hybrid.py \
49 | --model_path Zamba2-2.7B \
50 | --output_path ${pruning_result_path} \
51 | --do_prune \
52 | --target_block_pruning_steps 8 \
53 | --target_width_pruning_steps 20 \
54 | --target_ssm_pruning_steps 18 \
55 | --mlp_channel_group_size 1024 \
56 | --importance_metric ppl \
57 | --calibration_dataset alpaca \
58 | --num_calibration_samples_block 256 \
59 | --num_calibration_samples_width 256 \
60 | --num_calibration_samples_ssm 256
61 |
62 | # Evaluation: w/o SSM Pruning
63 | python prune_hybrid.py \
64 | --model_path Zamba2-2.7B \
65 | --output_path ${pruning_result_path} \
66 | --do_eval \
67 | --pruned_model_config_file ${pruning_result_path}/pruned_model_configs/config.mlp_width.27.json
68 |
69 | # Evaluation: w/ SSM Pruning
70 | python prune_hybrid.py \
71 | --model_path Zamba2-2.7B \
72 | --output_path ${pruning_result_path} \
73 | --do_eval \
74 | --pruned_model_config_file ${pruning_result_path}/pruned_model_configs/config.ssm.45.json
75 | ```
76 |
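77 | Note: the step indices in the evaluated config filenames appear to be cumulative and 0-indexed across pruning granularities. For the 10% run, 3 block steps followed by 20 width steps put the last width step at index 3 + 20 - 1 = 22 (`config.mlp_width.22.json`), and 18 further SSM steps put the final step at index 3 + 20 + 18 - 1 = 40 (`config.ssm.40.json`). The 15% run matches: 8 + 20 - 1 = 27 and 8 + 20 + 18 - 1 = 45.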
--------------------------------------------------------------------------------
/Mamba-Shedder/hybrid/Zamba2-Pruning/results/zamba2-2.7b/ratio_10/eval.res.config.ssm.40.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 3828481440,
3 | "7cs_acc_avg": 65.9,
4 | "openbookqa": 46.6,
5 | "winogrande": 69.5,
6 | "arc_challenge": 48.699999999999996,
7 | "arc_easy": 79.0,
8 | "piqa": 80.0,
9 | "hellaswag": 73.9,
10 | "lambada_openai": 63.4,
11 | "lambada_openai_ppl": 5.180797687033141
12 | }
--------------------------------------------------------------------------------
/Mamba-Shedder/hybrid/Zamba2-Pruning/results/zamba2-2.7b/ratio_10/pruning_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "pruned_mamba_block_idx": [],
3 | "pruned_mha_idx": [],
4 | "pruned_mlp_idx": [
5 | 6,
6 | 8,
7 | 7
8 | ],
9 | "pruned_mlp_channels": {
10 | "0": 3072,
11 | "1": 4096,
12 | "2": 8192,
13 | "3": 10240,
14 | "4": 6144,
15 | "5": 9216,
16 | "6": 10240,
17 | "7": 10240,
18 | "8": 10240
19 | },
20 | "pruned_ssm_idx": [
21 | 47,
22 | 42,
23 | 48,
24 | 43,
25 | 45,
26 | 37,
27 | 51,
28 | 40,
29 | 44,
30 | 46,
31 | 38,
32 | 41,
33 | 53,
34 | 49,
35 | 33,
36 | 31,
37 | 34,
38 | 50
39 | ]
40 | }
--------------------------------------------------------------------------------
/Mamba-Shedder/hybrid/Zamba2-Pruning/results/zamba2-2.7b/ratio_15/eval.res.config.ssm.45.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 3828481440,
3 | "7cs_acc_avg": 61.3,
4 | "openbookqa": 42.8,
5 | "winogrande": 67.7,
6 | "arc_challenge": 41.8,
7 | "arc_easy": 73.4,
8 | "piqa": 77.9,
9 | "hellaswag": 68.89999999999999,
10 | "lambada_openai": 56.49999999999999,
11 | "lambada_openai_ppl": 7.437938064270495
12 | }
--------------------------------------------------------------------------------
/Mamba-Shedder/hybrid/Zamba2-Pruning/results/zamba2-2.7b/ratio_15/pruning_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "pruned_mamba_block_idx": [
3 | 13,
4 | 21,
5 | 44,
6 | 22,
7 | 28
8 | ],
9 | "pruned_mha_idx": [],
10 | "pruned_mlp_idx": [
11 | 6,
12 | 8,
13 | 7
14 | ],
15 | "pruned_mlp_channels": {
16 | "0": 3072,
17 | "1": 4096,
18 | "2": 7168,
19 | "3": 10240,
20 | "4": 6144,
21 | "5": 10240,
22 | "6": 10240,
23 | "7": 10240,
24 | "8": 10240
25 | },
26 | "pruned_ssm_idx": [
27 | 47,
28 | 51,
29 | 48,
30 | 43,
31 | 45,
32 | 37,
33 | 41,
34 | 40,
35 | 42,
36 | 53,
37 | 46,
38 | 49,
39 | 38,
40 | 34,
41 | 50,
42 | 52,
43 | 33,
44 | 31
45 | ]
46 | }
--------------------------------------------------------------------------------
/Mamba-Shedder/install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | set -x
4 |
5 | MAMBA_SHEDDER_PATH=$PWD
6 |
7 | pip install virtualenv
8 | virtualenv venv
9 | source venv/bin/activate
10 |
11 | pip install --upgrade pip setuptools wheel
12 | pip install torch==2.4.0
13 | pip install transformers==4.43.0
14 | pip install "causal-conv1d>=1.4.0"
15 |
16 | mkdir -pv ${MAMBA_SHEDDER_PATH}/third_party
17 |
18 | pushd ${MAMBA_SHEDDER_PATH}/third_party
19 | git clone https://github.com/state-spaces/mamba.git
20 | pushd mamba
21 | git checkout 62db608
22 | git apply --ignore-space-change --ignore-whitespace ${MAMBA_SHEDDER_PATH}/patches/mamba-62db608.patch
23 | pip install .
24 | pushd ${MAMBA_SHEDDER_PATH}
25 |
26 | pip install lm-eval==0.4.2
27 | echo "Environment ready. Execute 'source venv/bin/activate' to run."
28 |
--------------------------------------------------------------------------------
/Mamba-Shedder/results/README.md:
--------------------------------------------------------------------------------
1 | ## Run Command (Mamba and Mamba2)
2 |
3 | Here are the commands to reproduce the main results of the paper.
4 |
5 | ### Mamba-2.8B
6 |
7 | ```bash
8 | pruning_result_path=results/mamba-2.8b
9 |
10 | # Mamba Block Pruning
11 | python prune.py \
12 | --model_path state-spaces/mamba-2.8b \
13 | --prune_target mamba_block \
14 | --output_path ${pruning_result_path} \
15 | --do_prune \
16 | --target_pruning_steps 14 \
17 | --importance_metric ppl \
18 | --calibration_dataset alpaca \
19 | --num_calibration_samples 256
20 |
21 | # Evaluation for different steps
22 | for eval_step in 6 13; do
23 | python prune.py \
24 | --model_path state-spaces/mamba-2.8b \
25 | --output_path ${pruning_result_path} \
26 | --do_eval \
27 | --pruned_model_config_file ${pruning_result_path}/pruned_model_configs/config.mamba_block.${eval_step}.json
28 | done
29 | ```
30 |
31 | ### Mamba2-2.7B
32 |
33 | ```bash
34 | pruning_result_path=results/mamba2-2.7b
35 |
36 | # SSM Pruning
37 | python prune.py \
38 | --model_path state-spaces/mamba2-2.7b \
39 | --prune_target ssm \
40 | --output_path ${pruning_result_path} \
41 | --do_prune \
42 | --target_pruning_steps 24 \
43 | --importance_metric ppl \
44 | --calibration_dataset alpaca \
45 | --num_calibration_samples 256
46 |
47 | # Evaluation for different steps
48 | for eval_step in 15 19 21 23; do
49 | python prune.py \
50 | --model_path state-spaces/mamba2-2.7b \
51 | --output_path ${pruning_result_path} \
52 | --do_eval \
53 | --pruned_model_config_file ${pruning_result_path}/pruned_model_configs/config.ssm.${eval_step}.json
54 | done
55 | ```
56 |
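57 | Both runs use `--importance_metric ppl`: candidates (blocks or SSMs) are ranked by how little their removal degrades perplexity on the calibration set, and the least harmful removals are applied step by step. A toy sketch of such a greedy loop (hypothetical `compute_ppl`/`ablate` stand-ins, not this repo's API):
58 |
59 | ```python
60 | def greedy_prune(candidates, compute_ppl, ablate, num_steps):
61 |     pruned = []
62 |     for _ in range(num_steps):
63 |         # Remove the candidate whose ablation increases calibration PPL the least.
64 |         best = min(candidates, key=lambda c: compute_ppl(ablate(pruned + [c])))
65 |         pruned.append(best)
66 |         candidates.remove(best)
67 |     return pruned
68 |
69 | # Toy demo: pretend PPL grows by each removed block's "importance".
70 | importance = {0: 0.9, 1: 0.1, 2: 0.5, 3: 0.05}
71 | compute_ppl = lambda removed: 10 + sum(importance[i] for i in removed)
72 | ablate = lambda removed: removed
73 | print(greedy_prune(list(importance), compute_ppl, ablate, num_steps=2))  # [3, 1]
74 | ```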
--------------------------------------------------------------------------------
/Mamba-Shedder/results/mamba-2.8b/eval.res.config.mamba_block.13.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 2768345600,
3 | "7cs_acc_avg": 53.800000000000004,
4 | "openbookqa": 33.2,
5 | "winogrande": 61.1,
6 | "arc_challenge": 32.0,
7 | "arc_easy": 62.7,
8 | "piqa": 71.0,
9 | "hellaswag": 57.599999999999994,
10 | "lambada_openai": 58.9,
11 | "lambada_openai_ppl": 7.50530338198477
12 | }
--------------------------------------------------------------------------------
/Mamba-Shedder/results/mamba-2.8b/eval.res.config.mamba_block.6.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 2768345600,
3 | "7cs_acc_avg": 57.8,
4 | "openbookqa": 37.0,
5 | "winogrande": 62.5,
6 | "arc_challenge": 33.5,
7 | "arc_easy": 68.0,
8 | "piqa": 73.7,
9 | "hellaswag": 63.7,
10 | "lambada_openai": 65.8,
11 | "lambada_openai_ppl": 4.943333997537422
12 | }
--------------------------------------------------------------------------------
/Mamba-Shedder/results/mamba-2.8b/pruning_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "pruned_mamba_block_idx": [
3 | 2,
4 | 6,
5 | 12,
6 | 5,
7 | 10,
8 | 8,
9 | 13,
10 | 11,
11 | 26,
12 | 48,
13 | 19,
14 | 55,
15 | 15,
16 | 3
17 | ]
18 | }
--------------------------------------------------------------------------------
/Mamba-Shedder/results/mamba2-2.7b/eval.res.config.ssm.15.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 2702599680,
3 | "7cs_acc_avg": 59.8,
4 | "openbookqa": 39.2,
5 | "winogrande": 64.0,
6 | "arc_challenge": 37.2,
7 | "arc_easy": 68.60000000000001,
8 | "piqa": 76.4,
9 | "hellaswag": 66.10000000000001,
10 | "lambada_openai": 66.9,
11 | "lambada_openai_ppl": 4.267932476614268
12 | }
--------------------------------------------------------------------------------
/Mamba-Shedder/results/mamba2-2.7b/eval.res.config.ssm.19.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 2702599680,
3 | "7cs_acc_avg": 58.599999999999994,
4 | "openbookqa": 39.2,
5 | "winogrande": 63.6,
6 | "arc_challenge": 36.7,
7 | "arc_easy": 68.89999999999999,
8 | "piqa": 76.1,
9 | "hellaswag": 66.0,
10 | "lambada_openai": 59.8,
11 | "lambada_openai_ppl": 5.897567054174858
12 | }
--------------------------------------------------------------------------------
/Mamba-Shedder/results/mamba2-2.7b/eval.res.config.ssm.21.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 2702599680,
3 | "7cs_acc_avg": 57.8,
4 | "openbookqa": 38.0,
5 | "winogrande": 62.9,
6 | "arc_challenge": 36.5,
7 | "arc_easy": 68.30000000000001,
8 | "piqa": 75.7,
9 | "hellaswag": 65.60000000000001,
10 | "lambada_openai": 57.599999999999994,
11 | "lambada_openai_ppl": 6.498095929049319
12 | }
--------------------------------------------------------------------------------
/Mamba-Shedder/results/mamba2-2.7b/eval.res.config.ssm.23.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 2702599680,
3 | "7cs_acc_avg": 55.50000000000001,
4 | "openbookqa": 38.0,
5 | "winogrande": 62.9,
6 | "arc_challenge": 36.6,
7 | "arc_easy": 67.10000000000001,
8 | "piqa": 74.8,
9 | "hellaswag": 65.8,
10 | "lambada_openai": 43.4,
11 | "lambada_openai_ppl": 14.95706208912779
12 | }
--------------------------------------------------------------------------------
/Mamba-Shedder/results/mamba2-2.7b/pruning_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "pruned_ssm_idx": [
3 | 63,
4 | 54,
5 | 42,
6 | 45,
7 | 53,
8 | 57,
9 | 58,
10 | 59,
11 | 38,
12 | 56,
13 | 50,
14 | 61,
15 | 60,
16 | 43,
17 | 37,
18 | 62,
19 | 49,
20 | 34,
21 | 55,
22 | 33,
23 | 39,
24 | 35,
25 | 44,
26 | 46
27 | ]
28 | }
--------------------------------------------------------------------------------
/MultiPruner/eval.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import logging
4 | import argparse
5 |
6 | from transformers import AutoModelForCausalLM, AutoTokenizer
7 |
8 | from lm_eval import evaluator
9 | from lm_eval.models.huggingface import HFLM
10 |
11 | import utils
12 |
13 |
14 | def main():
15 | parser = argparse.ArgumentParser()
16 | parser.add_argument("--model_path", type=str)
17 | parser.add_argument("--output_path", type=str)
18 | args = parser.parse_args()
19 | model_path = args.model_path
20 | output_path = args.output_path
21 |
22 | # Ensure the output directory exists
23 | if not os.path.exists(output_path):
24 | os.makedirs(output_path)
25 |
26 | model = AutoModelForCausalLM.from_pretrained(
27 | model_path,
28 | device_map="auto",
29 | torch_dtype="float16",
30 | trust_remote_code=True,
31 | )
32 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
33 |
34 | # Evaluate on wikitext2 dataset
35 | dataset = utils.get_dataset("wikitext2")
36 | test_dataset = dataset["test"]
37 | test_loader = utils.prepare_test_dataloader(
38 | dataset=test_dataset,
39 | tokenizer=tokenizer,
40 | seqlen=2048,
41 | batch_size=1
42 | )
43 | dataset_ppl = utils.evaluate_ppl(
44 | model=model,
45 | dataloader=test_loader,
46 | pad_token_id=model.config.eos_token_id,
47 | )
48 | dataset_ppl = round(dataset_ppl, 2)
49 | logging.info(f'wikitext2 PPL: {dataset_ppl}')
50 |
51 | # Evaluate on selected tasks
52 | hflm = HFLM(pretrained=model, tokenizer=tokenizer, batch_size=64)
53 |
54 | task_names = ["piqa", "winogrande", "hellaswag", "arc_easy", "arc_challenge"]
55 | logging.info(f"Selected Tasks: {task_names}")
56 |
57 | results = evaluator.simple_evaluate(hflm, tasks=task_names, num_fewshot=0, batch_size=64, log_samples=False)['results']
58 |
59 | metric_vals = {task: round(result.get('acc_norm,none', result['acc,none']) * 100, 2) for task, result in results.items()}
60 | logging.info(json.dumps(metric_vals, indent=4))
61 |
62 | def calculate_avg_accuracy(task_names, results):
63 | n_tasks = len(task_names)
64 | acc_cumul = sum(result.get('acc_norm,none', result['acc,none']) for task, result in results.items())
65 | return round(acc_cumul / n_tasks * 100, 2)
66 |
67 | acc_avg = calculate_avg_accuracy(task_names, results)
68 | logging.info(f"Average accuracy across tasks: {acc_avg}")
69 |
70 | # Save evaluation results
71 | overall_results = {
72 | "ppl_wikitext2": dataset_ppl,
73 | "5cs_acc_avg": acc_avg,
74 | **metric_vals
75 | }
76 | eval_result_path = os.path.join(output_path, "eval.res.json")
77 | with open(eval_result_path, "w") as f:
78 | json.dump(overall_results, f, indent=4)
79 |
80 |
81 | if __name__ == "__main__":
82 | main()
83 |
--------------------------------------------------------------------------------
/MultiPruner/extract/README.md:
--------------------------------------------------------------------------------
1 | ## Extract the Compressed Model from MultiPruner
2 |
3 | The final compressed model can be extracted based on the optimal pruning configuration obtained from **MultiPruner**.
4 | Here is an example command for the compressed Llama-2-7B:
5 |
6 | ```bash
7 | python extract/extract_model.py \
8 | --model_path meta-llama/Llama-2-7b-hf \
9 | --weight_reorder \
10 | --pruned_model_config_file <pruning_result_path>/pruning_config.json \
11 | --output_path <output_path>
12 | ```
13 |
14 | - `model_path`: Path to the pre-trained model.
15 | - `weight_reorder`: Flag to indicate whether to perform weight reordering.
16 | - `pruned_model_config_file`: JSON file for the pruned model configuration.
17 | - `output_path`: Directory to save the compressed model.
18 |
--------------------------------------------------------------------------------
/MultiPruner/install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | set -x
4 |
5 | MULTIPRUNER_PATH=$PWD
6 |
7 | python3.10 -m venv venv
8 | source venv/bin/activate
9 |
10 | mkdir -pv third_party
11 | pushd third_party
12 |
13 | git clone https://github.com/huggingface/transformers.git
14 | pushd transformers
15 | git checkout v4.45.0
16 | git apply --ignore-space-change --ignore-whitespace ${MULTIPRUNER_PATH}/patches/transformers-v4.45.0.patch
17 | pip install -e .
18 |
19 | pushd ${MULTIPRUNER_PATH}
20 |
21 | pip install -r requirements.txt
22 |
23 | echo "Environment ready. Execute 'source venv/bin/activate' to run."
24 |
25 |
--------------------------------------------------------------------------------
/MultiPruner/recovery/README.md:
--------------------------------------------------------------------------------
1 | ### Recovery Fine-tuning after Pruning
2 |
3 | After obtaining the compressed model ([here](../extract)), we can finetune it to recover accuracy.
4 | The dataset used for finetuning is [Alpaca](https://huggingface.co/datasets/yahma/alpaca-cleaned).
5 | Here is an example command:
6 |
7 | ```bash
8 | python recovery/finetune.py \
9 | --model_path <compressed_model_path> \
10 | --do_train \
11 | --batch_size 8 \
12 | --gradient_accumulation_steps 4 \
13 | --max_steps 3000 \
14 | --learning_rate 1e-4 \
15 | --lora \
16 | --lora_r 16 \
17 | --lora_alpha 32 \
18 | --lora_target_modules q_proj,k_proj,v_proj,o_proj,down_proj,up_proj,gate_proj \
19 | --output_path <output_path> \
20 | --do_eval
21 | ```
22 |
23 | After fine-tuning, we can merge the trained adapter into the pruned base model. Note that `recovery/merge.py` drops any `missing_keys` reported when loading the pruned base model, since those weights were re-initialized rather than loaded from the pruned checkpoint.
24 |
25 | ```bash
26 | python recovery/merge.py \
27 | --base_model_path <compressed_model_path> \
28 | --adapter_model_path <adapter_model_path> \
29 | --output_path <output_path>
30 | ```
31 |
--------------------------------------------------------------------------------
/MultiPruner/recovery/merge.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from peft import PeftModel
4 | from transformers import AutoModelForCausalLM, AutoTokenizer
5 |
6 |
7 | def main():
8 | parser = argparse.ArgumentParser()
9 | parser.add_argument("--base_model_path", type=str)
10 | parser.add_argument("--adapter_model_path", type=str)
11 | parser.add_argument("--output_path", type=str)
12 | args = parser.parse_args()
13 | base_model_path = args.base_model_path
14 | adapter_model_path = args.adapter_model_path
15 | output_path = args.output_path
16 |
17 | base_model, loading_info = AutoModelForCausalLM.from_pretrained(
18 | base_model_path,
19 | device_map={"": 0},
20 | trust_remote_code=True,
21 | torch_dtype="float16",
22 | output_loading_info=True,
23 | )
24 | model = PeftModel.from_pretrained(base_model, adapter_model_path, device_map={"": 0})
25 | model.eval()
26 | merged_model = model.merge_and_unload()
27 | merged_model.train(False)
28 |
29 | sd = merged_model.state_dict()
30 |
31 | if len(loading_info["missing_keys"]) > 0:
32 | for key in loading_info["missing_keys"]:
33 | del sd[key]
34 |
35 | base_model.save_pretrained(output_path, state_dict=sd)
36 | tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True)
37 | tokenizer.save_pretrained(output_path)
38 |
39 |
40 | if __name__ == "__main__":
41 | main()
42 |
--------------------------------------------------------------------------------
/MultiPruner/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy<2.0.0
2 | setuptools==70.0.0
3 | datasets
4 | accelerate
5 | sentencepiece
6 | protobuf
7 | bitsandbytes
8 | lm-eval==0.4.2
9 | torch==2.3.1
10 |
--------------------------------------------------------------------------------
/MultiPruner/results/Baichuan2-13B-Base/ratio_24/eval.res.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 13896668160,
3 | "pruned_params": 10559575040,
4 | "ratio": 24.013620254712908,
5 | "ppl_wikitext2": 10.99,
6 | "5cs_acc_avg": 59.12,
7 | "arc_challenge": 37.2,
8 | "arc_easy": 57.95,
9 | "hellaswag": 64.03,
10 | "winogrande": 66.46,
11 | "piqa": 69.97
12 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Baichuan2-13B-Base/ratio_24/pruning_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "pruned_attn_idx": [
3 | 33,
4 | 31,
5 | 26,
6 | 32,
7 | 34,
8 | 27,
9 | 6,
10 | 29,
11 | 28
12 | ],
13 | "pruned_mlp_idx": [
14 | 34,
15 | 6,
16 | 31,
17 | 35
18 | ],
19 | "pruned_attn_width": {
20 | "0": 5120,
21 | "1": 5120,
22 | "2": 4608,
23 | "3": 5120,
24 | "4": 4992,
25 | "5": 4864,
26 | "6": 5120,
27 | "7": 4864,
28 | "8": 5120,
29 | "9": 4992,
30 | "10": 5120,
31 | "11": 5120,
32 | "12": 5120,
33 | "13": 5120,
34 | "14": 4992,
35 | "15": 4992,
36 | "16": 4864,
37 | "17": 5120,
38 | "18": 4992,
39 | "19": 4864,
40 | "20": 4992,
41 | "21": 5120,
42 | "22": 4864,
43 | "23": 5120,
44 | "24": 5120,
45 | "25": 4480,
46 | "26": 5120,
47 | "27": 5120,
48 | "28": 5120,
49 | "29": 5120,
50 | "30": 4096,
51 | "31": 5120,
52 | "32": 5120,
53 | "33": 5120,
54 | "34": 5120,
55 | "35": 4608,
56 | "36": 4992,
57 | "37": 4992,
58 | "38": 4864,
59 | "39": 4480
60 | },
61 | "pruned_mlp_width": {
62 | "0": 8576,
63 | "1": 12672,
64 | "2": 9600,
65 | "3": 13696,
66 | "4": 12672,
67 | "5": 6528,
68 | "6": 13696,
69 | "7": 12672,
70 | "8": 12672,
71 | "9": 12672,
72 | "10": 12672,
73 | "11": 10624,
74 | "12": 12672,
75 | "13": 12672,
76 | "14": 9600,
77 | "15": 10624,
78 | "16": 12672,
79 | "17": 13696,
80 | "18": 12672,
81 | "19": 11648,
82 | "20": 11648,
83 | "21": 12672,
84 | "22": 13696,
85 | "23": 12672,
86 | "24": 9600,
87 | "25": 9600,
88 | "26": 6528,
89 | "27": 7552,
90 | "28": 6528,
91 | "29": 4480,
92 | "30": 13696,
93 | "31": 13696,
94 | "32": 4480,
95 | "33": 13696,
96 | "34": 13696,
97 | "35": 13696,
98 | "36": 13696,
99 | "37": 10624,
100 | "38": 13696,
101 | "39": 13696
102 | }
103 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Baichuan2-7B-Base/ratio_22/eval.res.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 7505973248,
3 | "pruned_params": 5824057344,
4 | "ratio": 22.407699154112414,
5 | "ppl_wikitext2": 12.37,
6 | "5cs_acc_avg": 57.599999999999994,
7 | "arc_challenge": 34.9,
8 | "arc_easy": 56.989999999999995,
9 | "hellaswag": 60.919999999999995,
10 | "winogrande": 64.72,
11 | "piqa": 70.46
12 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Baichuan2-7B-Base/ratio_22/pruning_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "pruned_attn_idx": [
3 | 4,
4 | 24,
5 | 26,
6 | 25,
7 | 21,
8 | 22,
9 | 27
10 | ],
11 | "pruned_mlp_idx": [
12 | 4,
13 | 26
14 | ],
15 | "pruned_attn_width": {
16 | "0": 3840,
17 | "1": 4096,
18 | "2": 3840,
19 | "3": 2560,
20 | "4": 4096,
21 | "5": 3584,
22 | "6": 3968,
23 | "7": 3968,
24 | "8": 3840,
25 | "9": 4096,
26 | "10": 4096,
27 | "11": 4096,
28 | "12": 4096,
29 | "13": 4096,
30 | "14": 4096,
31 | "15": 3968,
32 | "16": 3840,
33 | "17": 4096,
34 | "18": 3968,
35 | "19": 4096,
36 | "20": 3968,
37 | "21": 4096,
38 | "22": 4096,
39 | "23": 4096,
40 | "24": 4096,
41 | "25": 4096,
42 | "26": 4096,
43 | "27": 4096,
44 | "28": 4096,
45 | "29": 4096,
46 | "30": 4096,
47 | "31": 4096
48 | },
49 | "pruned_mlp_width": {
50 | "0": 11008,
51 | "1": 2816,
52 | "2": 11008,
53 | "3": 11008,
54 | "4": 11008,
55 | "5": 2816,
56 | "6": 11008,
57 | "7": 11008,
58 | "8": 11008,
59 | "9": 11008,
60 | "10": 11008,
61 | "11": 9984,
62 | "12": 11008,
63 | "13": 11008,
64 | "14": 11008,
65 | "15": 2816,
66 | "16": 11008,
67 | "17": 11008,
68 | "18": 11008,
69 | "19": 3840,
70 | "20": 11008,
71 | "21": 11008,
72 | "22": 1792,
73 | "23": 11008,
74 | "24": 768,
75 | "25": 11008,
76 | "26": 11008,
77 | "27": 11008,
78 | "28": 768,
79 | "29": 1792,
80 | "30": 11008,
81 | "31": 11008
82 | }
83 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Llama-2-13B/ratio_25/eval.res.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 13015864320,
3 | "pruned_params": 9760035840,
4 | "ratio": 25.01430869248643,
5 | "ppl_wikitext2": 7.19,
6 | "5cs_acc_avg": 68.64,
7 | "arc_challenge": 46.5,
8 | "arc_easy": 71.38,
9 | "hellaswag": 75.62,
10 | "winogrande": 71.89999999999999,
11 | "piqa": 77.8
12 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Llama-2-13B/ratio_25/pruning_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "pruned_attn_idx": [
3 | 27,
4 | 32,
5 | 29,
6 | 33,
7 | 23,
8 | 30,
9 | 31,
10 | 34,
11 | 25,
12 | 28,
13 | 26,
14 | 35,
15 | 24
16 | ],
17 | "pruned_mlp_idx": [
18 | 33
19 | ],
20 | "pruned_attn_width": {
21 | "0": 4864,
22 | "1": 3328,
23 | "2": 5120,
24 | "3": 5120,
25 | "4": 4480,
26 | "5": 4736,
27 | "6": 4992,
28 | "7": 5120,
29 | "8": 4736,
30 | "9": 5120,
31 | "10": 5120,
32 | "11": 5120,
33 | "12": 5120,
34 | "13": 5120,
35 | "14": 4992,
36 | "15": 5120,
37 | "16": 4864,
38 | "17": 4992,
39 | "18": 4992,
40 | "19": 4736,
41 | "20": 4864,
42 | "21": 4480,
43 | "22": 4608,
44 | "23": 5120,
45 | "24": 5120,
46 | "25": 5120,
47 | "26": 5120,
48 | "27": 5120,
49 | "28": 5120,
50 | "29": 5120,
51 | "30": 5120,
52 | "31": 5120,
53 | "32": 5120,
54 | "33": 5120,
55 | "34": 5120,
56 | "35": 5120,
57 | "36": 5120,
58 | "37": 5120,
59 | "38": 5120,
60 | "39": 5120
61 | },
62 | "pruned_mlp_width": {
63 | "0": 13824,
64 | "1": 13824,
65 | "2": 12800,
66 | "3": 13824,
67 | "4": 10752,
68 | "5": 9728,
69 | "6": 13824,
70 | "7": 13824,
71 | "8": 13824,
72 | "9": 13824,
73 | "10": 13824,
74 | "11": 13824,
75 | "12": 13824,
76 | "13": 13824,
77 | "14": 7680,
78 | "15": 7680,
79 | "16": 13824,
80 | "17": 13824,
81 | "18": 13824,
82 | "19": 13824,
83 | "20": 13824,
84 | "21": 10752,
85 | "22": 4608,
86 | "23": 13824,
87 | "24": 13824,
88 | "25": 13824,
89 | "26": 2560,
90 | "27": 3584,
91 | "28": 13824,
92 | "29": 1536,
93 | "30": 13824,
94 | "31": 1536,
95 | "32": 13824,
96 | "33": 13824,
97 | "34": 2560,
98 | "35": 2560,
99 | "36": 13824,
100 | "37": 13824,
101 | "38": 13824,
102 | "39": 13824
103 | }
104 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Llama-2-7B/ratio_10/eval.res.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 6738415616,
3 | "pruned_params": 6063132672,
4 | "ratio": 10.02139052385812,
5 | "ppl_wikitext2": 6.55,
6 | "5cs_acc_avg": 67.02,
7 | "arc_challenge": 44.45,
8 | "arc_easy": 71.0,
9 | "hellaswag": 74.07000000000001,
10 | "winogrande": 68.19,
11 | "piqa": 77.37
12 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Llama-2-7B/ratio_10/pruning_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "pruned_attn_idx": [
3 | 25,
4 | 27,
5 | 21,
6 | 23,
7 | 24
8 | ],
9 | "pruned_mlp_idx": [],
10 | "pruned_attn_width": {
11 | "0": 4096,
12 | "1": 3840,
13 | "2": 3840,
14 | "3": 4096,
15 | "4": 4096,
16 | "5": 3968,
17 | "6": 4096,
18 | "7": 4096,
19 | "8": 3968,
20 | "9": 4096,
21 | "10": 4096,
22 | "11": 4096,
23 | "12": 4096,
24 | "13": 4096,
25 | "14": 4096,
26 | "15": 4096,
27 | "16": 4096,
28 | "17": 3968,
29 | "18": 4096,
30 | "19": 3968,
31 | "20": 3968,
32 | "21": 4096,
33 | "22": 3968,
34 | "23": 4096,
35 | "24": 4096,
36 | "25": 4096,
37 | "26": 4096,
38 | "27": 4096,
39 | "28": 3968,
40 | "29": 4096,
41 | "30": 3968,
42 | "31": 4096
43 | },
44 | "pruned_mlp_width": {
45 | "0": 11008,
46 | "1": 11008,
47 | "2": 5888,
48 | "3": 11008,
49 | "4": 11008,
50 | "5": 11008,
51 | "6": 11008,
52 | "7": 9984,
53 | "8": 11008,
54 | "9": 11008,
55 | "10": 11008,
56 | "11": 9984,
57 | "12": 11008,
58 | "13": 11008,
59 | "14": 11008,
60 | "15": 11008,
61 | "16": 11008,
62 | "17": 11008,
63 | "18": 11008,
64 | "19": 11008,
65 | "20": 11008,
66 | "21": 11008,
67 | "22": 11008,
68 | "23": 1792,
69 | "24": 11008,
70 | "25": 11008,
71 | "26": 11008,
72 | "27": 1792,
73 | "28": 11008,
74 | "29": 11008,
75 | "30": 11008,
76 | "31": 11008
77 | }
78 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Llama-2-7B/ratio_12/eval.res.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 6738415616,
3 | "pruned_params": 5931012096,
4 | "ratio": 11.982097365482536,
5 | "ppl_wikitext2": 7.1,
6 | "5cs_acc_avg": 66.47999999999999,
7 | "arc_challenge": 44.03,
8 | "arc_easy": 69.82000000000001,
9 | "hellaswag": 73.77,
10 | "winogrande": 68.43,
11 | "piqa": 76.33
12 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Llama-2-7B/ratio_12/pruning_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "pruned_attn_idx": [
3 | 25,
4 | 27,
5 | 21,
6 | 23,
7 | 24,
8 | 29
9 | ],
10 | "pruned_mlp_idx": [],
11 | "pruned_attn_width": {
12 | "0": 4096,
13 | "1": 4096,
14 | "2": 3840,
15 | "3": 3968,
16 | "4": 4096,
17 | "5": 4096,
18 | "6": 4096,
19 | "7": 4096,
20 | "8": 3968,
21 | "9": 4096,
22 | "10": 4096,
23 | "11": 4096,
24 | "12": 4096,
25 | "13": 4096,
26 | "14": 4096,
27 | "15": 4096,
28 | "16": 3968,
29 | "17": 3968,
30 | "18": 4096,
31 | "19": 3968,
32 | "20": 3968,
33 | "21": 4096,
34 | "22": 3968,
35 | "23": 4096,
36 | "24": 4096,
37 | "25": 4096,
38 | "26": 4096,
39 | "27": 4096,
40 | "28": 3712,
41 | "29": 4096,
42 | "30": 3968,
43 | "31": 4096
44 | },
45 | "pruned_mlp_width": {
46 | "0": 11008,
47 | "1": 11008,
48 | "2": 5888,
49 | "3": 11008,
50 | "4": 11008,
51 | "5": 11008,
52 | "6": 11008,
53 | "7": 9984,
54 | "8": 11008,
55 | "9": 11008,
56 | "10": 11008,
57 | "11": 11008,
58 | "12": 11008,
59 | "13": 11008,
60 | "14": 11008,
61 | "15": 11008,
62 | "16": 11008,
63 | "17": 11008,
64 | "18": 11008,
65 | "19": 11008,
66 | "20": 11008,
67 | "21": 11008,
68 | "22": 11008,
69 | "23": 1792,
70 | "24": 11008,
71 | "25": 1792,
72 | "26": 11008,
73 | "27": 4864,
74 | "28": 11008,
75 | "29": 11008,
76 | "30": 11008,
77 | "31": 11008
78 | }
79 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Llama-2-7B/ratio_14/eval.res.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 6738415616,
3 | "pruned_params": 5796794368,
4 | "ratio": 13.973926537926385,
5 | "ppl_wikitext2": 7.56,
6 | "5cs_acc_avg": 65.93,
7 | "arc_challenge": 43.519999999999996,
8 | "arc_easy": 68.64,
9 | "hellaswag": 72.27,
10 | "winogrande": 67.96,
11 | "piqa": 77.25999999999999
12 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Llama-2-7B/ratio_14/pruning_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "pruned_attn_idx": [
3 | 25,
4 | 27,
5 | 21,
6 | 23,
7 | 24,
8 | 29,
9 | 28
10 | ],
11 | "pruned_mlp_idx": [],
12 | "pruned_attn_width": {
13 | "0": 4096,
14 | "1": 3712,
15 | "2": 3840,
16 | "3": 3840,
17 | "4": 4096,
18 | "5": 4096,
19 | "6": 4096,
20 | "7": 4096,
21 | "8": 3968,
22 | "9": 4096,
23 | "10": 4096,
24 | "11": 4096,
25 | "12": 4096,
26 | "13": 4096,
27 | "14": 4096,
28 | "15": 4096,
29 | "16": 3968,
30 | "17": 3712,
31 | "18": 3968,
32 | "19": 3968,
33 | "20": 4096,
34 | "21": 4096,
35 | "22": 3968,
36 | "23": 4096,
37 | "24": 4096,
38 | "25": 4096,
39 | "26": 4096,
40 | "27": 4096,
41 | "28": 4096,
42 | "29": 4096,
43 | "30": 4096,
44 | "31": 4096
45 | },
46 | "pruned_mlp_width": {
47 | "0": 11008,
48 | "1": 11008,
49 | "2": 5888,
50 | "3": 11008,
51 | "4": 11008,
52 | "5": 11008,
53 | "6": 11008,
54 | "7": 9984,
55 | "8": 11008,
56 | "9": 11008,
57 | "10": 11008,
58 | "11": 1792,
59 | "12": 11008,
60 | "13": 11008,
61 | "14": 11008,
62 | "15": 11008,
63 | "16": 11008,
64 | "17": 11008,
65 | "18": 11008,
66 | "19": 11008,
67 | "20": 11008,
68 | "21": 11008,
69 | "22": 11008,
70 | "23": 1792,
71 | "24": 11008,
72 | "25": 1792,
73 | "26": 11008,
74 | "27": 8960,
75 | "28": 11008,
76 | "29": 11008,
77 | "30": 11008,
78 | "31": 11008
79 | }
80 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Llama-2-7B/ratio_15/eval.res.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 6738415616,
3 | "pruned_params": 5727588352,
4 | "ratio": 15.000963454967753,
5 | "ppl_wikitext2": 7.66,
6 | "5cs_acc_avg": 65.25999999999999,
7 | "arc_challenge": 42.24,
8 | "arc_easy": 68.10000000000001,
9 | "hellaswag": 71.82,
10 | "winogrande": 67.4,
11 | "piqa": 76.77000000000001
12 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Llama-2-7B/ratio_15/pruning_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "pruned_attn_idx": [
3 | 25,
4 | 27,
5 | 21,
6 | 23,
7 | 24,
8 | 29,
9 | 28
10 | ],
11 | "pruned_mlp_idx": [],
12 | "pruned_attn_width": {
13 | "0": 4096,
14 | "1": 3712,
15 | "2": 3456,
16 | "3": 3840,
17 | "4": 4096,
18 | "5": 4096,
19 | "6": 4096,
20 | "7": 4096,
21 | "8": 3968,
22 | "9": 4096,
23 | "10": 4096,
24 | "11": 4096,
25 | "12": 4096,
26 | "13": 4096,
27 | "14": 4096,
28 | "15": 4096,
29 | "16": 3968,
30 | "17": 3712,
31 | "18": 3968,
32 | "19": 3968,
33 | "20": 4096,
34 | "21": 4096,
35 | "22": 3968,
36 | "23": 4096,
37 | "24": 4096,
38 | "25": 4096,
39 | "26": 4096,
40 | "27": 4096,
41 | "28": 4096,
42 | "29": 4096,
43 | "30": 4096,
44 | "31": 4096
45 | },
46 | "pruned_mlp_width": {
47 | "0": 11008,
48 | "1": 11008,
49 | "2": 5888,
50 | "3": 11008,
51 | "4": 11008,
52 | "5": 11008,
53 | "6": 11008,
54 | "7": 9984,
55 | "8": 11008,
56 | "9": 11008,
57 | "10": 11008,
58 | "11": 1792,
59 | "12": 11008,
60 | "13": 11008,
61 | "14": 11008,
62 | "15": 11008,
63 | "16": 11008,
64 | "17": 11008,
65 | "18": 11008,
66 | "19": 11008,
67 | "20": 11008,
68 | "21": 11008,
69 | "22": 11008,
70 | "23": 1792,
71 | "24": 11008,
72 | "25": 1792,
73 | "26": 11008,
74 | "27": 3840,
75 | "28": 11008,
76 | "29": 11008,
77 | "30": 11008,
78 | "31": 11008
79 | }
80 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Llama-2-7B/ratio_18/eval.res.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 6738415616,
3 | "pruned_params": 5524164608,
4 | "ratio": 18.01982954445296,
5 | "ppl_wikitext2": 8.62,
6 | "5cs_acc_avg": 64.2,
7 | "arc_challenge": 41.89,
8 | "arc_easy": 65.11,
9 | "hellaswag": 71.45,
10 | "winogrande": 66.61,
11 | "piqa": 75.94999999999999
12 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Llama-2-7B/ratio_18/pruning_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "pruned_attn_idx": [
3 | 25,
4 | 27,
5 | 21,
6 | 23,
7 | 24,
8 | 29,
9 | 28,
10 | 18
11 | ],
12 | "pruned_mlp_idx": [],
13 | "pruned_attn_width": {
14 | "0": 4096,
15 | "1": 3712,
16 | "2": 3456,
17 | "3": 3712,
18 | "4": 4096,
19 | "5": 4096,
20 | "6": 4096,
21 | "7": 4096,
22 | "8": 3968,
23 | "9": 4096,
24 | "10": 4096,
25 | "11": 4096,
26 | "12": 3840,
27 | "13": 4096,
28 | "14": 4096,
29 | "15": 4096,
30 | "16": 3968,
31 | "17": 3712,
32 | "18": 4096,
33 | "19": 3840,
34 | "20": 3968,
35 | "21": 4096,
36 | "22": 3968,
37 | "23": 4096,
38 | "24": 4096,
39 | "25": 4096,
40 | "26": 4096,
41 | "27": 4096,
42 | "28": 4096,
43 | "29": 4096,
44 | "30": 3968,
45 | "31": 4096
46 | },
47 | "pruned_mlp_width": {
48 | "0": 11008,
49 | "1": 11008,
50 | "2": 5888,
51 | "3": 11008,
52 | "4": 11008,
53 | "5": 11008,
54 | "6": 11008,
55 | "7": 9984,
56 | "8": 11008,
57 | "9": 11008,
58 | "10": 11008,
59 | "11": 768,
60 | "12": 11008,
61 | "13": 11008,
62 | "14": 11008,
63 | "15": 11008,
64 | "16": 11008,
65 | "17": 3840,
66 | "18": 11008,
67 | "19": 11008,
68 | "20": 11008,
69 | "21": 11008,
70 | "22": 11008,
71 | "23": 1792,
72 | "24": 11008,
73 | "25": 1792,
74 | "26": 11008,
75 | "27": 1792,
76 | "28": 11008,
77 | "29": 11008,
78 | "30": 11008,
79 | "31": 11008
80 | }
81 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Llama-2-7B/ratio_22/eval.res.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 6738415616,
3 | "pruned_params": 5258874880,
4 | "ratio": 21.956804393111508,
5 | "ppl_wikitext2": 9.33,
6 | "5cs_acc_avg": 62.83,
7 | "arc_challenge": 41.13,
8 | "arc_easy": 64.77000000000001,
9 | "hellaswag": 68.94,
10 | "winogrande": 64.64,
11 | "piqa": 74.65
12 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Llama-2-7B/ratio_22/pruning_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "pruned_attn_idx": [
3 | 25,
4 | 27,
5 | 24,
6 | 23,
7 | 21,
8 | 29,
9 | 28,
10 | 18,
11 | 8
12 | ],
13 | "pruned_mlp_idx": [
14 | 8
15 | ],
16 | "pruned_attn_width": {
17 | "0": 4096,
18 | "1": 3328,
19 | "2": 3328,
20 | "3": 4096,
21 | "4": 4096,
22 | "5": 3968,
23 | "6": 4096,
24 | "7": 4096,
25 | "8": 4096,
26 | "9": 4096,
27 | "10": 4096,
28 | "11": 4096,
29 | "12": 4096,
30 | "13": 4096,
31 | "14": 4096,
32 | "15": 4096,
33 | "16": 3968,
34 | "17": 3584,
35 | "18": 4096,
36 | "19": 3840,
37 | "20": 3968,
38 | "21": 4096,
39 | "22": 3968,
40 | "23": 4096,
41 | "24": 4096,
42 | "25": 4096,
43 | "26": 4096,
44 | "27": 4096,
45 | "28": 4096,
46 | "29": 4096,
47 | "30": 3968,
48 | "31": 4096
49 | },
50 | "pruned_mlp_width": {
51 | "0": 11008,
52 | "1": 11008,
53 | "2": 5888,
54 | "3": 11008,
55 | "4": 11008,
56 | "5": 11008,
57 | "6": 11008,
58 | "7": 6912,
59 | "8": 11008,
60 | "9": 11008,
61 | "10": 11008,
62 | "11": 11008,
63 | "12": 5888,
64 | "13": 11008,
65 | "14": 11008,
66 | "15": 11008,
67 | "16": 11008,
68 | "17": 3840,
69 | "18": 11008,
70 | "19": 11008,
71 | "20": 11008,
72 | "21": 11008,
73 | "22": 11008,
74 | "23": 1792,
75 | "24": 11008,
76 | "25": 1792,
77 | "26": 3840,
78 | "27": 1792,
79 | "28": 11008,
80 | "29": 11008,
81 | "30": 11008,
82 | "31": 11008
83 | }
84 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Llama-2-7B/ratio_7/eval.res.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 6738415616,
3 | "pruned_params": 6268653568,
4 | "ratio": 6.971402103553482,
5 | "ppl_wikitext2": 6.33,
6 | "5cs_acc_avg": 67.94,
7 | "arc_challenge": 44.62,
8 | "arc_easy": 73.44000000000001,
9 | "hellaswag": 74.32,
10 | "winogrande": 69.46,
11 | "piqa": 77.86
12 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Llama-2-7B/ratio_7/pruning_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "pruned_attn_idx": [
3 | 25,
4 | 27,
5 | 21,
6 | 23
7 | ],
8 | "pruned_mlp_idx": [],
9 | "pruned_attn_width": {
10 | "0": 4096,
11 | "1": 4096,
12 | "2": 3968,
13 | "3": 3712,
14 | "4": 4096,
15 | "5": 4096,
16 | "6": 4096,
17 | "7": 4096,
18 | "8": 4096,
19 | "9": 4096,
20 | "10": 4096,
21 | "11": 4096,
22 | "12": 4096,
23 | "13": 4096,
24 | "14": 4096,
25 | "15": 4096,
26 | "16": 4096,
27 | "17": 4096,
28 | "18": 4096,
29 | "19": 4096,
30 | "20": 4096,
31 | "21": 4096,
32 | "22": 3968,
33 | "23": 4096,
34 | "24": 4096,
35 | "25": 4096,
36 | "26": 4096,
37 | "27": 4096,
38 | "28": 4096,
39 | "29": 4096,
40 | "30": 3968,
41 | "31": 4096
42 | },
43 | "pruned_mlp_width": {
44 | "0": 11008,
45 | "1": 11008,
46 | "2": 8960,
47 | "3": 11008,
48 | "4": 11008,
49 | "5": 11008,
50 | "6": 11008,
51 | "7": 11008,
52 | "8": 11008,
53 | "9": 11008,
54 | "10": 11008,
55 | "11": 11008,
56 | "12": 11008,
57 | "13": 11008,
58 | "14": 11008,
59 | "15": 11008,
60 | "16": 11008,
61 | "17": 11008,
62 | "18": 11008,
63 | "19": 11008,
64 | "20": 11008,
65 | "21": 11008,
66 | "22": 11008,
67 | "23": 6912,
68 | "24": 11008,
69 | "25": 11008,
70 | "26": 11008,
71 | "27": 1792,
72 | "28": 11008,
73 | "29": 11008,
74 | "30": 11008,
75 | "31": 11008
76 | }
77 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Llama-3.1-8B/ratio_10/eval.res.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 8030261248,
3 | "pruned_model_params": 7224954880,
4 | "ppl_wikitext2": 8.93,
5 | "5cs_acc_avg": 69.27,
6 | "arc_challenge": 47.78,
7 | "arc_easy": 75.13,
8 | "hellaswag": 73.72999999999999,
9 | "winogrande": 71.27,
10 | "piqa": 78.45
11 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Llama-3.1-8B/ratio_10/pruning_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "pruned_attn_idx": [
3 | 25,
4 | 26,
5 | 20,
6 | 24,
7 | 23,
8 | 21
9 | ],
10 | "pruned_mlp_idx": [],
11 | "pruned_attn_width": {
12 | "0": 4096,
13 | "1": 4096,
14 | "2": 4096,
15 | "3": 4096,
16 | "4": 4096,
17 | "5": 4096,
18 | "6": 4096,
19 | "7": 4096,
20 | "8": 4096,
21 | "9": 4096,
22 | "10": 4096,
23 | "11": 4096,
24 | "12": 4096,
25 | "13": 4096,
26 | "14": 4096,
27 | "15": 4096,
28 | "16": 4096,
29 | "17": 4096,
30 | "18": 4096,
31 | "19": 4096,
32 | "20": 4096,
33 | "21": 4096,
34 | "22": 4096,
35 | "23": 4096,
36 | "24": 4096,
37 | "25": 4096,
38 | "26": 4096,
39 | "27": 4096,
40 | "28": 4096,
41 | "29": 4096,
42 | "30": 4096,
43 | "31": 4096
44 | },
45 | "pruned_mlp_width": {
46 | "0": 14336,
47 | "1": 14336,
48 | "2": 14336,
49 | "3": 14336,
50 | "4": 14336,
51 | "5": 14336,
52 | "6": 14336,
53 | "7": 9216,
54 | "8": 14336,
55 | "9": 14336,
56 | "10": 14336,
57 | "11": 9216,
58 | "12": 14336,
59 | "13": 14336,
60 | "14": 14336,
61 | "15": 12288,
62 | "16": 14336,
63 | "17": 14336,
64 | "18": 3072,
65 | "19": 14336,
66 | "20": 14336,
67 | "21": 14336,
68 | "22": 14336,
69 | "23": 4096,
70 | "24": 3072,
71 | "25": 14336,
72 | "26": 14336,
73 | "27": 14336,
74 | "28": 14336,
75 | "29": 14336,
76 | "30": 14336,
77 | "31": 14336
78 | }
79 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Llama-3.1-8B/ratio_17/eval.res.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 8030261248,
3 | "pruned_model_params": 6654529536,
4 | "ppl_wikitext2": 11.64,
5 | "5cs_acc_avg": 65.81,
6 | "arc_challenge": 43.519999999999996,
7 | "arc_easy": 68.52000000000001,
8 | "hellaswag": 69.46,
9 | "winogrande": 71.27,
10 | "piqa": 76.28
11 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Llama-3.1-8B/ratio_17/pruning_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "pruned_attn_idx": [
3 | 25,
4 | 26,
5 | 20,
6 | 24
7 | ],
8 | "pruned_mlp_idx": [],
9 | "pruned_attn_width": {
10 | "0": 4096,
11 | "1": 4096,
12 | "2": 4096,
13 | "3": 4096,
14 | "4": 4096,
15 | "5": 4096,
16 | "6": 4096,
17 | "7": 4096,
18 | "8": 4096,
19 | "9": 4096,
20 | "10": 4096,
21 | "11": 4096,
22 | "12": 4096,
23 | "13": 4096,
24 | "14": 4096,
25 | "15": 4096,
26 | "16": 4096,
27 | "17": 4096,
28 | "18": 4096,
29 | "19": 4096,
30 | "20": 4096,
31 | "21": 4096,
32 | "22": 4096,
33 | "23": 4096,
34 | "24": 4096,
35 | "25": 4096,
36 | "26": 4096,
37 | "27": 4096,
38 | "28": 4096,
39 | "29": 4096,
40 | "30": 4096,
41 | "31": 4096
42 | },
43 | "pruned_mlp_width": {
44 | "0": 14336,
45 | "1": 14336,
46 | "2": 14336,
47 | "3": 14336,
48 | "4": 14336,
49 | "5": 14336,
50 | "6": 14336,
51 | "7": 5120,
52 | "8": 14336,
53 | "9": 14336,
54 | "10": 14336,
55 | "11": 10240,
56 | "12": 14336,
57 | "13": 8192,
58 | "14": 14336,
59 | "15": 3072,
60 | "16": 13312,
61 | "17": 14336,
62 | "18": 6144,
63 | "19": 3072,
64 | "20": 14336,
65 | "21": 14336,
66 | "22": 14336,
67 | "23": 3072,
68 | "24": 3072,
69 | "25": 2048,
70 | "26": 2048,
71 | "27": 14336,
72 | "28": 14336,
73 | "29": 14336,
74 | "30": 14336,
75 | "31": 14336
76 | }
77 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Llama-3.1-8B/ratio_20/eval.res.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 8030261248,
3 | "pruned_model_params": 6423842816,
4 | "ppl_wikitext2": 13.86,
5 | "5cs_acc_avg": 63.07000000000001,
6 | "arc_challenge": 41.72,
7 | "arc_easy": 64.98,
8 | "hellaswag": 65.38000000000001,
9 | "winogrande": 68.97999999999999,
10 | "piqa": 74.27
11 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Llama-3.1-8B/ratio_20/pruning_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "pruned_attn_idx": [
3 | 25,
4 | 26,
5 | 20,
6 | 24,
7 | 23,
8 | 21,
9 | 27,
10 | 19
11 | ],
12 | "pruned_mlp_idx": [
13 | 25
14 | ],
15 | "pruned_attn_width": {
16 | "0": 4096,
17 | "1": 4096,
18 | "2": 4096,
19 | "3": 4096,
20 | "4": 4096,
21 | "5": 4096,
22 | "6": 4096,
23 | "7": 4096,
24 | "8": 4096,
25 | "9": 4096,
26 | "10": 4096,
27 | "11": 4096,
28 | "12": 4096,
29 | "13": 4096,
30 | "14": 4096,
31 | "15": 4096,
32 | "16": 4096,
33 | "17": 4096,
34 | "18": 4096,
35 | "19": 4096,
36 | "20": 4096,
37 | "21": 4096,
38 | "22": 4096,
39 | "23": 4096,
40 | "24": 4096,
41 | "25": 4096,
42 | "26": 4096,
43 | "27": 4096,
44 | "28": 4096,
45 | "29": 4096,
46 | "30": 4096,
47 | "31": 4096
48 | },
49 | "pruned_mlp_width": {
50 | "0": 14336,
51 | "1": 14336,
52 | "2": 14336,
53 | "3": 14336,
54 | "4": 14336,
55 | "5": 14336,
56 | "6": 14336,
57 | "7": 5120,
58 | "8": 14336,
59 | "9": 14336,
60 | "10": 12288,
61 | "11": 1024,
62 | "12": 14336,
63 | "13": 14336,
64 | "14": 14336,
65 | "15": 4096,
66 | "16": 14336,
67 | "17": 14336,
68 | "18": 1024,
69 | "19": 3072,
70 | "20": 8192,
71 | "21": 14336,
72 | "22": 14336,
73 | "23": 3072,
74 | "24": 2048,
75 | "25": 14336,
76 | "26": 14336,
77 | "27": 14336,
78 | "28": 14336,
79 | "29": 14336,
80 | "30": 14336,
81 | "31": 14336
82 | }
83 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Llama-3.2-3B/ratio_9/eval.res.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 3212749824,
3 | "pruned_model_params": 2921769984,
4 | "ppl_wikitext2": 10.46,
5 | "5cs_acc_avg": 64.03999999999999,
6 | "arc_challenge": 43.09,
7 | "arc_easy": 66.96,
8 | "hellaswag": 68.5,
9 | "winogrande": 66.85,
10 | "piqa": 74.81
11 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Llama-3.2-3B/ratio_9/pruning_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "pruned_attn_idx": [
3 | 23,
4 | 24
5 | ],
6 | "pruned_mlp_idx": [],
7 | "pruned_attn_width": {
8 | "0": 3072,
9 | "1": 3072,
10 | "2": 3072,
11 | "3": 3072,
12 | "4": 3072,
13 | "5": 3072,
14 | "6": 3072,
15 | "7": 3072,
16 | "8": 3072,
17 | "9": 3072,
18 | "10": 3072,
19 | "11": 3072,
20 | "12": 3072,
21 | "13": 3072,
22 | "14": 3072,
23 | "15": 3072,
24 | "16": 3072,
25 | "17": 3072,
26 | "18": 3072,
27 | "19": 3072,
28 | "20": 3072,
29 | "21": 3072,
30 | "22": 3072,
31 | "23": 3072,
32 | "24": 3072,
33 | "25": 3072,
34 | "26": 3072,
35 | "27": 3072
36 | },
37 | "pruned_mlp_width": {
38 | "0": 8192,
39 | "1": 8192,
40 | "2": 8192,
41 | "3": 4352,
42 | "4": 8192,
43 | "5": 8192,
44 | "6": 8192,
45 | "7": 8192,
46 | "8": 8192,
47 | "9": 8192,
48 | "10": 512,
49 | "11": 8192,
50 | "12": 8192,
51 | "13": 8192,
52 | "14": 8192,
53 | "15": 7424,
54 | "16": 8192,
55 | "17": 8192,
56 | "18": 8192,
57 | "19": 8192,
58 | "20": 8192,
59 | "21": 2816,
60 | "22": 2048,
61 | "23": 8192,
62 | "24": 8192,
63 | "25": 5888,
64 | "26": 8192,
65 | "27": 8192
66 | }
67 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Meta-Llama-3-8B/ratio_10/eval.res.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 8030261248,
3 | "pruned_model_params": 7220760576,
4 | "ppl_wikitext2": 8.19,
5 | "5cs_acc_avg": 69.03,
6 | "arc_challenge": 48.120000000000005,
7 | "arc_easy": 71.3,
8 | "hellaswag": 75.08,
9 | "winogrande": 71.67,
10 | "piqa": 79.0
11 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Meta-Llama-3-8B/ratio_10/pruning_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "pruned_attn_idx": [
3 | 26,
4 | 24,
5 | 23,
6 | 20
7 | ],
8 | "pruned_mlp_idx": [],
9 | "pruned_attn_width": {
10 | "0": 4096,
11 | "1": 4096,
12 | "2": 4096,
13 | "3": 4096,
14 | "4": 4096,
15 | "5": 4096,
16 | "6": 4096,
17 | "7": 4096,
18 | "8": 4096,
19 | "9": 4096,
20 | "10": 4096,
21 | "11": 4096,
22 | "12": 4096,
23 | "13": 4096,
24 | "14": 4096,
25 | "15": 4096,
26 | "16": 4096,
27 | "17": 4096,
28 | "18": 4096,
29 | "19": 4096,
30 | "20": 4096,
31 | "21": 4096,
32 | "22": 4096,
33 | "23": 4096,
34 | "24": 4096,
35 | "25": 4096,
36 | "26": 4096,
37 | "27": 4096,
38 | "28": 4096,
39 | "29": 4096,
40 | "30": 4096,
41 | "31": 4096
42 | },
43 | "pruned_mlp_width": {
44 | "0": 14336,
45 | "1": 14336,
46 | "2": 14336,
47 | "3": 14336,
48 | "4": 14336,
49 | "5": 14336,
50 | "6": 14336,
51 | "7": 8192,
52 | "8": 14336,
53 | "9": 14336,
54 | "10": 14336,
55 | "11": 14336,
56 | "12": 14336,
57 | "13": 1024,
58 | "14": 14336,
59 | "15": 14336,
60 | "16": 14336,
61 | "17": 14336,
62 | "18": 12288,
63 | "19": 14336,
64 | "20": 14336,
65 | "21": 14336,
66 | "22": 4096,
67 | "23": 5120,
68 | "24": 14336,
69 | "25": 14336,
70 | "26": 14336,
71 | "27": 14336,
72 | "28": 3072,
73 | "29": 14336,
74 | "30": 14336,
75 | "31": 14336
76 | }
77 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Meta-Llama-3-8B/ratio_17/eval.res.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 8030261248,
3 | "pruned_model_params": 6654529536,
4 | "ppl_wikitext2": 11.11,
5 | "5cs_acc_avg": 64.4,
6 | "arc_challenge": 42.58,
7 | "arc_easy": 64.64999999999999,
8 | "hellaswag": 68.97,
9 | "winogrande": 69.53,
10 | "piqa": 76.28
11 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Meta-Llama-3-8B/ratio_17/pruning_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "pruned_attn_idx": [
3 | 26,
4 | 24,
5 | 23,
6 | 20
7 | ],
8 | "pruned_mlp_idx": [],
9 | "pruned_attn_width": {
10 | "0": 4096,
11 | "1": 4096,
12 | "2": 4096,
13 | "3": 4096,
14 | "4": 4096,
15 | "5": 4096,
16 | "6": 4096,
17 | "7": 4096,
18 | "8": 4096,
19 | "9": 4096,
20 | "10": 4096,
21 | "11": 4096,
22 | "12": 4096,
23 | "13": 4096,
24 | "14": 4096,
25 | "15": 4096,
26 | "16": 4096,
27 | "17": 4096,
28 | "18": 4096,
29 | "19": 4096,
30 | "20": 4096,
31 | "21": 4096,
32 | "22": 4096,
33 | "23": 4096,
34 | "24": 4096,
35 | "25": 4096,
36 | "26": 4096,
37 | "27": 4096,
38 | "28": 4096,
39 | "29": 4096,
40 | "30": 4096,
41 | "31": 4096
42 | },
43 | "pruned_mlp_width": {
44 | "0": 14336,
45 | "1": 14336,
46 | "2": 14336,
47 | "3": 14336,
48 | "4": 14336,
49 | "5": 14336,
50 | "6": 14336,
51 | "7": 3072,
52 | "8": 14336,
53 | "9": 14336,
54 | "10": 14336,
55 | "11": 10240,
56 | "12": 14336,
57 | "13": 1024,
58 | "14": 14336,
59 | "15": 5120,
60 | "16": 13312,
61 | "17": 14336,
62 | "18": 3072,
63 | "19": 14336,
64 | "20": 14336,
65 | "21": 14336,
66 | "22": 3072,
67 | "23": 3072,
68 | "24": 14336,
69 | "25": 11264,
70 | "26": 3072,
71 | "27": 14336,
72 | "28": 3072,
73 | "29": 14336,
74 | "30": 14336,
75 | "31": 14336
76 | }
77 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Meta-Llama-3-8B/ratio_20/eval.res.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 8030261248,
3 | "pruned_model_params": 6423842816,
4 | "ppl_wikitext2": 16.01,
5 | "5cs_acc_avg": 63.019999999999996,
6 | "arc_challenge": 41.21,
7 | "arc_easy": 63.09,
8 | "hellaswag": 67.42,
9 | "winogrande": 69.61,
10 | "piqa": 73.78
11 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Meta-Llama-3-8B/ratio_20/pruning_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "pruned_attn_idx": [
3 | 26,
4 | 24,
5 | 23,
6 | 20,
7 | 21,
8 | 27,
9 | 22,
10 | 29
11 | ],
12 | "pruned_mlp_idx": [],
13 | "pruned_attn_width": {
14 | "0": 4096,
15 | "1": 4096,
16 | "2": 4096,
17 | "3": 4096,
18 | "4": 4096,
19 | "5": 4096,
20 | "6": 4096,
21 | "7": 4096,
22 | "8": 4096,
23 | "9": 4096,
24 | "10": 4096,
25 | "11": 4096,
26 | "12": 4096,
27 | "13": 4096,
28 | "14": 4096,
29 | "15": 4096,
30 | "16": 4096,
31 | "17": 4096,
32 | "18": 4096,
33 | "19": 4096,
34 | "20": 4096,
35 | "21": 4096,
36 | "22": 4096,
37 | "23": 4096,
38 | "24": 4096,
39 | "25": 4096,
40 | "26": 4096,
41 | "27": 4096,
42 | "28": 4096,
43 | "29": 4096,
44 | "30": 4096,
45 | "31": 4096
46 | },
47 | "pruned_mlp_width": {
48 | "0": 14336,
49 | "1": 14336,
50 | "2": 14336,
51 | "3": 14336,
52 | "4": 14336,
53 | "5": 13312,
54 | "6": 14336,
55 | "7": 3072,
56 | "8": 5120,
57 | "9": 14336,
58 | "10": 14336,
59 | "11": 14336,
60 | "12": 14336,
61 | "13": 1024,
62 | "14": 14336,
63 | "15": 14336,
64 | "16": 14336,
65 | "17": 14336,
66 | "18": 3072,
67 | "19": 1024,
68 | "20": 5120,
69 | "21": 14336,
70 | "22": 14336,
71 | "23": 3072,
72 | "24": 14336,
73 | "25": 3072,
74 | "26": 2048,
75 | "27": 14336,
76 | "28": 14336,
77 | "29": 14336,
78 | "30": 14336,
79 | "31": 14336
80 | }
81 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Qwen1.5-14B/ratio_24/eval.res.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 14167290880,
3 | "pruned_params": 10836096000,
4 | "ratio": 23.513280755057096,
5 | "ppl_wikitext2": 12.94,
6 | "5cs_acc_avg": 62.41,
7 | "arc_challenge": 41.21,
8 | "arc_easy": 69.19,
9 | "hellaswag": 63.260000000000005,
10 | "winogrande": 62.980000000000004,
11 | "piqa": 75.41
12 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Qwen1.5-14B/ratio_24/pruning_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "pruned_attn_idx": [
3 | 13,
4 | 26,
5 | 35,
6 | 32,
7 | 18,
8 | 8,
9 | 31,
10 | 36,
11 | 4
12 | ],
13 | "pruned_mlp_idx":[
14 | 4,
15 | 7,
16 | 18
17 | ],
18 | "pruned_attn_width": {
19 | "0": 4992,
20 | "1": 4480,
21 | "2": 4736,
22 | "3": 4096,
23 | "4": 5120,
24 | "5": 4864,
25 | "6": 4992,
26 | "7": 5120,
27 | "8": 5120,
28 | "9": 5120,
29 | "10": 5120,
30 | "11": 5120,
31 | "12": 5120,
32 | "13": 5120,
33 | "14": 5120,
34 | "15": 5120,
35 | "16": 4992,
36 | "17": 5120,
37 | "18": 5120,
38 | "19": 5120,
39 | "20": 4992,
40 | "21": 5120,
41 | "22": 5120,
42 | "23": 5120,
43 | "24": 4992,
44 | "25": 5120,
45 | "26": 5120,
46 | "27": 5120,
47 | "28": 5120,
48 | "29": 4608,
49 | "30": 4608,
50 | "31": 5120,
51 | "32": 5120,
52 | "33": 4224,
53 | "34": 4096,
54 | "35": 5120,
55 | "36": 5120,
56 | "37": 5120,
57 | "38": 5120,
58 | "39": 5120
59 | },
60 | "pruned_mlp_width": {
61 | "0": 13696,
62 | "1": 13696,
63 | "2": 13696,
64 | "3": 1408,
65 | "4": 13696,
66 | "5": 11648,
67 | "6": 13696,
68 | "7": 13696,
69 | "8": 3456,
70 | "9": 7552,
71 | "10": 13696,
72 | "11": 13696,
73 | "12": 13696,
74 | "13": 13696,
75 | "14": 13696,
76 | "15": 7552,
77 | "16": 13696,
78 | "17": 7552,
79 | "18": 13696,
80 | "19": 13696,
81 | "20": 11648,
82 | "21": 13696,
83 | "22": 13696,
84 | "23": 9600,
85 | "24": 13696,
86 | "25": 13696,
87 | "26": 13696,
88 | "27": 13696,
89 | "28": 3456,
90 | "29": 13696,
91 | "30": 1408,
92 | "31": 13696,
93 | "32": 1408,
94 | "33": 1408,
95 | "34": 13696,
96 | "35": 3456,
97 | "36": 13696,
98 | "37": 13696,
99 | "38": 13696,
100 | "39": 13696
101 | }
102 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Qwen1.5-7B/ratio_22/eval.res.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 7721324544,
3 | "pruned_params": 6037311488,
4 | "ratio": 21.809898630780832,
5 | "ppl_wikitext2": 18.22,
6 | "5cs_acc_avg": 57.379999999999995,
7 | "arc_challenge": 35.92,
8 | "arc_easy": 59.01,
9 | "hellaswag": 60.6,
10 | "winogrande": 59.589999999999996,
11 | "piqa": 71.76
12 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Qwen1.5-7B/ratio_22/pruning_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "pruned_attn_idx": [
3 | 19,
4 | 6,
5 | 27,
6 | 24,
7 | 26,
8 | 5,
9 | 22,
10 | 25
11 | ],
12 | "pruned_mlp_idx": [
13 | 6,
14 | 5
15 | ],
16 | "pruned_attn_width": {
17 | "0": 3968,
18 | "1": 4096,
19 | "2": 3968,
20 | "3": 3968,
21 | "4": 4096,
22 | "5": 4096,
23 | "6": 4096,
24 | "7": 3968,
25 | "8": 4096,
26 | "9": 3712,
27 | "10": 3968,
28 | "11": 4096,
29 | "12": 4096,
30 | "13": 4096,
31 | "14": 4096,
32 | "15": 3968,
33 | "16": 4096,
34 | "17": 4096,
35 | "18": 3584,
36 | "19": 4096,
37 | "20": 4096,
38 | "21": 3968,
39 | "22": 4096,
40 | "23": 3968,
41 | "24": 4096,
42 | "25": 4096,
43 | "26": 4096,
44 | "27": 4096,
45 | "28": 4096,
46 | "29": 2560,
47 | "30": 4096,
48 | "31": 3968
49 | },
50 | "pruned_mlp_width": {
51 | "32": 11008,
52 | "33": 11008,
53 | "34": 11008,
54 | "35": 1792,
55 | "36": 11008,
56 | "37": 11008,
57 | "38": 11008,
58 | "39": 11008,
59 | "40": 11008,
60 | "41": 11008,
61 | "42": 4864,
62 | "43": 11008,
63 | "44": 11008,
64 | "45": 2816,
65 | "46": 11008,
66 | "47": 11008,
67 | "48": 11008,
68 | "49": 11008,
69 | "50": 768,
70 | "51": 11008,
71 | "52": 11008,
72 | "53": 1792,
73 | "54": 11008,
74 | "55": 11008,
75 | "56": 11008,
76 | "57": 7936,
77 | "58": 768,
78 | "59": 768,
79 | "60": 11008,
80 | "61": 11008,
81 | "62": 11008,
82 | "63": 11008
83 | }
84 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Qwen2.5-7B/ratio_10/eval.res.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 7615616512,
3 | "pruned_model_params": 6852230144,
4 | "ppl_wikitext2": 9.15,
5 | "5cs_acc_avg": 69.71000000000001,
6 | "arc_challenge": 54.269999999999996,
7 | "arc_easy": 80.81,
8 | "hellaswag": 73.31,
9 | "winogrande": 62.43,
10 | "piqa": 77.75
11 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Qwen2.5-7B/ratio_10/pruning_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "pruned_attn_idx": [
3 | 13,
4 | 5,
5 | 24,
6 | 11,
7 | 2
8 | ],
9 | "pruned_mlp_idx": [
10 | 12,
11 | 13
12 | ],
13 | "pruned_attn_width": {
14 | "0": 3584,
15 | "1": 3584,
16 | "2": 3584,
17 | "3": 3584,
18 | "4": 3584,
19 | "5": 3584,
20 | "6": 3584,
21 | "7": 3584,
22 | "8": 3584,
23 | "9": 3584,
24 | "10": 3584,
25 | "11": 3584,
26 | "12": 3584,
27 | "13": 3584,
28 | "14": 3584,
29 | "15": 3584,
30 | "16": 3584,
31 | "17": 3584,
32 | "18": 3584,
33 | "19": 3584,
34 | "20": 3584,
35 | "21": 3584,
36 | "22": 3584,
37 | "23": 3584,
38 | "24": 3584,
39 | "25": 3584,
40 | "26": 3584,
41 | "27": 3584
42 | },
43 | "pruned_mlp_width": {
44 | "0": 13824,
45 | "1": 11776,
46 | "2": 14848,
47 | "3": 18944,
48 | "4": 18944,
49 | "5": 18944,
50 | "6": 18944,
51 | "7": 18944,
52 | "8": 18944,
53 | "9": 18944,
54 | "10": 18944,
55 | "11": 18944,
56 | "12": 18944,
57 | "13": 18944,
58 | "14": 18944,
59 | "15": 18944,
60 | "16": 16896,
61 | "17": 18944,
62 | "18": 18944,
63 | "19": 17920,
64 | "20": 18944,
65 | "21": 18944,
66 | "22": 18944,
67 | "23": 18944,
68 | "24": 18944,
69 | "25": 18944,
70 | "26": 18944,
71 | "27": 18944
72 | }
73 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Qwen2.5-7B/ratio_20/eval.res.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_params": 7615616512,
3 | "pruned_model_params": 6090697216,
4 | "ppl_wikitext2": 13.37,
5 | "5cs_acc_avg": 62.82,
6 | "arc_challenge": 43.86,
7 | "arc_easy": 70.54,
8 | "hellaswag": 66.73,
9 | "winogrande": 58.8,
10 | "piqa": 74.16
11 | }
--------------------------------------------------------------------------------
/MultiPruner/results/Qwen2.5-7B/ratio_20/pruning_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "pruned_attn_idx": [
3 | 13,
4 | 5,
5 | 24,
6 | 11,
7 | 2,
8 | 14
9 | ],
10 | "pruned_mlp_idx": [
11 | 12,
12 | 13,
13 | 5
14 | ],
15 | "pruned_attn_width": {
16 | "0": 3584,
17 | "1": 3584,
18 | "2": 3584,
19 | "3": 3584,
20 | "4": 3584,
21 | "5": 3584,
22 | "6": 3584,
23 | "7": 3584,
24 | "8": 3584,
25 | "9": 3584,
26 | "10": 3584,
27 | "11": 3584,
28 | "12": 3584,
29 | "13": 3584,
30 | "14": 3584,
31 | "15": 3584,
32 | "16": 3584,
33 | "17": 3584,
34 | "18": 3584,
35 | "19": 3584,
36 | "20": 3584,
37 | "21": 3584,
38 | "22": 3584,
39 | "23": 3584,
40 | "24": 3584,
41 | "25": 3584,
42 | "26": 3584,
43 | "27": 3584
44 | },
45 | "pruned_mlp_width": {
46 | "0": 14848,
47 | "1": 10752,
48 | "2": 14848,
49 | "3": 17920,
50 | "4": 18944,
51 | "5": 18944,
52 | "6": 15872,
53 | "7": 18944,
54 | "8": 18944,
55 | "9": 18944,
56 | "10": 18944,
57 | "11": 18944,
58 | "12": 18944,
59 | "13": 18944,
60 | "14": 9728,
61 | "15": 7680,
62 | "16": 9728,
63 | "17": 1536,
64 | "18": 18944,
65 | "19": 18944,
66 | "20": 18944,
67 | "21": 18944,
68 | "22": 17920,
69 | "23": 18944,
70 | "24": 18944,
71 | "25": 18944,
72 | "26": 18944,
73 | "27": 18944
74 | }
75 | }
--------------------------------------------------------------------------------
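A note on the MultiPruner result files above: `pruned_params` (or `pruned_model_params`) is the parameter count of the model that remains after pruning, and `ratio` is the percentage of parameters removed. A minimal sketch for recomputing a reported ratio (the path is illustrative; field names are as in the files above):

```python
import json

with open("MultiPruner/results/Llama-2-7B/ratio_18/eval.res.json") as f:
    res = json.load(f)

# Share of parameters removed, in percent (18.0198... for this file).
ratio = (1 - res["pruned_params"] / res["total_params"]) * 100
assert abs(ratio - res["ratio"]) < 1e-6
```
--------------------------------------------------------------------------------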
/SQFT/install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | set -x
4 |
5 | pip install 'numpy<2.0.0' setuptools==69.5.1 wheel
6 | pip install transformers==4.47.0
7 |
8 | # peft
9 | SQFT_PATH=$PWD
10 | mkdir third_party && cd third_party
11 | git clone https://github.com/huggingface/peft.git
12 | cd peft && git checkout v0.10.0 && git apply --ignore-space-change --ignore-whitespace ${SQFT_PATH}/patches/peft-v0.10.0.patch && pip install -e . && cd ..
13 |
14 | pip install datasets accelerate sentencepiece protobuf
15 | pip install optimum --no-deps
16 | pip install git+https://github.com/AutoGPTQ/AutoGPTQ@866b4c8
17 |
18 | # lm-eval-harness
19 | pip install lm-eval==0.4.2
20 |
--------------------------------------------------------------------------------
/SQFT/legacy/install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | set -x
4 |
5 | pip install 'numpy<2.0.0' setuptools==69.5.1 wheel
6 |
7 | SQFT_PATH=$PWD
8 | mkdir third_party && cd third_party
9 |
10 | # transformers
11 | git clone https://github.com/huggingface/transformers.git
12 | cd transformers && git checkout v4.44.2 && git apply --ignore-space-change --ignore-whitespace ${SQFT_PATH}/patches/transformers-v4.44.2.patch && pip install -e . && cd ..
13 |
14 | # peft
15 | git clone https://github.com/huggingface/peft.git
16 | cd peft && git checkout v0.10.0 && git apply --ignore-space-change --ignore-whitespace ${SQFT_PATH}/patches/peft-v0.10.0.patch && pip install -e . && cd ..
17 |
18 | pip install datasets accelerate sentencepiece protobuf
19 | pip install optimum==1.18.0 --no-deps
20 | pip install git+https://github.com/AutoGPTQ/AutoGPTQ@866b4c8
21 |
22 | # nncf
23 | git clone https://github.com/openvinotoolkit/nncf.git
24 | cd nncf && git checkout f143e1c && git apply --ignore-space-change --ignore-whitespace ${SQFT_PATH}/patches/nncf-f143e1c.patch && pip install -e . && cd ..
25 |
26 | # lm-eval-harness
27 | pip install lm-eval==0.4.2
28 |
--------------------------------------------------------------------------------
/SQFT/legacy/install_inference.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | set -x
4 |
5 | pip install 'numpy<2.0.0' setuptools==69.5.1 wheel
6 |
7 | # transformers
8 | pip install transformers==4.44.2
9 | pip install datasets accelerate sentencepiece protobuf
10 | pip install optimum==1.18.0 --no-deps
11 | pip install git+https://github.com/AutoGPTQ/AutoGPTQ@866b4c8
12 |
13 | # peft
14 | SQFT_PATH=$PWD
15 | mkdir third_party_inference && cd third_party_inference
16 | git clone https://github.com/huggingface/peft.git
17 | cd peft && git checkout v0.10.0 && git apply --ignore-space-change --ignore-whitespace ${SQFT_PATH}/patches/peft-v0.10.0.patch && pip install -e . && cd ..
18 |
19 | # lm-eval-harness (for evaluation)
20 | pip install lm-eval==0.4.2
21 |
--------------------------------------------------------------------------------
/SQFT/legacy/opea/Dockerfile:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2024 Intel Corporation
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | # Use the same Python version as Ray
5 | FROM python:3.10.14
6 |
7 | ARG HF_TOKEN
8 |
9 | ENV HF_TOKEN=$HF_TOKEN
10 |
11 | RUN useradd -m -s /bin/bash user && \
12 | mkdir -p /home/user && \
13 | chown -R user /home/user/
14 |
15 | COPY comps /home/user/comps
16 |
17 | RUN chown -R user /home/user/comps/finetuning
18 |
19 | USER user
20 |
21 | ENV PATH=$PATH:/home/user/.local/bin
22 |
23 | RUN python -m pip install --no-cache-dir --upgrade pip && \
24 | python -m pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu && \
25 | python -m pip install --no-cache-dir intel-extension-for-pytorch && \
26 | python -m pip install --no-cache-dir oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ && \
27 | python -m pip install --no-cache-dir -r /home/user/comps/finetuning/requirements.txt
28 |
29 | WORKDIR /home/user/comps/finetuning
30 |
31 | RUN git clone https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning.git && \
32 | cp -r Hardware-Aware-Automated-Machine-Learning/SQFT/patches /home/user/comps/finetuning/patches && \
33 | rm -rf Hardware-Aware-Automated-Machine-Learning && \
34 | mkdir third_party
35 |
36 | # Clone and set up transformers
37 | RUN git clone https://github.com/huggingface/transformers.git third_party/transformers && \
38 | cd third_party/transformers && \
39 | git checkout v4.44.2 && \
40 | git apply --ignore-space-change --ignore-whitespace /home/user/comps/finetuning/patches/transformers-v4.44.2.patch && \
41 | pip install -e .
42 |
43 | # Clone and set up peft
44 | RUN git clone https://github.com/huggingface/peft.git third_party/peft && \
45 | cd third_party/peft && \
46 | git checkout v0.10.0 && \
47 | git apply --ignore-space-change --ignore-whitespace /home/user/comps/finetuning/patches/peft-v0.10.0.patch && \
48 | pip install -e .
49 |
50 | # Clone and set up nncf
51 | RUN git clone https://github.com/openvinotoolkit/nncf.git third_party/nncf && \
52 | cd third_party/nncf && \
53 | git checkout f143e1c && \
54 | git apply --ignore-space-change --ignore-whitespace /home/user/comps/finetuning/patches/nncf-f143e1c.patch && \
55 | pip install -e .
56 |
57 | ENV PYTHONPATH=$PYTHONPATH:/home/user
58 |
59 | RUN echo PKGPATH=$(python3 -c "import pkg_resources; print(pkg_resources.get_distribution('oneccl-bind-pt').location)") >> run.sh && \
60 | echo 'export LD_LIBRARY_PATH=$PKGPATH/oneccl_bindings_for_pytorch/opt/mpi/lib/:$LD_LIBRARY_PATH' >> run.sh && \
61 | echo 'source $PKGPATH/oneccl_bindings_for_pytorch/env/setvars.sh' >> run.sh && \
62 | echo ray start --head --dashboard-host=0.0.0.0 >> run.sh && \
63 | echo export RAY_ADDRESS=http://localhost:8265 >> run.sh && \
64 | echo python finetuning_service.py >> run.sh
65 |
66 | CMD bash run.sh
67 |
--------------------------------------------------------------------------------
/SQFT/legacy/opea/dataset/preprocess_arc.py:
--------------------------------------------------------------------------------
1 | import json
2 | from datasets import load_dataset
3 |
4 | def process_arc_document(document):
5 | """Add prompt to ARC dataset document.
6 |
7 | Args:
8 | document (dict): A dictionary containing the ARC dataset document.
9 |
10 | Returns:
11 | dict: The document with the added prompt.
12 | """
13 |
14 | instruction = document["question"]
15 | # The answer key, choice labels, and choice texts are re-extracted inside
16 | # _process_output below (which also normalizes numeric answer keys), so no
17 | # separate local bindings are needed here.
18 |
19 | def _process_output(document):
20 | """Process the ARC document to extract relevant fields."""
21 | num_to_letter = {"1": "A", "2": "B", "3": "C", "4": "D", "5": "E"}
22 | document["answerKey"] = num_to_letter.get(document["answerKey"], document["answerKey"])
23 | processed_output = {
24 | "choices": document["choices"]["text"],
25 | "gold": ["A", "B", "C", "D", "E"].index(document["answerKey"]),
26 | }
27 | return processed_output
28 |
29 |
30 | processed_output = _process_output(document)
31 | answer = processed_output["choices"][processed_output["gold"]]
32 | new_entry = {
33 | "instruction": instruction,
34 | "input": "",
35 | "output": answer
36 | }
37 | return new_entry
38 |
39 |
40 | dataset = load_dataset("ai2_arc", "ARC-Easy", split="train")
41 | new_data = [process_arc_document(doc) for doc in dataset]
42 |
43 | print(len(new_data))
44 |
45 | with open("arce_train_instruct.json", "w", encoding="utf-8") as f:
46 | json.dump(new_data, f, ensure_ascii=False, indent=4)
47 |
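48 | # Illustrative sanity check (an assumption, not part of the original pipeline):
49 | # reload the generated file and confirm every entry follows the
50 | # instruction/input/output schema produced above.
51 | #
52 | # with open("arce_train_instruct.json", encoding="utf-8") as f:
53 | #     entries = json.load(f)
54 | # assert all({"instruction", "input", "output"} <= set(e) for e in entries)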
--------------------------------------------------------------------------------
/SQFT/legacy/patches/wanda-8e8fc87.patch:
--------------------------------------------------------------------------------
1 | diff --git a/lib/data.py b/lib/data.py
2 | index b6842c4..ce2b55c 100644
3 | --- a/lib/data.py
4 | +++ b/lib/data.py
5 | @@ -40,8 +40,8 @@ def get_wikitext2(nsamples, seed, seqlen, tokenizer):
6 | # Load and process c4 dataset
7 | def get_c4(nsamples, seed, seqlen, tokenizer):
8 | # Load train and validation datasets
9 | - traindata = load_dataset('allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train')
10 | - valdata = load_dataset('allenai/c4', 'allenai--c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation')
11 | + traindata = load_dataset('allenai/c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train')
12 | + valdata = load_dataset('allenai/c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation')
13 |
14 | # Generate samples from training set
15 | random.seed(seed)
16 | diff --git a/lib/prune.py b/lib/prune.py
17 | index 01d981c..b772908 100644
18 | --- a/lib/prune.py
19 | +++ b/lib/prune.py
20 | @@ -141,7 +141,11 @@ def prune_wanda(args, model, tokenizer, device=torch.device("cuda:0"), prune_n=0
21 |
22 | if f"model.layers.{i}" in model.hf_device_map: ## handle the case for llama-30B and llama-65B, when the device map has multiple GPUs;
23 | dev = model.hf_device_map[f"model.layers.{i}"]
24 | - inps, outs, attention_mask, position_ids = inps.to(dev), outs.to(dev), attention_mask.to(dev), position_ids.to(dev)
25 | + inps, outs = inps.to(dev), outs.to(dev)
26 | + if attention_mask is not None:
27 | + attention_mask = attention_mask.to(dev)
28 | + if position_ids is not None:
29 | + position_ids = position_ids.to(dev)
30 |
31 | wrapped_layers = {}
32 | for name in subset:
33 | diff --git a/main.py b/main.py
34 | index a94583c..2d5cbec 100644
35 | --- a/main.py
36 | +++ b/main.py
37 | @@ -22,7 +22,20 @@ def get_llm(model_name, cache_dir="llm_weights"):
38 | device_map="auto"
39 | )
40 |
41 | - model.seqlen = model.config.max_position_embeddings
42 | + if not hasattr(model.config, 'max_position_embeddings'):
43 | + raise AttributeError(
44 | + "model.config does not have `max_position_embeddings`, please check the attribute name for the maximum length. "
45 | + "You may need to modify the code accordingly."
46 | + )
47 | + else:
48 | + if model.config.max_position_embeddings > 8192:
49 | + # such as mistralai/Mistral-7B-v0.3 ("max_position_embeddings": 32768)
50 | + model.seqlen = 8192
51 | + print(
52 | + "The maximum length supported by this model is large, setting the maximum length for calibration samples to 8192."
53 | + )
54 | + else:
55 | + model.seqlen = model.config.max_position_embeddings
56 | return model
57 |
58 | def main():
59 |
--------------------------------------------------------------------------------
/SQFT/legacy/run_command/README.md:
--------------------------------------------------------------------------------
1 | ### Run command
2 |
3 | Prepare the datasets from [LLM-Adapters](https://github.com/AGI-Edgerunners/LLM-Adapters) for our math instruction-tuning setting.
4 | ```bash
5 | git clone https://github.com/AGI-Edgerunners/LLM-Adapters.git
6 | mv LLM-Adapters/dataset/ datasets/
7 | mv LLM-Adapters/ft-training_set/* datasets/
8 | ```
9 |
10 | #### Llama-3
11 |
12 | ```bash
13 | bash run_command/llama-3-8b/sparse_quantization.sh $SPARSITY # e.g., SPARSITY=50
14 | bash run_command/llama-3-8b/run.sh $SPARSITY
15 | ```
16 |
17 | #### Mistral-v0.3
18 |
19 | ```bash
20 | bash run_command/mistral-7b-v0.3/sparse_quantization.sh $SPARSITY
21 | bash run_command/mistral-7b-v0.3/run.sh $SPARSITY $TASK
22 | ```
23 | Supported tasks: `gsm8k` and `math`.
24 |
25 | #### Phi-3
26 |
27 | ```bash
28 | bash run_command/phi-3-mini-4k-instruct/sparse_quantization.sh $SPARSITY
29 | bash run_command/phi-3-mini-4k-instruct/run.sh $SPARSITY $TASK
30 | ```
31 | Supported tasks: `cs` and `math`.
32 |
33 | Note that the results presented in the paper were obtained using an older environment setup.
34 | Specifically, we utilized torch version `2.1.2`, transformers version `4.39.1`, and NNCF with commit ID `544d5141`.
35 | The training was conducted on a single Tesla V100-SXM2-32GB GPU.
36 |
--------------------------------------------------------------------------------
/SQFT/legacy/run_command/llama-3-8b/sparse_quantization.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -x
3 | set -e
4 |
5 | SPARSITY=$1
6 |
7 | BASE_MODEL_PATH=meta-llama/Meta-Llama-3-8B
8 | SPARSE_BASE_MODEL_PATH=sqft-llama-3-8b-${SPARSITY}-base
9 | QUANT_BASE_MODEL_PATH=sqft-llama-3-8b-${SPARSITY}-base-gptq
10 |
11 | python wanda/main.py --model ${BASE_MODEL_PATH} --prune_method wanda --sparsity_ratio $(echo "scale=2; ${SPARSITY}/100" | bc) --sparsity_type unstructured --save wanda_out --save_model ${SPARSE_BASE_MODEL_PATH}
12 | python utils/quantization.py --base_model_path ${SPARSE_BASE_MODEL_PATH} --output_dir ${QUANT_BASE_MODEL_PATH}
13 |
--------------------------------------------------------------------------------
/SQFT/legacy/run_command/mistral-7b-v0.3/sparse_quantization.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -x
3 | set -e
4 |
5 | SPARSITY=$1
6 |
7 | BASE_MODEL_PATH=mistralai/Mistral-7B-v0.3
8 | SPARSE_BASE_MODEL_PATH=sqft-mistral-7b-v0.3-${SPARSITY}-base
9 | QUANT_BASE_MODEL_PATH=sqft-mistral-7b-v0.3-${SPARSITY}-base-gptq
10 |
11 | python wanda/main.py --model ${BASE_MODEL_PATH} --prune_method wanda --sparsity_ratio $(echo "scale=2; ${SPARSITY}/100" | bc) --sparsity_type unstructured --save wanda_out --save_model ${SPARSE_BASE_MODEL_PATH}
12 | python utils/quantization.py --base_model_path ${SPARSE_BASE_MODEL_PATH} --output_dir ${QUANT_BASE_MODEL_PATH}
13 |
--------------------------------------------------------------------------------
/SQFT/legacy/run_command/phi-3-mini-4k-instruct/sparse_quantization.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -x
3 | set -e
4 |
5 | SPARSITY=$1
6 |
7 | BASE_MODEL_PATH=microsoft/Phi-3-mini-4k-instruct
8 | SPARSE_BASE_MODEL_PATH=sqft-phi-3-mini-4k-${SPARSITY}-base
9 | QUANT_BASE_MODEL_PATH=sqft-phi-3-mini-4k-${SPARSITY}-base-gptq
10 |
11 | python wanda/main.py --model ${BASE_MODEL_PATH} --prune_method wanda --sparsity_ratio $(echo "scale=2; ${SPARSITY}/100" | bc) --sparsity_type unstructured --save wanda_out --save_model ${SPARSE_BASE_MODEL_PATH}
12 | python utils/quantization.py --base_model_path ${SPARSE_BASE_MODEL_PATH} --output_dir ${QUANT_BASE_MODEL_PATH}
13 |
--------------------------------------------------------------------------------
/SQFT/legacy/utils/quantization.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
3 |
4 | def main():
5 | parser = argparse.ArgumentParser(description="Quantize and save a model.")
6 | parser.add_argument("--base_model_path", type=str, required=True, help="Path to the base model.")
7 | parser.add_argument("--tokenizer_path", type=str, default=None, help="Path to the tokenizer. Defaults to base model path if not provided.")
8 | parser.add_argument("--dtype", type=str, default="float16", help="Data type for model weights.")
9 | parser.add_argument("--block_name_to_quantize", type=str, default=None, help="Specific block name to quantize.")
10 | parser.add_argument("--output_dir", type=str, required=True, help="Directory to save the quantized model and tokenizer.")
11 | args = parser.parse_args()
12 |
13 | base_model_path = args.base_model_path
14 | tokenizer_path = args.tokenizer_path
15 | dtype = args.dtype
16 | block_name_to_quantize = args.block_name_to_quantize
17 | output_dir = args.output_dir
18 |
19 | tokenizer = AutoTokenizer.from_pretrained(
20 | base_model_path if tokenizer_path is None else tokenizer_path,
21 | trust_remote_code=True
22 | )
23 | tokenizer.pad_token = tokenizer.eos_token
24 |
25 | if block_name_to_quantize is None:
26 | quantization_config = GPTQConfig(bits=4, dataset="c4", tokenizer=tokenizer, use_exllama=False)
27 | else:
28 | quantization_config = GPTQConfig(
29 | bits=4, dataset="c4", tokenizer=tokenizer, use_exllama=False, block_name_to_quantize=block_name_to_quantize
30 | )
31 |
32 | quantized_model = AutoModelForCausalLM.from_pretrained(
33 | base_model_path,
34 | device_map="auto",
35 | torch_dtype=dtype,
36 | trust_remote_code=True,
37 | quantization_config=quantization_config
38 | )
39 |
40 | quantized_model.config.quantization_config.use_exllama = False
41 | if block_name_to_quantize is not None:
42 | quantized_model.config.quantization_config.block_name_to_quantize = block_name_to_quantize
43 |
44 | quantized_model.save_pretrained(output_dir)
45 | tokenizer.save_pretrained(output_dir)
46 |
47 | # Uncomment the following lines to push the model and tokenizer to the hub
48 | # quantized_model.push_to_hub(output_dir, private=True)
49 | # tokenizer.push_to_hub(output_dir, private=True)
50 |
51 | if __name__ == "__main__":
52 | main()
53 |
--------------------------------------------------------------------------------
/SQFT/patches/wanda-8e8fc87.patch:
--------------------------------------------------------------------------------
1 | diff --git a/lib/data.py b/lib/data.py
2 | index b6842c4..ce2b55c 100644
3 | --- a/lib/data.py
4 | +++ b/lib/data.py
5 | @@ -40,8 +40,8 @@ def get_wikitext2(nsamples, seed, seqlen, tokenizer):
6 | # Load and process c4 dataset
7 | def get_c4(nsamples, seed, seqlen, tokenizer):
8 | # Load train and validation datasets
9 | - traindata = load_dataset('allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train')
10 | - valdata = load_dataset('allenai/c4', 'allenai--c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation')
11 | + traindata = load_dataset('allenai/c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train')
12 | + valdata = load_dataset('allenai/c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation')
13 |
14 | # Generate samples from training set
15 | random.seed(seed)
16 | diff --git a/lib/prune.py b/lib/prune.py
17 | index 01d981c..b772908 100644
18 | --- a/lib/prune.py
19 | +++ b/lib/prune.py
20 | @@ -141,7 +141,11 @@ def prune_wanda(args, model, tokenizer, device=torch.device("cuda:0"), prune_n=0
21 |
22 | if f"model.layers.{i}" in model.hf_device_map: ## handle the case for llama-30B and llama-65B, when the device map has multiple GPUs;
23 | dev = model.hf_device_map[f"model.layers.{i}"]
24 | - inps, outs, attention_mask, position_ids = inps.to(dev), outs.to(dev), attention_mask.to(dev), position_ids.to(dev)
25 | + inps, outs = inps.to(dev), outs.to(dev)
26 | + if attention_mask is not None:
27 | + attention_mask = attention_mask.to(dev)
28 | + if position_ids is not None:
29 | + position_ids = position_ids.to(dev)
30 |
31 | wrapped_layers = {}
32 | for name in subset:
33 | diff --git a/main.py b/main.py
34 | index a94583c..2d5cbec 100644
35 | --- a/main.py
36 | +++ b/main.py
37 | @@ -22,7 +22,20 @@ def get_llm(model_name, cache_dir="llm_weights"):
38 | device_map="auto"
39 | )
40 |
41 | - model.seqlen = model.config.max_position_embeddings
42 | + if not hasattr(model.config, 'max_position_embeddings'):
43 | + raise AttributeError(
44 | + "model.config does not have `max_position_embeddings`, please check the attribute name for the maximum length. "
45 | + "You may need to modify the code accordingly."
46 | + )
47 | + else:
48 | + if model.config.max_position_embeddings > 8192:
49 | + # such as mistralai/Mistral-7B-v0.3 ("max_position_embeddings": 32768)
50 | + model.seqlen = 8192
51 | + print(
52 | + "The maximum length supported by this model is large, setting the maximum length for calibration samples to 8192."
53 | + )
54 | + else:
55 | + model.seqlen = model.config.max_position_embeddings
56 | return model
57 |
58 | def main():
59 |
--------------------------------------------------------------------------------
/SQFT/utils/quantization.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
3 |
4 |
5 | def main():
6 | parser = argparse.ArgumentParser(description="Quantize and save a model.")
7 | parser.add_argument("--base_model_path", type=str, required=True, help="Path to the base model.")
8 | parser.add_argument("--tokenizer_path", type=str, default=None, help="Path to the tokenizer. Defaults to base model path if not provided.")
9 | parser.add_argument("--dtype", type=str, default="float16", help="Data type for model weights.")
10 | parser.add_argument("--block_name_to_quantize", type=str, default=None, help="Specific block name to quantize.")
11 | parser.add_argument("--output_dir", type=str, required=True, help="Directory to save the quantized model and tokenizer.")
12 | args = parser.parse_args()
13 |
14 | base_model_path = args.base_model_path
15 | tokenizer_path = args.tokenizer_path
16 | dtype = args.dtype
17 | block_name_to_quantize = args.block_name_to_quantize
18 | output_dir = args.output_dir
19 |
20 | tokenizer = AutoTokenizer.from_pretrained(
21 | base_model_path if tokenizer_path is None else tokenizer_path,
22 | trust_remote_code=True
23 | )
24 | tokenizer.pad_token = tokenizer.eos_token
25 |
26 | if block_name_to_quantize is None:
27 | quantization_config = GPTQConfig(bits=4, dataset="c4", tokenizer=tokenizer, use_exllama=False)
28 | else:
29 | quantization_config = GPTQConfig(
30 | bits=4, dataset="c4", tokenizer=tokenizer, use_exllama=False, block_name_to_quantize=block_name_to_quantize
31 | )
32 |
33 | quantized_model = AutoModelForCausalLM.from_pretrained(
34 | base_model_path,
35 | device_map="auto",
36 | torch_dtype=dtype,
37 | trust_remote_code=True,
38 | quantization_config=quantization_config
39 | )
40 |
41 | quantized_model.config.quantization_config.use_exllama = False
42 | if block_name_to_quantize is not None:
43 | quantized_model.config.quantization_config.block_name_to_quantize = block_name_to_quantize
44 |
45 | quantized_model.save_pretrained(output_dir)
46 | tokenizer.save_pretrained(output_dir)
47 |
48 | # Uncomment the following lines to push the model and tokenizer to the hub
49 | # quantized_model.push_to_hub(output_dir, private=True)
50 | # tokenizer.push_to_hub(output_dir, private=True)
51 |
52 | if __name__ == "__main__":
53 | main()
54 |
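55 | # Illustrative usage (an assumption; the path comes from sparse_quantization.sh
56 | # with SPARSITY=50): the saved GPTQ model can be reloaded with the standard
57 | # transformers API, provided AutoGPTQ and optimum are installed (see install.sh).
58 | #
59 | # from transformers import AutoModelForCausalLM, AutoTokenizer
60 | # model = AutoModelForCausalLM.from_pretrained(
61 | #     "sqft-llama-3-8b-50-base-gptq", device_map="auto", trust_remote_code=True
62 | # )
63 | # tokenizer = AutoTokenizer.from_pretrained("sqft-llama-3-8b-50-base-gptq")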
--------------------------------------------------------------------------------
/Shears/example_commonsense.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 |
4 | import torch
5 | from peft import PeftModel
6 | from transformers import AutoModelForCausalLM
7 | from transformers import AutoTokenizer
8 |
9 |
10 | def generate_prompt(instruction):
11 | return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
12 |
13 | ### Instruction:
14 | {instruction}
15 |
16 | ### Response:
17 | """
18 |
19 |
20 | def main():
21 | parser = argparse.ArgumentParser()
22 | parser.add_argument("--base_model_path", default="shears-llama-7b-50-base", type=str)
23 | parser.add_argument("--adapter_model_path", default="IntelLabs/shears-llama-7b-50-cs-heuristic-adapter", type=str)
24 | args = parser.parse_args()
25 | base_model_path = args.base_model_path
26 | adapter_model_path = args.adapter_model_path
27 |
28 | base_model = AutoModelForCausalLM.from_pretrained(
29 | base_model_path,
30 | torch_dtype=torch.float16,
31 | device_map="auto",
32 | trust_remote_code=True
33 | )
34 | model = PeftModel.from_pretrained(base_model, adapter_model_path, torch_dtype=torch.float16, device_map="auto")
35 | model.eval()
36 | tokenizer = AutoTokenizer.from_pretrained(base_model_path)
37 |
38 | non_zero_params = sum([(param.data != 0).sum().item() for _, param in model.named_parameters()])
39 | print(f"Number of all non-zero parameters: {non_zero_params}")
40 |
41 | instructions = [
42 | "Please choose the correct answer to the question: A cactus stem is used to store\n\nAnswer1: fruit "
43 | "Answer2: liquid Answer3: food Answer4: spines\n\nAnswer format: answer1/answer2/answer3/answer4",
44 |
45 | "Please choose the correct solution to the question: Prevent bottles from rolling in fridge.\n\n"
46 | "Solution1: Put binder clip on fridge shelves to prevent sliding.\n\nSolution2: Put staple remover on "
47 | "fridge shelves to prevent sliding.\n\nAnswer format: solution1/solution2",
48 |
49 | "Please choose the correct answer to the question: Which characteristic describes the texture of a "
50 | "kitten's fur?\n\nAnswer1: gray Answer2: warm Answer3: long Answer4: soft\n\nAnswer format: answer1/"
51 | "answer2/answer3/answer4",
52 | ]
53 |
54 | for idx, instruction in enumerate(instructions):
55 | print(f"Example {idx}:")
56 | prompt = generate_prompt(instruction)
57 | inputs = tokenizer(prompt, return_tensors="pt")
58 | input_ids = inputs["input_ids"].to(model.device)
59 | with torch.no_grad():
60 | generation_output = model.generate(
61 | input_ids=input_ids,
62 | return_dict_in_generate=True,
63 | output_scores=True,
64 | max_new_tokens=256,
65 | use_cache=True,
66 | num_beams=4,
67 | )
68 | s = generation_output.sequences[0]
69 | output = tokenizer.decode(s)
70 | print(output)
71 |
72 |
73 | if __name__ == "__main__":
74 | main()
75 |
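76 | # Illustrative invocation (the defaults above point to the released Shears
77 | # artifacts, so both flags are optional):
78 | #
79 | # python example_commonsense.py \
80 | #     --base_model_path shears-llama-7b-50-base \
81 | #     --adapter_model_path IntelLabs/shears-llama-7b-50-cs-heuristic-adapter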
--------------------------------------------------------------------------------
/Shears/example_math.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 |
4 | import torch
5 | from peft import PeftModel
6 | from transformers import AutoModelForCausalLM
7 | from transformers import AutoTokenizer
8 |
9 |
10 | def generate_prompt(instruction):
11 | return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
12 |
13 | ### Instruction:
14 | {instruction}
15 |
16 | ### Response:
17 | """
18 |
19 |
20 | def main():
21 | parser = argparse.ArgumentParser()
22 | parser.add_argument("--base_model_path", default="IntelLabs/shears-mpt-7b-50-base", type=str)
23 | parser.add_argument("--adapter_model_path", default="IntelLabs/shears-mpt-7b-50-gsm8k-heuristic-adapter", type=str)
24 | args = parser.parse_args()
25 | base_model_path = args.base_model_path
26 | adapter_model_path = args.adapter_model_path
27 |
28 | base_model = AutoModelForCausalLM.from_pretrained(
29 | base_model_path,
30 | torch_dtype=torch.float16,
31 | device_map="auto",
32 | trust_remote_code=True
33 | )
34 | model = PeftModel.from_pretrained(base_model, adapter_model_path, torch_dtype=torch.float16, device_map="auto")
35 | model.eval()
36 | tokenizer = AutoTokenizer.from_pretrained(base_model_path)
37 |
38 | non_zero_params = sum([(param.data != 0).sum().item() for _, param in model.named_parameters()])
39 | print(f"Number of all non-zero parameters: {non_zero_params}")
40 |
41 | instructions = [
42 | "Jack had $100. Sophia gave him 1/5 of her $100. How many dollars does Jack have now?",
43 | "Edgar eats 18 pretzels a day. If his brother eats 1/2 as many, how many does his brother eat in a week?",
44 | "Trent is 5 years older than Jane, and Jane is 3 years younger than Quinn. If Quinn is 30, how old is Trent?",
45 | ]
46 |
47 | for idx, instruction in enumerate(instructions):
48 | print(f"Example {idx}:")
49 | prompt = generate_prompt(instruction)
50 | inputs = tokenizer(prompt, return_tensors="pt")
51 | input_ids = inputs["input_ids"].to(model.device)
52 | with torch.no_grad():
53 | generation_output = model.generate(
54 | input_ids=input_ids,
55 | return_dict_in_generate=True,
56 | output_scores=True,
57 | max_new_tokens=256,
58 | use_cache=True,
59 | num_beams=4,
60 | )
61 | s = generation_output.sequences[0]
62 | output = tokenizer.decode(s)
63 | print(output)
64 |
65 |
66 | if __name__ == "__main__":
67 | main()
68 |
--------------------------------------------------------------------------------
/Shears/install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | set -x
4 |
5 | SHEARS_PATH=$PWD
6 | mkdir third_party && cd third_party
7 |
8 | # transformers
9 | git clone https://github.com/huggingface/transformers.git
10 | cd transformers && git checkout v4.31.0 && git apply --ignore-space-change --ignore-whitespace $SHEARS_PATH/patches/transformers-v4.31.0.patch && pip install -e . && cd ..
11 |
12 | # peft
13 | git clone https://github.com/huggingface/peft.git
14 | cd peft && git checkout v0.5.0 && git apply --ignore-space-change --ignore-whitespace $SHEARS_PATH/patches/peft-v0.5.0.patch && git apply --ignore-space-change --ignore-whitespace $SHEARS_PATH/patches/peft-v0.5.0-inference.patch && pip install -e . && cd ..
15 |
16 | # nncf
17 | git clone https://github.com/openvinotoolkit/nncf.git
18 | cd nncf && git checkout 544d5141 && git apply --ignore-space-change --ignore-whitespace $SHEARS_PATH/patches/nncf-544d5141.patch && pip install -e . && cd ..
19 |
20 | # others
21 | pip install datasets accelerate sentencepiece protobuf
22 |
--------------------------------------------------------------------------------
/Shears/nncf_config/nncf_config.md:
--------------------------------------------------------------------------------
1 | ## NNCF Config of Shears
2 |
3 | To enable the elastic adapter in NLS training, we employ the [BootstrapNAS](https://github.com/openvinotoolkit/nncf/tree/develop/nncf/experimental/torch/nas/bootstrapNAS) feature of [OpenVINO™ NNCF](https://github.com/openvinotoolkit/nncf), which offers a range of compression algorithms tailored to optimizing neural networks.
4 | Below is a walkthrough of the NNCF configuration for Shears, intended to resolve any doubts about the config and to clarify which parts are relevant to Shears.
5 |
6 | Some explanations:
7 |
8 | - `input_info` is used to create the NNCF network in Shears.
9 | - Shears employs the `progressive_shrinking` algorithm of BootstrapNAS; details can be found in [BootstrapNAS.md](https://github.com/openvinotoolkit/nncf/blob/develop/nncf/experimental/torch/nas/bootstrapNAS/BootstrapNAS.md).
10 | In fact, Shears adopts only the simplest form of `progressive_shrinking`, without its more intricate and advanced strategies such as multi-stage training.
11 | We will explore more complex training strategies of `progressive_shrinking` in the future.
12 | - `frozen_layers_allowed` should be set to `true`, because Shears freezes the base model.
13 | - `width` refers to the hidden size of the weight matrix; more precisely, it represents the low-rank size of the LoRA adapter in Shears.
14 | - `num_bn_adaptation_samples` should be set to 0 (the default is 2000), as we do not need batch norm adaptation.
15 |
16 | ### Elastic low-rank
17 |
18 | In the Shears solution, the design of the low-rank search space is crucial, including the allocation of dependency groups and the design of the search space for each group.
19 | In our existing configurations, such as those for the LLaMA model, we adopt the grouping `[[Q, K, V], [Up], [Down]]` for each LLaMA layer, with each group's search space being `[32, 24, 16]`, i.e.,
20 |
21 | - `[Q, K, V]`: `[32, 24, 16]`
22 | - `[Up]`: `[32, 24, 16]`
23 | - `[Down]`: `[32, 24, 16]`
24 |
25 | ```json
26 | "width": {
27 | "overwrite_groups": [
28 | [
29 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0",
30 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0",
31 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0"
32 | ],
33 | [
34 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0"
35 | ],
36 | [
37 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0"
38 | ]
39 | ],
40 | "overwrite_groups_widths": [
41 | [32, 24, 16], [32, 24, 16], [32, 24, 16]
42 | ]
43 | }
44 | ```
45 |
46 | Note that the number of groups must equal the number of group-width lists, and we only set the output hidden
47 | size space of LoRA-A in the config, as the input hidden size of LoRA-B is automatically pruned to match LoRA-A.
48 | Feel free to try your own group design and search spaces.
49 |
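50 | ### Expanding `{re}` group patterns
51 |
52 | For illustration, here is a minimal sketch (mirroring the preprocessing in `utils/utils.py`; `num_layers` stands in for the model's number of decoder layers) of how a `{re}`-prefixed group expands into one concrete group per layer, with `{*}` replaced by the layer index:
53 |
54 | ```python
55 | def expand_group(group, num_layers):
56 |     # Each "{re}" group yields num_layers concrete groups; "{*}" becomes the layer index.
57 |     return [
58 |         [name.replace("{re}", "").replace("{*}", str(i)) for name in group]
59 |         for i in range(num_layers)
60 |     ]
61 | ```
62 |
63 | All modules in one expanded group share the same activated width, so the Q, K, and V adapters of a given layer are always pruned to the same low-rank value.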
--------------------------------------------------------------------------------
/Shears/nncf_config/nncf_shears_llama.json:
--------------------------------------------------------------------------------
1 | {
2 | "input_info": [
3 | {
4 | "sample_size": [1, 256],
5 | "type": "long",
6 | "keyword": "input_ids"
7 | },
8 | {
9 | "sample_size": [1, 256],
10 | "type": "long",
11 | "keyword": "attention_mask"
12 | }
13 | ],
14 | "bootstrapNAS": {
15 | "training": {
16 | "algorithm": "progressive_shrinking",
17 | "frozen_layers_allowed": true,
18 | "progressivity_of_elasticity": ["width"],
19 | "batchnorm_adaptation": {
20 | "num_bn_adaptation_samples": 0
21 | },
22 | "schedule": {
23 | "list_stage_descriptions": [
24 | {"train_dims": ["width"], "epochs": -1, "depth_indicator": 1, "width_indicator": 5, "init_lr": -1, "epochs_lr": -1, "sample_rate": 1}
25 | ]
26 | },
27 | "elasticity": {
28 | "available_elasticity_dims": ["width"],
29 | "width": {
30 | "overwrite_groups": [
31 | [
32 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0",
33 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0",
34 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0"
35 | ],
36 | [
37 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0"
38 | ],
39 | [
40 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0"
41 | ]
42 | ],
43 | "overwrite_groups_widths": [
44 | [32, 24, 16], [32, 24, 16], [32, 24, 16]
45 | ]
46 | }
47 | }
48 | }
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/Shears/nncf_config/nncf_shears_llama_with_gate_proj.json:
--------------------------------------------------------------------------------
1 | {
2 | "input_info": [
3 | {
4 | "sample_size": [1, 256],
5 | "type": "long",
6 | "keyword": "input_ids"
7 | },
8 | {
9 | "sample_size": [1, 256],
10 | "type": "long",
11 | "keyword": "attention_mask"
12 | }
13 | ],
14 | "bootstrapNAS": {
15 | "training": {
16 | "algorithm": "progressive_shrinking",
17 | "frozen_layers_allowed": true,
18 | "progressivity_of_elasticity": ["width"],
19 | "batchnorm_adaptation": {
20 | "num_bn_adaptation_samples": 0
21 | },
22 | "schedule": {
23 | "list_stage_descriptions": [
24 | {"train_dims": ["width"], "epochs": -1, "depth_indicator": 1, "width_indicator": 5, "init_lr": -1, "epochs_lr": -1, "sample_rate": 1}
25 | ]
26 | },
27 | "elasticity": {
28 | "available_elasticity_dims": ["width"],
29 | "width": {
30 | "overwrite_groups": [
31 | [
32 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0",
33 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0",
34 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0"
35 | ],
36 | [
37 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0",
38 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaMLP[mlp]/Linear[gate_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0"
39 | ],
40 | [
41 | "{re}PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[{*}]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0"
42 | ]
43 | ],
44 | "overwrite_groups_widths": [
45 | [32, 24, 16], [32, 24, 16], [32, 24, 16]
46 | ]
47 | }
48 | }
49 | }
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/Shears/nncf_config/nncf_shears_mpt.json:
--------------------------------------------------------------------------------
1 | {
2 | "input_info": [
3 | {
4 | "sample_size": [1, 256],
5 | "type": "long",
6 | "keyword": "input_ids"
7 | },
8 | {
9 | "sample_size": [1, 256],
10 | "type": "long",
11 | "keyword": "attention_mask"
12 | }
13 | ],
14 | "bootstrapNAS": {
15 | "training": {
16 | "algorithm": "progressive_shrinking",
17 | "frozen_layers_allowed": true,
18 | "progressivity_of_elasticity": ["width"],
19 | "batchnorm_adaptation": {
20 | "num_bn_adaptation_samples": 0
21 | },
22 | "schedule": {
23 | "list_stage_descriptions": [
24 | {"train_dims": ["width"], "epochs": -1, "depth_indicator": 1, "width_indicator": 5, "init_lr": -1, "epochs_lr": -1, "sample_rate": 1}
25 | ]
26 | },
27 | "elasticity": {
28 | "available_elasticity_dims": ["width"],
29 | "width": {
30 | "overwrite_groups": [
31 | [
32 | "{re}PeftModelForCausalLM/LoraModel[base_model]/MPTForCausalLM[model]/MPTModel[transformer]/ModuleList[blocks]/MPTBlock[{*}]/MultiheadAttention[attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0",
33 | "{re}PeftModelForCausalLM/LoraModel[base_model]/MPTForCausalLM[model]/MPTModel[transformer]/ModuleList[blocks]/MPTBlock[{*}]/MultiheadAttention[attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0",
34 | "{re}PeftModelForCausalLM/LoraModel[base_model]/MPTForCausalLM[model]/MPTModel[transformer]/ModuleList[blocks]/MPTBlock[{*}]/MultiheadAttention[attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0",
35 | "{re}PeftModelForCausalLM/LoraModel[base_model]/MPTForCausalLM[model]/MPTModel[transformer]/ModuleList[blocks]/MPTBlock[{*}]/MultiheadAttention[attn]/Linear[out_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0"
36 | ],
37 | [
38 | "{re}PeftModelForCausalLM/LoraModel[base_model]/MPTForCausalLM[model]/MPTModel[transformer]/ModuleList[blocks]/MPTBlock[{*}]/MPTMLP[ffn]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0",
39 | "{re}PeftModelForCausalLM/LoraModel[base_model]/MPTForCausalLM[model]/MPTModel[transformer]/ModuleList[blocks]/MPTBlock[{*}]/MPTMLP[ffn]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0"
40 | ]
41 | ],
42 | "overwrite_groups_widths": [
43 | [32, 24, 16], [32, 24, 16]
44 | ]
45 | }
46 | }
47 | }
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/Shears/patches/nncf-544d5141.patch:
--------------------------------------------------------------------------------
1 | diff --git a/nncf/experimental/torch/nas/bootstrapNAS/elasticity/elasticity_builder.py b/nncf/experimental/torch/nas/bootstrapNAS/elasticity/elasticity_builder.py
2 | index bc6464b2..9f7a2f3d 100644
3 | --- a/nncf/experimental/torch/nas/bootstrapNAS/elasticity/elasticity_builder.py
4 | +++ b/nncf/experimental/torch/nas/bootstrapNAS/elasticity/elasticity_builder.py
5 | @@ -9,7 +9,7 @@
6 | # See the License for the specific language governing permissions and
7 | # limitations under the License.
8 | from collections import OrderedDict
9 | -from typing import Any, Dict, List
10 | +from typing import Any, Dict, List, Tuple
11 |
12 | from nncf import NNCFConfig
13 | from nncf.experimental.torch.nas.bootstrapNAS.elasticity.base_handler import SingleElasticityBuilder
14 | @@ -152,3 +152,8 @@ class ElasticityBuilder(PTCompressionAlgorithmBuilder):
15 |
16 | # No conflict resolving with the related config options, parameters are overridden by compression state
17 | self._available_elasticity_dims = list(map(ElasticityDim, available_elasticity_dims_state))
18 | +
19 | + def _are_frozen_layers_allowed(self) -> Tuple[bool, str]:
20 | + if self.config.get("bootstrapNAS", {}).get("training", {}).get("frozen_layers_allowed", False):
21 | + return True, "Frozen layers are allowed (set in NNCF config)"
22 | + return super()._are_frozen_layers_allowed()
23 | diff --git a/nncf/experimental/torch/nas/bootstrapNAS/training/progressive_shrinking_builder.py b/nncf/experimental/torch/nas/bootstrapNAS/training/progressive_shrinking_builder.py
24 | index 92609327..fefdc2f6 100644
25 | --- a/nncf/experimental/torch/nas/bootstrapNAS/training/progressive_shrinking_builder.py
26 | +++ b/nncf/experimental/torch/nas/bootstrapNAS/training/progressive_shrinking_builder.py
27 | @@ -8,7 +8,7 @@
28 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
29 | # See the License for the specific language governing permissions and
30 | # limitations under the License.
31 | -from typing import Any, Dict, List
32 | +from typing import Any, Dict, List, Tuple
33 |
34 | from nncf import NNCFConfig
35 | from nncf.common.initialization.batchnorm_adaptation import BatchnormAdaptationAlgorithm
36 | @@ -152,3 +152,8 @@ class ProgressiveShrinkingBuilder(PTCompressionAlgorithmBuilder):
37 | self._bn_adapt_params = state_without_name[self._state_names.BN_ADAPTATION_PARAMS]
38 | bn_adapt_algo_kwargs = get_bn_adapt_algo_kwargs(self.config, self._bn_adapt_params)
39 | self._bn_adaptation = BatchnormAdaptationAlgorithm(**bn_adapt_algo_kwargs) if bn_adapt_algo_kwargs else None
40 | +
41 | + def _are_frozen_layers_allowed(self) -> Tuple[bool, str]:
42 | + if self._algo_config.get("frozen_layers_allowed", False):
43 | + return True, "Frozen layers are allowed (set in NNCF config)"
44 | + return super()._are_frozen_layers_allowed()
45 |
--------------------------------------------------------------------------------
/Shears/patches/peft-v0.5.0-inference.patch:
--------------------------------------------------------------------------------
1 | diff --git a/src/peft/utils/save_and_load.py b/src/peft/utils/save_and_load.py
2 | index 617287e..cb64cf7 100644
3 | --- a/src/peft/utils/save_and_load.py
4 | +++ b/src/peft/utils/save_and_load.py
5 | @@ -132,6 +132,33 @@ def set_peft_model_state_dict(model, peft_model_state_dict, adapter_name="defaul
6 | else:
7 | raise NotImplementedError
8 |
9 | + def module_reshape(state_dict):
10 | + for param_name, param in state_dict.items():
11 | + tensor_name = param_name
12 | + splits = tensor_name.split(".")
13 | + if len(splits) > 1:
14 | + module = model
15 | + parent = None
16 | + for split in splits[:-1]:
17 | + new_module = getattr(module, split)
18 | + if new_module is None:
19 | + raise ValueError(f"{module} has no attribute {split}.")
20 | + parent = module
21 | + module = new_module
22 | + tensor_name = splits[-1]
23 | + old_value = getattr(module, tensor_name)
24 | + if old_value.shape != param.shape and isinstance(module, torch.nn.Linear):
25 | + new_module = torch.nn.Linear(
26 | + param.shape[1],
27 | + param.shape[0],
28 | + bias=module.bias is not None,
29 | + dtype=module.weight.dtype,
30 | + device=module.weight.device
31 | + )
32 | + setattr(parent, splits[-2], new_module)
33 | +
34 | + module_reshape(peft_model_state_dict)
35 | +
36 | load_result = model.load_state_dict(peft_model_state_dict, strict=False)
37 | if config.is_prompt_learning:
38 | model.prompt_encoder[adapter_name].embedding.load_state_dict(
39 |
--------------------------------------------------------------------------------
/Shears/preprocess/mpt_process/mpt-7b-modifications-for-shears-usage.patch:
--------------------------------------------------------------------------------
1 | diff --git a/attention.py b/attention.py
2 | index 5cc3be7..6c6e308 100644
3 | --- a/attention.py
4 | +++ b/attention.py
5 | @@ -242,9 +242,15 @@ class GroupedQueryAttention(nn.Module):
6 | fc_kwargs: dict[str, Any] = {'bias': bias}
7 | if fc_type != 'te':
8 | fc_kwargs['device'] = device
9 | - self.Wqkv = FC_CLASS_REGISTRY[fc_type](self.d_model, self.d_model + 2 * self.kv_n_heads * self.head_dim, **fc_kwargs)
10 | - fuse_splits = [i * self.head_dim for i in range(1, self.n_heads + 2 * self.kv_n_heads)]
11 | - self.Wqkv._fused = (0, fuse_splits)
12 | +
13 | + # Separating QKV brings more flexibility for pruning.
14 | + # self.Wqkv = FC_CLASS_REGISTRY[fc_type](self.d_model, self.d_model + 2 * self.kv_n_heads * self.head_dim, **fc_kwargs)
15 | + # fuse_splits = [i * self.head_dim for i in range(1, self.n_heads + 2 * self.kv_n_heads)]
16 | + # self.Wqkv._fused = (0, fuse_splits)
17 | + self.q_proj = FC_CLASS_REGISTRY[fc_type](self.d_model, self.d_model, **fc_kwargs)
18 | + self.k_proj = FC_CLASS_REGISTRY[fc_type](self.d_model, self.kv_n_heads * self.head_dim, **fc_kwargs)
19 | + self.v_proj = FC_CLASS_REGISTRY[fc_type](self.d_model, self.kv_n_heads * self.head_dim, **fc_kwargs)
20 | +
21 | if self.qk_ln:
22 | norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
23 | self.q_ln = norm_class(self.d_model, device=device)
24 | @@ -261,10 +267,16 @@ class GroupedQueryAttention(nn.Module):
25 | self.out_proj._is_residual = True
26 |
27 | def forward(self, x: torch.Tensor, past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]]=None, attn_bias: Optional[torch.Tensor]=None, attention_mask: Optional[torch.Tensor]=None, is_causal: bool=True, needs_weights: bool=False) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor, torch.Tensor]]]:
28 | - qkv = self.Wqkv(x)
29 | + # qkv = self.Wqkv(x)
30 | + # if self.clip_qkv:
31 | + # qkv = qkv.clamp(min=-self.clip_qkv, max=self.clip_qkv)
32 | + # (query, key, value) = qkv.split([self.d_model, self.kv_n_heads * self.head_dim, self.kv_n_heads * self.head_dim], dim=2)
33 | + query, key, value = self.q_proj(x), self.k_proj(x), self.v_proj(x)
34 | if self.clip_qkv:
35 | - qkv = qkv.clamp(min=-self.clip_qkv, max=self.clip_qkv)
36 | - (query, key, value) = qkv.split([self.d_model, self.kv_n_heads * self.head_dim, self.kv_n_heads * self.head_dim], dim=2)
37 | + query = query.clamp(min=-self.clip_qkv, max=self.clip_qkv)
38 | + key = key.clamp(min=-self.clip_qkv, max=self.clip_qkv)
39 | + value = value.clamp(min=-self.clip_qkv, max=self.clip_qkv)
40 | +
41 | key_padding_mask = attention_mask
42 | if self.qk_ln:
43 | dtype = query.dtype
44 |
--------------------------------------------------------------------------------
/Shears/preprocess/mpt_process/split_qkv_preprocess.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 |
4 | import torch
5 |
6 | Q_PROJ_RANGE = (0, 4096)      # rows of the fused Wqkv weight holding the Q projection (d_model = 4096)
7 | K_PROJ_RANGE = (4096, 8192)   # rows holding the K projection
8 | V_PROJ_RANGE = (8192, 12288)  # rows holding the V projection
9 |
10 |
11 | def main():
12 | parser = argparse.ArgumentParser()
13 |
14 | parser.add_argument("--base_model_name_or_path", default="mpt-7b", type=str, help="Path to mpt-7b model")
15 |
16 | args = parser.parse_args()
17 | base_model_name_or_path = args.base_model_name_or_path
18 | paths = [
19 | f"{base_model_name_or_path}/pytorch_model-00001-of-00002.bin",
20 | f"{base_model_name_or_path}/pytorch_model-00002-of-00002.bin",
21 | ]
22 | new_state_dict = {}
23 |
24 | for path in paths:
25 | old_state_dict = torch.load(path)
26 | keys = list(old_state_dict.keys())
27 | for key in keys:
28 | if key.endswith("Wqkv.weight"):
29 | prefix = ".".join(key.split(".")[:-2])
30 | new_state_dict[prefix + ".q_proj.weight"] = old_state_dict[key][
31 | Q_PROJ_RANGE[0] : Q_PROJ_RANGE[1]
32 | ].clone()
33 | new_state_dict[prefix + ".k_proj.weight"] = old_state_dict[key][
34 | K_PROJ_RANGE[0] : K_PROJ_RANGE[1]
35 | ].clone()
36 | new_state_dict[prefix + ".v_proj.weight"] = old_state_dict[key][
37 | V_PROJ_RANGE[0] : V_PROJ_RANGE[1]
38 | ].clone()
39 | else:
40 | new_state_dict[key] = old_state_dict[key].clone()
41 |
42 | os.system(f"rm {base_model_name_or_path}/pytorch_model*")  # remove the original sharded checkpoints before saving the merged one
43 | torch.save(new_state_dict, f"{base_model_name_or_path}/pytorch_model.bin")
44 |
45 |
46 | if __name__ == "__main__":
47 | main()
48 |
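49 | # A quick sanity check after running this script (a sketch; the key name assumes
50 | # MPT-7B's "transformer.blocks" layout with d_model = 4096):
51 | #
52 | # sd = torch.load("mpt-7b/pytorch_model.bin")
53 | # assert sd["transformer.blocks.0.attn.q_proj.weight"].shape == (4096, 4096)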
--------------------------------------------------------------------------------
/Shears/utils/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Some NNCF config preprocessing code for Shears.
3 |
4 | This module provides preprocessing functionality for NNCF (Neural Network Compression Framework) configuration files
5 | used in Shears. It includes utility functions for handling JSON files and preprocessing NNCF configurations.
6 | """
7 | import json
8 | from pathlib import Path
9 | from nncf import NNCFConfig
10 | from nncf.common.utils.os import safe_open
11 |
12 | def parse_nncf_config(nncf_config_path, num=1):
13 |
14 | with safe_open(Path(nncf_config_path)) as f:
15 | loaded_json = json.load(f)
16 |
17 | base_overwrite_groups = loaded_json["bootstrapNAS"]["training"]["elasticity"]["width"]["overwrite_groups"]
18 | base_overwrite_groups_widths = loaded_json["bootstrapNAS"]["training"]["elasticity"]["width"][
19 | "overwrite_groups_widths"]
20 | overwrite_groups, overwrite_groups_widths = [], []
21 | for group, width in zip(base_overwrite_groups, base_overwrite_groups_widths):
22 | if group[0].startswith("{re}"):
23 | new_group = [[item.replace("{re}", "").replace("{*}", str(i)) for item in group] for i in range(num)]
24 | new_width = [width for _ in range(num)]
25 | else:
26 | new_group = [group]
27 | new_width = [width]
28 | overwrite_groups.extend(new_group)
29 | overwrite_groups_widths.extend(new_width)
30 |
31 | loaded_json["bootstrapNAS"]["training"]["elasticity"]["width"]["overwrite_groups"] = overwrite_groups
32 | loaded_json["bootstrapNAS"]["training"]["elasticity"]["width"][
33 | "overwrite_groups_widths"] = overwrite_groups_widths
34 | return loaded_json
35 |
36 | def add_lr_epochs(nncf_config, lr=3e-4, epochs=3):
37 | stage_desc = nncf_config["bootstrapNAS"]["training"]["schedule"]["list_stage_descriptions"][0]
38 | if stage_desc["init_lr"] == -1:
39 | stage_desc["init_lr"] = lr
40 | if stage_desc["epochs"] == -1:
41 | stage_desc["epochs"] = epochs
42 | stage_desc["epochs_lr"] = epochs
43 |
44 | return nncf_config
45 |
46 | def load_nncf_config(nncf_config, lr=3e-4, epochs=3, num_hidden_layers=32):
47 | loaded_json = parse_nncf_config(nncf_config, num=num_hidden_layers)
48 | loaded_json = add_lr_epochs(loaded_json, lr=lr, epochs=epochs)
49 | nncf_config = NNCFConfig.from_dict(loaded_json)
50 | return nncf_config
51 |
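52 | # Example usage (a sketch; the config path and layer count are illustrative --
53 | # LLaMA-7B has 32 hidden layers):
54 | #
55 | # nncf_config = load_nncf_config(
56 | #     "nncf_config/nncf_shears_llama.json", lr=3e-4, epochs=3, num_hidden_layers=32
57 | # )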
--------------------------------------------------------------------------------
/SparAMX/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 | docs/_output/
74 |
75 | # Jupyter Notebook
76 | .ipynb_checkpoints
77 |
78 | # IPython
79 | profile_default/
80 | ipython_config.py
81 |
82 | # pyenv
83 | .python-version
84 |
85 | # celery beat schedule file
86 | celerybeat-schedule.*
87 |
88 | # SageMath parsed files
89 | *.sage.py
90 |
91 | # Environments
92 | .env
93 | .venv
94 | env/
95 | venv/
96 | ENV/
97 | env.bak/
98 | venv.bak/
99 |
100 | # Spyder project settings
101 | .spyderproject
102 | .spyproject
103 |
104 | # Rope project settings
105 | .ropeproject
106 |
107 | # mkdocs documentation
108 | /site
109 |
110 | # mypy
111 | .mypy_cache/
112 | .dmypy.json
113 | dmypy.json
114 |
115 | # Pyre type checker
116 | .pyre/
117 |
118 | # pytype static type analyzer
119 | .pytype/
120 |
121 | # Cython debug symbols
122 | cython_debug/
123 |
124 | # PyTorch
125 | *.pt
126 | *.pth
127 | *.onnx
128 |
129 | # PyCharm
130 | .idea/
131 |
132 | # VS Code
133 | .vscode/
134 |
135 | # Ignore data files
136 | data/
137 | datasets/
138 |
139 | # Ignore generated files
140 | logs/
141 | results/
142 | checkpoints/
152 |
--------------------------------------------------------------------------------
/SparAMX/README.md:
--------------------------------------------------------------------------------
1 | # SparAMX: Accelerating Compressed LLMs Token Generation on AMX-powered CPUs
2 |
3 | Official implementation of SparAMX: Accelerating Compressed LLMs Token Generation on AMX-powered CPUs.
4 |
5 | This repo contains the code for **SparAMX**, a set of open-source customized sparse kernels that can speed up any PyTorch model by automatically replacing all linear layers with our customized layer (see the illustrative sketch at the end of this README). Furthermore, we demonstrate for the first time the use of unstructured sparsity in attention computation, achieving a **1.14×** speedup over current systems without compromising accuracy.
6 |
7 | | Stock PyTorch | SparAMX |
8 | |:-----------:|:-----------:|
9 | | ![Stock PyTorch](Videos/stock.gif) | ![SparAMX](Videos/sparamx.gif) |
10 |
11 | # torch-custom-linear
12 | Custom implementation of linear layers through a PyTorch C++ extension.
13 |
14 | ### Dependencies
15 | ```pip install -r requirements.txt```
16 |
17 | ### Build & Install Custom Kernel
18 | ```
19 | python setup.py install
20 | ```
21 |
22 | ### Run Experiments Example
23 |
24 | Please make sure you're logged in to Hugging Face through the CLI if you'll be using a private model.
25 |
26 | You need to define the experiments you want to run in `generate_experiments.py`, then run
27 | ```
28 | python generate_experiments.py
29 | ```
30 |
31 | A file `experiments.csv` is generated. Modify it if needed. After that, run
32 | ```
33 | ./run_experiments.sh
34 | ```
35 |
36 | Your results will be saved inside the folder `experiment_results/YYYY-MM-DD_HH-MM-SS`.
37 |
38 | ## Citation
39 | If you find our SparAMX code and paper helpful, please kindly cite:
40 | ```bibtex
41 | @misc{abouelhamayed2025sparamxacceleratingcompressedllms,
42 | title={SparAMX: Accelerating Compressed LLMs Token Generation on AMX-powered CPUs},
43 | author={Ahmed F. AbouElhamayed and Jordan Dotzel and Yash Akhauri and Chi-Chih Chang and Sameh Gobriel and J. Pablo Muñoz and Vui Seng Chua and Nilesh Jain and Mohamed S. Abdelfattah},
44 | year={2025},
45 | eprint={2502.12444},
46 | archivePrefix={arXiv},
47 | primaryClass={cs.LG},
48 | url={https://arxiv.org/abs/2502.12444},
49 | }
50 | ```
51 |
52 |
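53 | ## Layer Replacement Sketch
54 |
55 | A minimal, illustrative sketch of the layer-replacement idea described at the top of this README, assuming the kernels are built and `layer.dense_linear` is importable from the repo root (the helper below is an assumption for illustration, not part of the shipped API):
56 |
57 | ```python
58 | from torch import nn
59 | from layer.dense_linear import DenseLinear
60 |
61 | def replace_linears(module: nn.Module) -> None:
62 |     # Recursively swap every nn.Linear for the custom AMX-backed layer.
63 |     for name, child in module.named_children():
64 |         if isinstance(child, nn.Linear):
65 |             setattr(module, name, DenseLinear.from_linear(child))
66 |         else:
67 |             replace_linears(child)
68 | ```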
--------------------------------------------------------------------------------
/SparAMX/Videos/sparamx.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/7549413d38677dd6eb92f918f7cc003dc65d1deb/SparAMX/Videos/sparamx.gif
--------------------------------------------------------------------------------
/SparAMX/Videos/stock.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/7549413d38677dd6eb92f918f7cc003dc65d1deb/SparAMX/Videos/stock.gif
--------------------------------------------------------------------------------
/SparAMX/benchmark_deepsparse.sh:
--------------------------------------------------------------------------------
1 | numactl --cpunodebind 0 --membind 0 --physcpubind=0-31 deepsparse.benchmark hf:neuralmagic/Llama2-7b-chat-pruned50-quant-ds -x benchmark -b 1
2 |
--------------------------------------------------------------------------------
/SparAMX/compare_stock_vs_custom_linear.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | import time
4 |
5 | from layer.dense_linear import DenseLinear
6 | from statistics import median
7 |
8 | torch.set_num_threads(1)
9 | d=4096
10 | o=1024
11 |
12 | one_layer_net_torch_stock = nn.Linear(in_features=d, out_features=o, bias=False)
13 |
14 | # Make sure all weights are representable in bfloat16.
15 | one_layer_net_torch_stock.weight.data = one_layer_net_torch_stock.weight.data.to(torch.bfloat16).to(torch.float)
16 | # one_layer_net_torch_stock.weight.data = torch.tensor([[1.,1.,2,1], [5.,1,1,1]])
17 | print("\n[Info]: Creating one layer neural network with nn.Linear")
18 | print(f"one_layer_net_torch_stock:\n{one_layer_net_torch_stock}")
19 |
20 | one_layer_net_torch_extcpp = DenseLinear.from_linear(one_layer_net_torch_stock)
21 | print("\n[Info]: Creating one layer neural network with DenseLinear")
22 | print(f"one_layer_net_torch_extcpp:\n{one_layer_net_torch_extcpp}")
23 |
24 | N=1
25 | x = torch.randn(N, d, dtype=torch.bfloat16).to(torch.float)
26 | x_bf = x.to(torch.bfloat16)
27 | # x = torch.tensor([[1., 1, 1, 1]])
28 | print(f"\n[Info]: Creating test input x ({N}, {d})")
29 | print(f"x: {x.shape}\n{x}")
30 |
31 | times_stock = []
32 | times_amx = []
33 | with torch.no_grad():
34 | for i in range(5): # warmup
35 | x = torch.randn(N, d, dtype=torch.bfloat16).to(torch.float)
36 | o_torch_stock = one_layer_net_torch_stock(x)
37 | o_torch_extcpp = one_layer_net_torch_extcpp(x)
38 | for i in range(1000): # measurement
39 | x = torch.randn(N, d, dtype=torch.bfloat16).to(torch.float)
40 | start = time.time()
41 | o_torch_stock = one_layer_net_torch_stock(x)
42 | time_stock = time.time() - start
43 | times_stock.append(time_stock)
44 | start = time.time()
45 | o_torch_extcpp = one_layer_net_torch_extcpp(x)
46 | time_amx = time.time() - start
47 | times_amx.append(time_amx)
48 |
49 | print(f"\none_layer_net_torch_stock(x): {o_torch_stock.shape}\n{o_torch_stock}")
50 | print(f"\none_layer_net_torch_extcpp(x): {o_torch_extcpp.shape}\n{o_torch_extcpp}")
51 |
52 | # print(f"\none_layer_net_torch_stock(x) == one_layer_net_torch_extcpp(x) ???\n{torch.testing.assert_close(o_torch_extcpp, o_torch_stock)}")
53 | print(f"Time Stock: {median(times_stock)}, Time AMX: {median(times_amx)}")
54 |
55 |
--------------------------------------------------------------------------------
/SparAMX/compare_stock_vs_onednn_linear.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | import time
4 |
5 | from layer.onednn_linear import OneDnnLinear
6 | from statistics import median
7 |
8 | torch.set_num_threads(1)
9 | d=14336
10 | o=4096
11 |
12 | execute_custom_layer = True
13 | execute_stock_layer = False
14 |
15 | one_layer_net_torch_stock = nn.Linear(in_features=d, out_features=o, bias=False)
16 |
17 | # Make sure all weights are representable in bfloat16.
18 | one_layer_net_torch_stock.weight.data = one_layer_net_torch_stock.weight.data.to(torch.bfloat16).to(torch.float)
19 | # one_layer_net_torch_stock.weight.data = torch.tensor([[0.,0]])
20 | # import pdb; pdb.set_trace()
21 | print("\n[Info]: Creating one layer neural network with nn.Linear")
22 | print(f"one_layer_net_torch_stock:\n{one_layer_net_torch_stock}")
23 |
24 | one_layer_net_torch_extcpp = OneDnnLinear.from_linear(one_layer_net_torch_stock)
25 | print("\n[Info]: Creating one layer neural network with OneDnnLinear")
26 | print(f"one_layer_net_torch_extcpp:\n{one_layer_net_torch_extcpp}")
27 |
28 | N=1
29 | x = torch.randn(N, d, dtype=torch.bfloat16).to(torch.float)
30 | # x_bf = x.to(torch.bfloat16)
31 | # x = torch.tensor([[1., 0]])
32 | # import pdb; pdb.set_trace()
33 | print(f"\n[Info]: Creating test input x ({N}, {d})")
34 | print(f"x: {x.shape}\n{x}")
35 |
36 | times_stock = []
37 | times_amx = []
38 | with torch.no_grad():
39 | for i in range(5): # warmup
40 | x = torch.randn(N, d, dtype=torch.bfloat16).to(torch.float)
41 | if execute_stock_layer:
42 | o_torch_stock = one_layer_net_torch_stock(x)
43 | if execute_custom_layer:
44 | o_torch_extcpp = one_layer_net_torch_extcpp(x)
45 | for i in range(1000): # measurement
46 | x = torch.randn(N, d, dtype=torch.bfloat16).to(torch.float)
47 | start = time.time()
48 | if execute_stock_layer:
49 | o_torch_stock = one_layer_net_torch_stock(x)
50 | time_stock = time.time() - start
51 | times_stock.append(time_stock)
52 | start = time.time()
53 | if execute_custom_layer:
54 | o_torch_extcpp = one_layer_net_torch_extcpp(x)
55 | time_amx = time.time() - start
56 | times_amx.append(time_amx)
57 |
58 | # print(f"\none_layer_net_torch_stock(x): {o_torch_stock.shape}\n{o_torch_stock}")
59 | # print(f"\none_layer_net_torch_extcpp(x): {o_torch_extcpp.shape}\n{o_torch_extcpp}")
60 | if execute_stock_layer and execute_custom_layer:
61 | torch.testing.assert_close(o_torch_extcpp, o_torch_stock)
62 |
63 | if execute_stock_layer:
64 | print(f"Time Stock: {median(times_stock)}\n")
65 | if execute_custom_layer:
66 | print(f"Time AMX: {median(times_amx)}\n")
67 |
--------------------------------------------------------------------------------
/SparAMX/deepsparse_optimized_llama2.py:
--------------------------------------------------------------------------------
1 | # Example Run: numactl --cpunodebind 0 --membind 0 --physcpubind=0-31 python deepsparse_optimized_llama2.py --batch_size 32 --num_generation_tokens 128 --warmup_iterations 5 --num_iterations 5
2 | import time
3 | import numpy as np
4 | import argparse
5 | from deepsparse import TextGeneration
6 |
7 | # def benchmark(batch_size, num_generation_tokens, input_size, num_iterations, warmup_iterations):
8 | def benchmark(batch_size, num_generation_tokens, num_iterations, warmup_iterations):
9 | # Initialize the pipeline outside of timing
10 | pipeline = TextGeneration(model="hf:neuralmagic/Llama2-7b-chat-pruned50-quant-ds")
11 |
12 | # Prepare batch inputs (empty prompts here; substitute realistic prompts of the desired input size)
13 | batch_inputs = ["" for _ in range(batch_size)]
14 |
15 | # Warm-up phase
16 | for _ in range(warmup_iterations):
17 | _ = pipeline(batch_inputs, max_new_tokens=num_generation_tokens)
18 |
19 | # Benchmarking
20 | timings = []
21 | # import pdb; pdb.set_trace()  # debug breakpoint disabled so the timing loop runs uninterrupted
22 | for _ in range(num_iterations):
23 | start_time = time.time()
24 | out = pipeline(batch_inputs, max_new_tokens=num_generation_tokens)
25 | end_time = time.time()
26 | timings.append(end_time - start_time)
27 |
28 | # Compute statistics
29 | mean_time = np.mean(timings)
30 | std_time = np.std(timings)
31 | median_time = np.median(timings)
32 | throughput = (batch_size * num_generation_tokens) / mean_time # Tokens per second
33 | # Output results
34 | print(f"Mean inference time per batch: {mean_time:.4f} seconds")
35 | print(f"Standard deviation: {std_time:.4f} seconds")
36 | print(f"Median inference time per batch: {median_time:.4f} seconds")
37 | print(f"Throughput: {throughput:.2f} tokens per second")
38 |
39 |
40 |
41 | if __name__ == "__main__":
42 | parser = argparse.ArgumentParser(description="Benchmark DeepSparse Text Generation Pipeline")
43 | parser.add_argument("--batch_size", type=int, default=1, help="Batch size for the benchmark")
44 | parser.add_argument("--num_generation_tokens", type=int, default=1, help="Number of tokens to generate")
45 | # parser.add_argument("--input_size", type=int, default=1, help="Size of the input prompt (number of characters)")
46 | parser.add_argument("--num_iterations", type=int, default=100, help="Number of iterations for benchmarking")
47 | parser.add_argument("--warmup_iterations", type=int, default=10, help="Number of warm-up iterations")
48 |
49 | args = parser.parse_args()
50 | # Run the benchmark
51 | benchmark(
52 | batch_size=args.batch_size,
53 | num_generation_tokens=args.num_generation_tokens,
54 | # input_size=args.input_size,
55 | num_iterations=args.num_iterations,
56 | warmup_iterations=args.warmup_iterations
57 | )
58 |
--------------------------------------------------------------------------------
/SparAMX/generate_experiments.py:
--------------------------------------------------------------------------------
1 | import csv
2 |
3 | # Example arrays (you can replace these with your actual data)
4 | num_threads = [32]
5 | cores = [32]
6 | # modes = ['avx_sparse', 'sparse', 'stock']
7 | modes = ['stock']
8 | context_lengths = [512, 1024, 2048, 4096, 8192, 16384]
9 | generation_lengths = [2]
10 | batch_sizes = [1]
11 | model_ids = ['meta-llama/Meta-Llama-3-8B']
12 |
13 | num_groups = [32]
14 | use_custom_attention = True
15 | use_custom_k = [True, False]
16 | use_custom_v = [True, False]
17 | k_pruning_percentages = [50, 60, 70, 80]
18 | v_pruning_percentages = [50, 60, 70, 80]
19 |
20 | # Define the CSV file path
21 | csv_file = "experiments.csv"
22 |
23 | # Create a CSV file and write the header
24 | with open(csv_file, mode="w", newline="") as file:
25 | writer = csv.writer(file)
26 | writer.writerow(["Model ID", "Saved Model Path", "Context Length", "Number of generated Tokens", "Mode", "Number of Column Groups", "Number of Threads", "Number of Cores", "Batch Size", "Use Custom Attention", "Use Custom K", "Use Custom V", "K Pruning Percentage", "V Pruning Percentage", "Prefill Latency", "Decode Latency", "Total Latency", "Layer Times"])
27 |
28 | # Loop through the arrays and write the data
29 | for model_id in model_ids:
30 | for mode in modes:
31 | placeholder_saved_model_path = f"processed_models/{mode}/{model_id.split('/')[-1]}_num_threads_threads"
32 | for generation_length in generation_lengths:
33 | for context_length in context_lengths:
34 | for core in cores:
35 | for batch_size in batch_sizes:
36 | for use_custom_k_val in use_custom_k:
37 | for use_custom_v_val in use_custom_v:
38 | for num_thread in num_threads:
39 | saved_model_path = placeholder_saved_model_path.replace('_num_threads', f'_{num_thread}')
40 | for num_group in num_groups:
41 | original_model_path = f'{saved_model_path}'
42 | if mode == 'avx_sparse':
43 | saved_model_path = f'{saved_model_path}_{num_group}_groups'
44 | for k_pruning in k_pruning_percentages:
45 | for v_pruning in v_pruning_percentages:
46 | writer.writerow([model_id, saved_model_path, context_length, generation_length, mode, num_group, num_thread, core, batch_size, use_custom_attention, use_custom_k_val, use_custom_v_val, k_pruning, v_pruning])
47 | if not use_custom_k_val:
48 | break
49 | if not use_custom_v_val:
50 | break
51 | saved_model_path = original_model_path
52 | if mode != 'avx_sparse':
53 | break
54 | if mode == 'stock':
55 | break
56 |
57 | print(f"CSV file '{csv_file}' has been generated with the data.")
58 |
--------------------------------------------------------------------------------
/SparAMX/layer/onednn_linear.py:
--------------------------------------------------------------------------------
1 | import math
2 | from torch import Tensor, nn
3 | from torch.autograd import Function
4 | import torch
5 |
6 | import custom_onednn_linear
7 |
8 |
9 | class OneDnnLinear(nn.Linear):
10 | def __init__(self, in_features: int, out_features: int, bias: bool = True, device=None, dtype=None) -> None:
11 | super().__init__(in_features, out_features, bias, device, dtype)
12 | # import pdb; pdb.set_trace()
13 | self.onednn_primitive = custom_onednn_linear.get_onednn_descriptor(1, in_features, out_features)
14 |
15 | # we only override the forward to map to our custom kernel
16 | # backward will fallback to nn.Linear.backward()
17 |
18 | def forward(self, input: Tensor) -> Tensor:
19 | # print("[Info]: Entering custom linear implementation")
20 | # import pdb; pdb.set_trace()
21 | return custom_onednn_linear.forward(self.onednn_primitive, input, self.weight, self.bias)
22 |
23 | @classmethod
24 | def from_linear(cls, nn_linear_inst: nn.Linear, shallow=False):
25 | # TODO: not an efficient implementation, we will create another copy of parameters, imagine Billion scale llm!
26 | # see llm_pipeline.py for a "Wrapper" way
27 | new_inst = cls(nn_linear_inst.in_features,
28 | nn_linear_inst.out_features,
29 | nn_linear_inst.bias is not None,
30 | next(nn_linear_inst.parameters()).device,
31 | next(nn_linear_inst.parameters()).dtype)
32 |
33 | # new_inst.weight.data = nn_linear_inst.weight.data
34 | print(f"Before weight conversion: {nn_linear_inst.weight.data}")
35 | # Convert to AMX format: Group each 2 consecutive elements, then transpose then reshape to have the same original shape.
36 | new_inst.weight.data = nn_linear_inst.weight.data
37 | # ^ The above handles corner cases for the below.
38 | # new_inst.weight.data = nn_linear_inst.weight.data.view(-1, 2, 2).transpose(0, 1).reshape(nn_linear_inst.weight.data.shape)
39 | print(f"After weight conversion: {new_inst.weight.data}")
40 | if new_inst.bias is not None:
41 | new_inst.bias.data = nn_linear_inst.bias.data
42 | return new_inst
--------------------------------------------------------------------------------
/SparAMX/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate==0.31.0
2 | aiohappyeyeballs==2.4.0
3 | aiohttp==3.10.11
4 | aiosignal==1.3.1
5 | asttokens==2.4.1
6 | attrs==24.2.0
7 | certifi==2024.7.4
8 | charset-normalizer==3.3.2
9 | coloredlogs==15.0.1
10 | comm==0.2.2
11 | contourpy==1.3.0
12 | cycler==0.12.1
13 | datasets==3.0.0
14 | debugpy==1.8.5
15 | decorator==5.1.1
16 | dill==0.3.8
17 | distlib==0.3.8
18 | evaluate==0.4.3
19 | executing==2.1.0
20 | filelock==3.15.4
21 | flatbuffers==24.3.25
22 | fonttools==4.53.1
23 | frozenlist==1.4.1
24 | fsspec==2024.6.1
25 | huggingface-hub==0.24.6
26 | humanfriendly==10.0
27 | idna==3.8
28 | iniconfig==2.0.0
29 | inquirerpy==0.3.4
30 | ipykernel==6.29.5
31 | ipython==8.27.0
32 | jedi==0.19.1
33 | Jinja2==3.1.5
34 | jupyter_client==8.6.2
35 | jupyter_core==5.7.2
36 | kiwisolver==1.4.7
37 | MarkupSafe==2.1.5
38 | matplotlib==3.9.2
39 | matplotlib-inline==0.1.7
40 | mpmath==1.3.0
41 | multidict==6.1.0
42 | multiprocess==0.70.16
43 | nest-asyncio==1.6.0
44 | networkx==3.3
45 | numpy==1.26.4
46 | nvidia-cublas-cu12==12.1.3.1
47 | nvidia-cuda-cupti-cu12==12.1.105
48 | nvidia-cuda-nvrtc-cu12==12.1.105
49 | nvidia-cuda-runtime-cu12==12.1.105
50 | nvidia-cudnn-cu12==8.9.2.26
51 | nvidia-cufft-cu12==11.0.2.54
52 | nvidia-curand-cu12==10.3.2.106
53 | nvidia-cusolver-cu12==11.4.5.107
54 | nvidia-cusparse-cu12==12.1.0.106
55 | nvidia-nccl-cu12==2.20.5
56 | nvidia-nvjitlink-cu12==12.6.20
57 | nvidia-nvtx-cu12==12.1.105
58 | onnx==1.17.0
59 | onnxconverter-common==1.14.0
60 | onnxruntime==1.19.2
61 | onnxruntime-tools==1.7.0
62 | optimum==1.22.0
63 | packaging==24.1
64 | pandas==2.2.2
65 | parso==0.8.4
66 | pexpect==4.9.0
67 | pfzy==0.3.4
68 | pillow==10.4.0
69 | platformdirs==4.3.2
70 | pluggy==1.5.0
71 | prompt_toolkit==3.0.47
72 | protobuf==3.20.2
73 | psutil==6.0.0
74 | ptyprocess==0.7.0
75 | pure_eval==0.2.3
76 | py-cpuinfo==9.0.0
77 | py3nvml==0.2.7
78 | pyarrow==17.0.0
79 | Pygments==2.18.0
80 | pyparsing==3.1.4
81 | pytest==8.3.2
82 | python-dateutil==2.9.0.post0
83 | pytz==2024.2
84 | PyYAML==6.0.2
85 | pyzmq==26.2.0
86 | regex==2024.7.24
87 | requests==2.32.3
88 | safetensors==0.4.4
89 | seaborn==0.13.2
90 | sentencepiece==0.2.0
91 | setuptools==73.0.1
92 | six==1.16.0
93 | sparse_linear==0.0.0
94 | stack-data==0.6.3
95 | sympy==1.13.2
96 | tf2onnx==1.16.1
97 | tokenizers==0.19.1
98 | torch==2.3.1
99 | tornado==6.4.2
100 | tqdm==4.66.5
101 | traitlets==5.14.3
102 | transformers==4.48.0
103 | typing_extensions==4.12.2
104 | tzdata==2024.1
105 | urllib3==2.2.2
106 | virtualenv==20.26.6
107 | wcwidth==0.2.13
108 | xmltodict==0.13.0
109 | xxhash==3.5.0
110 | yarl==1.11.1
111 |
--------------------------------------------------------------------------------
/SparAMX/run_experiments.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Input and output CSV files
4 | input_csv="experiments.csv"
5 | time=$(date +"%Y-%m-%d_%H-%M-%S")
6 | dir="experiment_results/$time"
7 | mkdir -p $dir
8 | output_csv="$dir/results.csv"
9 | prefill_time_file="prefill_time.txt"
10 | decode_time_file="decode_time.txt"
11 | total_time_file="total_time.txt"
12 | layer_times_file="layer_times.txt"
13 | num_repeats=1
14 | # Read the header and write it to the output CSV
15 | header=$(head -n 1 "$input_csv")
16 | echo "$header" > "$output_csv"
17 |
18 | # Process the CSV file line by line, skipping the header
19 | tail -n +2 "$input_csv" | tr -d '\r' | while IFS=, read -r model_id saved_model_path context_length num_generated_tokens mode num_groups num_threads core batch_size enable_custom_attention use_custom_k use_custom_v k_pruning_percentage v_pruning_percentage; do
20 |
21 | for ((i=1; i<=$num_repeats; i++)); do
22 | rm -f $prefill_time_file
23 | rm -f $decode_time_file $total_time_file  # also clear any stale total time from a previous run
24 |
25 | if [ "$mode" = "avx_sparse" ]; then
26 | sed -i '/# CHANGE BELOW FOR GROUP SIZE/!b;n;cNUM_GROUPS = '"$num_groups" layer/avx_sparse_linear.py
27 | sed -i '/\/\/ CHANGE BELOW FOR GROUP SIZE/!b;n;c#define NUM_COL_GROUPS '"$num_groups" csrc/avx_sparse_linear.cpp
28 | python setup.py install
29 | fi
30 |
31 | # construct new variable core_vals = "0-($core-1)"
32 | core_vals="0-$(($core-1))"
33 |
34 | enable_custom_attention_str=""
35 | if [ "$enable_custom_attention" = "True" ]; then
36 | enable_custom_attention_str="--enable_custom_attention"
37 | fi
38 |
39 | use_custom_k_str=""
40 | if [ "$use_custom_k" = "True" ]; then
41 | use_custom_k_str="--use_custom_k"
42 | fi
43 |
44 | use_custom_v_str=""
45 | if [ "$use_custom_v" = "True" ]; then
46 | use_custom_v_str="--use_custom_v"
47 | fi
48 |
49 | # Run the experiment
50 | numactl --cpunodebind 0 --membind 0 --physcpubind=$core_vals python llm_pipeline.py --model_id $model_id --saved_model_path $saved_model_path --context_length $context_length --num_generated_tokens $num_generated_tokens --mode $mode --num_threads $num_threads --batch_size $batch_size $enable_custom_attention_str $use_custom_k_str $use_custom_v_str --k_pruning $k_pruning_percentage --v_pruning $v_pruning_percentage
51 |
52 | # Read the result from the result file
53 | time_prefill=$(<$prefill_time_file)
54 | time_decode=$(<$decode_time_file)
55 | total_time=$(<$total_time_file)
56 | # layer_times=$(<$layer_times_file)
57 |
58 | # Construct the new CSV line with the result
59 | new_line="$model_id,$saved_model_path,$context_length,$num_generated_tokens,$mode,$num_groups,$num_threads,$core,$batch_size,$enable_custom_attention,$use_custom_k,$use_custom_v,$k_pruning_percentage,$v_pruning_percentage,$time_prefill,$time_decode,$total_time"
60 | # new_line="$model_id,$saved_model_path,$context_length,$num_generated_tokens,$mode,$num_groups,$num_threads,$core,$batch_size,$enable_custom_attention,$use_custom_k,$use_custom_v,$k_pruning_percentage,$v_pruning_percentage,$time_prefill,$time_decode,$total_time,$layer_times"
61 | # new_line="$model_id,$saved_model_path,$context_length,$num_generated_tokens,$mode,$num_groups,$num_threads,$core,$batch_size,$time_prefill,$time_decode"
62 |
63 | # Append the new line to the output CSV
64 | echo "$new_line" >> "$output_csv"
65 | done
66 | done
67 |
--------------------------------------------------------------------------------
/SparAMX/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | import shutil
4 | import torch
5 | from setuptools import setup, find_packages, Command
6 | from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDA_HOME
7 | from pathlib import Path
8 |
9 | # Get PyTorch library path
10 | TORCH_LIB_PATH = str(Path(torch.__file__).parent / 'lib')
11 |
12 | # Add torch lib path to environment
13 | if 'LD_LIBRARY_PATH' in os.environ:
14 | os.environ['LD_LIBRARY_PATH'] = f"{TORCH_LIB_PATH}:{os.environ['LD_LIBRARY_PATH']}"
15 | else:
16 | os.environ['LD_LIBRARY_PATH'] = TORCH_LIB_PATH
17 |
18 | class CustomCleanCommand(Command):
19 | """Custom clean command to tidy up the project root."""
20 | user_options = []
21 |
22 | def initialize_options(self):
23 | pass
24 |
25 | def finalize_options(self):
26 | pass
27 |
28 | def run(self):
29 | patterns_to_remove = [
30 | 'sparamx.egg-info',
31 | 'sparamx/*.so',
32 | 'sparamx/*.pyd',
33 | ]
34 |
35 | build_dirs = ['./build', './dist']
36 | for dir_path in build_dirs:
37 | if os.path.exists(dir_path):
38 | print(f'Removing directory: {dir_path}')
39 | shutil.rmtree(dir_path)
40 |
41 | for pattern in patterns_to_remove:
42 | for item in glob.glob(pattern):
43 | if os.path.isdir(item):
44 | print(f'Removing directory: {item}')
45 | shutil.rmtree(item)
46 | elif os.path.isfile(item):
47 | print(f'Removing file: {item}')
48 | os.remove(item)
49 |
50 | for root, dirs, files in os.walk('./sparamx'):
51 | if '__pycache__' in dirs:
52 | cache_dir = os.path.join(root, '__pycache__')
53 | print(f'Removing directory: {cache_dir}')
54 | shutil.rmtree(cache_dir)
55 |
56 | # Common compiler and linker arguments
57 | extra_compile_args = [
58 | '-mamx-tile',
59 | '-mamx-int8',
60 | '-mamx-bf16',
61 | '-fopenmp',
62 | '-O3',
63 | '-DNDEBUG',
64 | '-march=sapphirerapids',
65 | '-mavx512f',
66 | '-mavx512dq'
67 | ]
68 |
69 | # Add PyTorch include paths
70 | include_dirs = [
71 | os.path.join(torch.utils.cpp_extension.include_paths()[0], 'torch', 'csrc', 'api', 'include'),
72 | os.path.join(torch.utils.cpp_extension.include_paths()[0], 'torch', 'lib'),
73 | ]
74 |
75 | # Define extensions
76 | extension_specs = [
77 | ("sparse_linear", "csrc/sparse_linear.cpp"),
78 | ("avx_sparse_linear", "csrc/avx_sparse_linear.cpp"),
79 | ("quantized_sparse_linear", "csrc/quantized_sparse_linear.cpp"),
80 | ("quantized_dense_linear", "csrc/quantized_dense_linear.cpp"),
81 | ("dense_linear", "csrc/dense_linear.cpp"),
82 | ]
83 |
84 | extensions = []
85 | for name, source in extension_specs:
86 | source_path = os.path.abspath(source)
87 | print(f"Setting up extension {name} from source {source_path}")
88 |
89 | if not os.path.exists(source_path):
90 | print(f"WARNING: Source file not found: {source_path}")
91 | continue
92 |
93 | ext = CppExtension(
94 | name=f"sparamx.{name}",
95 | sources=[source_path],
96 | include_dirs=include_dirs,
97 | extra_compile_args=extra_compile_args,
98 | extra_link_args=['-lgomp', f'-Wl,-rpath,{TORCH_LIB_PATH}']
99 | )
100 | extensions.append(ext)
101 |
102 | setup(
103 | name="sparamx",
104 | version="0.1.0",
105 | packages=find_packages(),
106 | ext_modules=extensions,
107 | cmdclass={
108 | 'build_ext': BuildExtension,
109 | 'clean': CustomCleanCommand,
110 | }
111 | )
--------------------------------------------------------------------------------
/security.md:
--------------------------------------------------------------------------------
1 | # Security Policy
2 | Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, impact, severity and mitigation.
3 |
4 | ## Reporting a Vulnerability
5 | Please report any security vulnerabilities in this project [utilizing the guidelines here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html).
6 |
--------------------------------------------------------------------------------