├── __init__.py ├── benchmark ├── __init__.py ├── bench_utils.py ├── ADBConnect.py ├── run_on_device.py └── tensorrt │ ├── calibrator.py │ └── onnx_trt_test.py ├── are_16_heads ├── __init__.py ├── itp │ ├── setup.sh │ ├── run_itp.sh │ └── submit.py ├── .amltignore ├── logger.py ├── deit_tiny_head_importance.txt ├── .amltconfig ├── heads_pruning.sh ├── heads_ablation.sh ├── requirements.txt ├── deit_small_head_importance.txt ├── deit_base_head_importance.txt ├── classifier_scoring.py ├── prepare_task.sh ├── fetch_results.py ├── evaluate_iterative_pruned_deit.py ├── finetune.py ├── util.py └── pruning.py ├── deit_pruning ├── src │ ├── inspector │ │ ├── __init__.py │ │ └── get_sparsity.py │ ├── preprocessing │ │ ├── __init__.py │ │ └── random_select.py │ ├── pytorch_prune │ │ ├── __init__.py │ │ ├── ln_smart.py │ │ ├── block.py │ │ └── pruner.py │ ├── deepspeed_config │ │ ├── deepspeed.json │ │ ├── deepspeed_deit_base.json │ │ ├── deepspeed_deit_small.json │ │ ├── deepspeed_deit_tiny.json │ │ ├── deepspeed_finetune_deit_tiny.json │ │ ├── deepspeed_finetune_deit_base.json │ │ └── deepspeed_finetune_deit_small.json │ ├── validate.py │ ├── analyse.py │ ├── onnx_inference.py │ ├── latency_model.py │ ├── onnx_export.py │ ├── layers │ │ └── super_bertlayers.py │ ├── model.py │ ├── supernet.py │ ├── get_latency.py │ ├── data.py │ └── trainer.py ├── vendor │ └── nn_pruning_v1 │ │ ├── MANIFEST.in │ │ ├── nn_pruning │ │ ├── tests │ │ │ ├── __init__.py │ │ │ ├── test_quantization.py │ │ │ ├── test_patch2.py │ │ │ └── test_patch.py │ │ ├── __init__.py │ │ ├── modules │ │ │ ├── gelu2relu.py │ │ │ ├── AmpereRework.ipynb │ │ │ ├── quantization_config.py │ │ │ └── nonorm.py │ │ ├── model_patcher.py │ │ ├── training_patcher.py │ │ └── hp_naming.py │ │ ├── pyproject.toml │ │ ├── .isort.cfg │ │ ├── Makefile │ │ ├── .gitignore │ │ └── setup.py ├── config │ ├── topk-hybrid-struct-layerwise.json │ ├── topk-hybrid-struct-layerwise-base.json │ ├── topk-hybrid-struct-layerwise-small.json │ ├── topk-hybrid-struct-layerwise-tiny.json │ ├── topk-hybrid.json │ ├── topk-hybrid-block4x4.json │ ├── topk-hybrid-struct.json │ ├── magnitude-hybrid.json │ ├── topk-hybrid-block16x16.json │ ├── sigmoied_threshold-hybrid.json │ ├── topk-unstructured.json │ └── sigmoied_threshold-unstructured.json └── requirements.txt ├── requirements.txt ├── modeling ├── torch_layers │ ├── activation.py │ ├── residual.py │ ├── norm.py │ ├── ffn.py │ └── attention.py ├── layers │ ├── residual.py │ ├── ffn.py │ ├── norm.py │ ├── activation.py │ ├── embedding.py │ ├── attention.py │ ├── tf1_layers.py │ └── transformer_encoder.py ├── save_model.py └── models │ ├── squeezenet.py │ ├── cnn_zoo.py │ └── vit.py ├── .gitignore ├── experiments ├── D1130_vino_quant_cnn_test.py ├── D1130_tflite_gpu_r21_benchmark.py ├── D0104_tvm_fusion_test.py ├── D1207_tflite_quant_cnn_test.py └── D1207_vino_quant_cnn_test.py ├── run.sh └── draw.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /are_16_heads/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deit_pruning/src/inspector/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deit_pruning/src/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deit_pruning/src/pytorch_prune/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deit_pruning/vendor/nn_pruning_v1/MANIFEST.in: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deit_pruning/vendor/nn_pruning_v1/nn_pruning/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deit_pruning/vendor/nn_pruning_v1/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 119 3 | target-version = ['py35'] 4 | -------------------------------------------------------------------------------- /deit_pruning/vendor/nn_pruning_v1/.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | multi_line_output=3 3 | include_trailing_comma=True 4 | 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow>=2.5.2 2 | onnx-tf==1.8.0 3 | timm==0.4.12 4 | torch==1.9.1 5 | torchvision==0.10.1 6 | onnx==1.10.1 7 | onnxruntime==1.9.0 -------------------------------------------------------------------------------- /are_16_heads/itp/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pip install -r requirements.txt 4 | cd vendor/huggingface_transformers 5 | python setup.py install --user 6 | cd ../..
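7 | 8 | # Assumed usage (a sketch; the relative paths above imply the working directory): 9 | #     cd are_16_heads && bash itp/setup.sh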
-------------------------------------------------------------------------------- /deit_pruning/vendor/nn_pruning_v1/nn_pruning/__init__.py: -------------------------------------------------------------------------------- 1 | def run1(path, output): 2 | return "run1_ok" 3 | 4 | 5 | def run2(path, output): 6 | return "run2_ok" 7 | -------------------------------------------------------------------------------- /are_16_heads/.amltignore: -------------------------------------------------------------------------------- 1 | vendor/huggingface_transformers/docker 2 | vendor/huggingface_transformers/docs 3 | vendor/huggingface_transformers/examples 4 | vendor/huggingface_transformers/tests 5 | amlt/ 6 | itp/.tmp 7 | -------------------------------------------------------------------------------- /modeling/torch_layers/activation.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | 4 | def gelu(x): 5 | cdf = 0.5 * (1.0 + torch.tanh( 6 | (math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))) 7 | return x * cdf -------------------------------------------------------------------------------- /are_16_heads/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s', 4 | datefmt='%H:%M:%S', 5 | level=logging.INFO) 6 | logger = logging.getLogger(__name__) 7 | -------------------------------------------------------------------------------- /modeling/layers/residual.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | class Residual(tf.keras.Model): 4 | def __init__(self, fn): 5 | super().__init__() 6 | self.fn = fn 7 | 8 | def call(self, x): 9 | return self.fn(x) + x -------------------------------------------------------------------------------- /modeling/torch_layers/residual.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class Residual(nn.Module): 5 | def __init__(self, sub_layer): 6 | super().__init__() 7 | self.sub_layer = sub_layer 8 | 9 | def forward(self, x): 10 | return x + self.sub_layer(x) -------------------------------------------------------------------------------- /deit_pruning/vendor/nn_pruning_v1/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: style test 2 | 3 | # Run code quality checks 4 | style: 5 | black . 6 | isort . 
7 | 8 | # Run tests for the library 9 | test: 10 | python -m pytest nn_pruning 11 | 12 | build_dist: 13 | rm -fr build 14 | rm -fr dist 15 | python -m build 16 | 17 | pypi_upload: build_dist 18 | python -m twine upload dist/* 19 | -------------------------------------------------------------------------------- /are_16_heads/deit_tiny_head_importance.txt: -------------------------------------------------------------------------------- 1 | 0.88811 0.35664 0.28993 2 | 0.34891 0.88253 0.31530 3 | 0.46338 0.44602 0.76573 4 | 0.64993 0.54293 0.53182 5 | 0.56222 0.61737 0.55024 6 | 0.50725 0.74968 0.42505 7 | 0.65688 0.56474 0.49958 8 | 0.56039 0.57933 0.59190 9 | 0.56777 0.53060 0.62936 10 | 0.57603 0.50893 0.63967 11 | 0.55203 0.46831 0.68989 12 | 0.30792 0.62804 0.71467 -------------------------------------------------------------------------------- /modeling/layers/ffn.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from .activation import gelu 3 | 4 | 5 | class FeedForward(tf.keras.Model): 6 | def __init__(self, dim, hidden_dim): 7 | super().__init__() 8 | self.net = tf.keras.Sequential([tf.keras.layers.Dense(hidden_dim, activation=gelu), 9 | tf.keras.layers.Dense(dim)]) 10 | 11 | def call(self, x): 12 | return self.net(x) -------------------------------------------------------------------------------- /are_16_heads/.amltconfig: -------------------------------------------------------------------------------- 1 | {"project_name": "are16heads_deit", "storage_account_name": "hexnas", "container_name": "amulet", "blob_storage_account_name": "hexnas", "registry_name": "projects", "targets": {}, "local_path": "/data/data1/v-xudongwang/benchmark_tools/are_16_heads", "default_output_dir": "/data/data1/v-xudongwang/benchmark_tools/are_16_heads/amlt", "project_uuid": "7366271800.53065-27c70f65-df0a-43de-b226-b1e82b3d54a1", "version": "8.1.3"} -------------------------------------------------------------------------------- /are_16_heads/heads_pruning.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | TASK=$1 4 | OPTIONS="${@:2}" 5 | 6 | here="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 7 | source $here/prepare_task.sh $TASK 8 | 9 | echo $base_acc 10 | prune_options="--do_prune --eval_pruned --prune_percent `seq 5 5 100` $OPTIONS" 11 | run_eval "$prune_options" 12 | 13 | # prune cmd: bash experiments/BERT/heads_pruning.sh MNLI --normalize_pruning_by_layer -------------------------------------------------------------------------------- /deit_pruning/config/topk-hybrid-struct-layerwise.json: -------------------------------------------------------------------------------- 1 | { 2 | "dense_pruning_method": "topK:1d_alt", 3 | "attention_pruning_method": "topK", 4 | "initial_threshold": 1.0, 5 | "initial_warmup": 1, 6 | "final_warmup": 3, 7 | "attention_block_rows": 64, 8 | "attention_block_cols": 256, 9 | "attention_output_with_dense": 0, 10 | "regularization_final_lambda": 20, 11 | "dense_lambda": 0.25, 12 | "regularization": "l1" 13 | } -------------------------------------------------------------------------------- /deit_pruning/config/topk-hybrid-struct-layerwise-base.json: -------------------------------------------------------------------------------- 1 | { 2 | "dense_pruning_method": "topK:1d_alt", 3 | "attention_pruning_method": "topK", 4 | "initial_threshold": 1.0, 5 | "initial_warmup": 1, 6 | "final_warmup": 3, 7 | "attention_block_rows": 64, 
8 | "attention_block_cols": 768, 9 | "attention_output_with_dense": 0, 10 | "regularization_final_lambda": 20, 11 | "dense_lambda": 0.25, 12 | "regularization": "l1" 13 | } -------------------------------------------------------------------------------- /deit_pruning/config/topk-hybrid-struct-layerwise-small.json: -------------------------------------------------------------------------------- 1 | { 2 | "dense_pruning_method": "topK:1d_alt", 3 | "attention_pruning_method": "topK", 4 | "initial_threshold": 1.0, 5 | "initial_warmup": 1, 6 | "final_warmup": 3, 7 | "attention_block_rows": 64, 8 | "attention_block_cols": 384, 9 | "attention_output_with_dense": 0, 10 | "regularization_final_lambda": 20, 11 | "dense_lambda": 0.25, 12 | "regularization": "l1" 13 | } -------------------------------------------------------------------------------- /deit_pruning/config/topk-hybrid-struct-layerwise-tiny.json: -------------------------------------------------------------------------------- 1 | { 2 | "dense_pruning_method": "topK:1d_alt", 3 | "attention_pruning_method": "topK", 4 | "initial_threshold": 1.0, 5 | "initial_warmup": 1, 6 | "final_warmup": 3, 7 | "attention_block_rows": 64, 8 | "attention_block_cols": 192, 9 | "attention_output_with_dense": 0, 10 | "regularization_final_lambda": 20, 11 | "dense_lambda": 0.25, 12 | "regularization": "l1" 13 | } -------------------------------------------------------------------------------- /deit_pruning/config/topk-hybrid.json: -------------------------------------------------------------------------------- 1 | { 2 | "dense_pruning_method": "topK:1d_alt", 3 | "attention_pruning_method": "topK", 4 | "initial_threshold": 1.0, 5 | "final_threshold": 0.5, 6 | "initial_warmup": 1, 7 | "final_warmup": 3, 8 | "attention_block_rows": 32, 9 | "attention_block_cols": 32, 10 | "attention_output_with_dense": 0, 11 | "regularization_final_lambda": 20, 12 | "dense_lambda": 0.25, 13 | "regularization": "l1" 14 | } -------------------------------------------------------------------------------- /modeling/layers/norm.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | class LayerNorm(tf.keras.Model): 4 | def __init__(self, fn, pre=False): 5 | super().__init__() 6 | self.norm = tf.keras.layers.LayerNormalization(epsilon=1e-5) 7 | self.fn = fn 8 | self.pre = pre 9 | 10 | def call(self, x): 11 | if self.pre: 12 | return self.fn(self.norm(x)) 13 | else: 14 | return self.norm(self.fn(x)) -------------------------------------------------------------------------------- /deit_pruning/config/topk-hybrid-block4x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "dense_pruning_method": "topK:1d_alt", 3 | "attention_pruning_method": "topK", 4 | "initial_threshold": 1.0, 5 | "final_threshold": 0.5, 6 | "initial_warmup": 1, 7 | "final_warmup": 3, 8 | "attention_block_rows": 4, 9 | "attention_block_cols": 4, 10 | "attention_output_with_dense": 0, 11 | "regularization_final_lambda": 20, 12 | "dense_lambda": 0.25, 13 | "regularization": "l1" 14 | } -------------------------------------------------------------------------------- /deit_pruning/config/topk-hybrid-struct.json: -------------------------------------------------------------------------------- 1 | { 2 | "dense_pruning_method": "topK:1d_alt", 3 | "attention_pruning_method": "topK", 4 | "initial_threshold": 1.0, 5 | "final_threshold": 0.5, 6 | "initial_warmup": 1, 7 | "final_warmup": 3, 8 | "attention_block_rows": 32, 9 
| "attention_block_cols": 32, 10 | "attention_output_with_dense": 0, 11 | "regularization_final_lambda": 20, 12 | "dense_lambda": 0.25, 13 | "regularization": "l1" 14 | } -------------------------------------------------------------------------------- /deit_pruning/config/magnitude-hybrid.json: -------------------------------------------------------------------------------- 1 | { 2 | "dense_pruning_method": "magnitude:1d_alt", 3 | "attention_pruning_method": "magnitude", 4 | "initial_threshold": 1.0, 5 | "final_threshold": 0.5, 6 | "initial_warmup": 1, 7 | "final_warmup": 3, 8 | "attention_block_rows": 32, 9 | "attention_block_cols": 32, 10 | "attention_output_with_dense": 0, 11 | "regularization_final_lambda": 20, 12 | "dense_lambda": 0.25, 13 | "regularization": "l1" 14 | } -------------------------------------------------------------------------------- /deit_pruning/config/topk-hybrid-block16x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "dense_pruning_method": "topK:1d_alt", 3 | "attention_pruning_method": "topK", 4 | "initial_threshold": 1.0, 5 | "final_threshold": 0.5, 6 | "initial_warmup": 1, 7 | "final_warmup": 3, 8 | "attention_block_rows": 16, 9 | "attention_block_cols": 16, 10 | "attention_output_with_dense": 0, 11 | "regularization_final_lambda": 20, 12 | "dense_lambda": 0.25, 13 | "regularization": "l1" 14 | } -------------------------------------------------------------------------------- /deit_pruning/config/sigmoied_threshold-hybrid.json: -------------------------------------------------------------------------------- 1 | { 2 | "dense_pruning_method": "sigmoied_threshold:1d_alt", 3 | "attention_pruning_method": "sigmoied_threshold", 4 | "initial_threshold": 0.0, 5 | "final_threshold": 0.1, 6 | "initial_warmup": 1, 7 | "final_warmup": 3, 8 | "attention_block_rows": 32, 9 | "attention_block_cols": 32, 10 | "attention_output_with_dense": 0, 11 | "regularization_final_lambda": 20, 12 | "dense_lambda": 0.25, 13 | "regularization": "l1" 14 | } -------------------------------------------------------------------------------- /modeling/layers/activation.py: -------------------------------------------------------------------------------- 1 | import math 2 | import tensorflow as tf 3 | 4 | def gelu(x): 5 | """Gaussian Error Linear Unit. 6 | This is a smoother version of the RELU. 7 | Original paper: https://arxiv.org/abs/1606.08415 8 | Args: 9 | x: float Tensor to perform activation. 10 | Returns: 11 | `x` with the GELU activation applied. 
12 | """ 13 | cdf = 0.5 * (1.0 + tf.tanh( 14 | (math.sqrt(2 / math.pi) * (x + 0.044715 * tf.pow(x, 3))))) 15 | return x * cdf -------------------------------------------------------------------------------- /deit_pruning/config/topk-unstructured.json: -------------------------------------------------------------------------------- 1 | { 2 | "dense_pruning_method": "topK", 3 | "attention_pruning_method": "topK", 4 | "initial_threshold": 1.0, 5 | "final_threshold": 0.5, 6 | "initial_warmup": 1, 7 | "final_warmup": 3, 8 | "attention_block_rows": 1, 9 | "attention_block_cols": 1, 10 | "dense_block_rows": 1, 11 | "dense_block_cols": 1, 12 | "attention_output_with_dense": 0, 13 | "regularization_final_lambda": 20, 14 | "dense_lambda": 0.25, 15 | "regularization": "l1" 16 | } -------------------------------------------------------------------------------- /modeling/torch_layers/norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class LayerNorm(nn.Module): 5 | def __init__(self, input_shape, sub_layer, is_pre=False) -> None: 6 | super().__init__() 7 | self.layer_norm = nn.LayerNorm(input_shape) 8 | self.sub_layer = sub_layer 9 | self.is_pre = is_pre 10 | 11 | def forward(self, x): 12 | if self.is_pre: 13 | return self.sub_layer(self.layer_norm(x)) 14 | else: 15 | return self.layer_norm(self.sub_layer(x)) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | vit_model.py 2 | vit_huggingface.py 3 | play.py 4 | **/foo.sh 5 | **/__pycache__ 6 | are_16_heads/training_log/ 7 | are_16_heads/vendor 8 | are_16_heads/amlt 9 | are_16_heads/itp 10 | deit_pruning/itp 11 | deit_pruning/results 12 | deit_pruning/logs 13 | deit_pruning/vendor/nn_pruning 14 | deit_pruning/vendor/onnx_scripts 15 | deit_pruning/vendor/nn_pruning_v1/analysis 16 | deit_pruning/vendor/nn_pruning_v1/docs 17 | deit_pruning/vendor/nn_pruning_v1/examples 18 | deit_pruning/vendor/nn_pruning_v1/notebooks 19 | model_zoo 20 | tmp* 21 | -------------------------------------------------------------------------------- /deit_pruning/requirements.txt: -------------------------------------------------------------------------------- 1 | certifi==2021.5.30 2 | chardet==4.0.0 3 | click==8.0.1 4 | deepspeed==0.4.1 5 | filelock==3.0.12 6 | huggingface-hub==0.0.8 7 | idna==2.10 8 | joblib==1.0.1 9 | numpy==1.20.3 10 | onnxruntime==1.6.0 11 | packaging==20.9 12 | pyparsing==2.4.7 13 | PyYAML==5.4.1 14 | regex==2021.4.4 15 | requests==2.25.1 16 | sacremoses==0.0.45 17 | six==1.16.0 18 | tokenizers==0.10.3 19 | torch==1.8.1 20 | tqdm==4.61.1 21 | transformers==4.7.0 22 | typing-extensions==3.10.0.0 23 | urllib3==1.26.5 24 | torchvision==0.9.1 25 | timm==0.4.12 -------------------------------------------------------------------------------- /deit_pruning/config/sigmoied_threshold-unstructured.json: -------------------------------------------------------------------------------- 1 | { 2 | "dense_pruning_method": "sigmoied_threshold", 3 | "attention_pruning_method": "sigmoied_threshold", 4 | "initial_threshold": 0.0, 5 | "final_threshold": 0.1, 6 | "initial_warmup": 1, 7 | "final_warmup": 3, 8 | "attention_block_rows": 1, 9 | "attention_block_cols": 1, 10 | "dense_block_rows": 1, 11 | "dense_block_cols": 1, 12 | "attention_output_with_dense": 0, 13 | "regularization_final_lambda": 20, 14 | "dense_lambda": 0.25, 15 | "regularization": "l1" 16 | } 
-------------------------------------------------------------------------------- /modeling/torch_layers/ffn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn.modules import activation 4 | from .activation import gelu 5 | 6 | 7 | class FeedForward(nn.Module): 8 | def __init__(self, hidden_size, intermediate_size): 9 | super().__init__() 10 | self.linear1 = nn.Linear(hidden_size, intermediate_size) 11 | self.linear2 = nn.Linear(intermediate_size, hidden_size) 12 | 13 | def forward(self, x): 14 | x = self.linear1(x) 15 | x = gelu(x) 16 | x = self.linear2(x) 17 | return x -------------------------------------------------------------------------------- /deit_pruning/vendor/nn_pruning_v1/.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled python modules. 2 | *.pyc 3 | *.pyo 4 | 5 | # Setuptools distribution folder. 6 | /dist/ 7 | /build/ 8 | 9 | # Python egg metadata, regenerated from source files by setuptools. 10 | /*.egg-info 11 | /*.egg 12 | 13 | # emacs Files 14 | *~ 15 | 16 | # Python cache files 17 | __pycache__/ 18 | 19 | # Jupyter Notebook 20 | .ipynb_checkpoints 21 | 22 | # wandb information directory 23 | wandb 24 | 25 | # backup directories 26 | back 27 | 28 | venv 29 | .vscode 30 | notebooks/models/ 31 | notebooks/checkpoints/ -------------------------------------------------------------------------------- /are_16_heads/heads_ablation.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | TASK=$1 4 | OPTIONS="${@:2}" 5 | 6 | here="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 7 | source $here/prepare_task.sh $TASK 8 | 9 | 10 | echo $base_acc 11 | echo $part 12 | for layer in `seq 1 12` 13 | do 14 | echo -n "$layer" 15 | for head in `seq 1 12` 16 | do 17 | mask_str="${layer}:${head}" 18 | acc=$(run_eval "--attention_mask_heads $mask_str $OPTIONS" | grep $metric | rev | cut -d" " -f1 | rev) 19 | printf "\t%.5f" $(echo "$acc - $base_acc" | bc ) 20 | done 21 | done 22 | 23 | -------------------------------------------------------------------------------- /are_16_heads/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3==1.18.50 2 | botocore==1.21.50 3 | certifi==2021.5.30 4 | charset-normalizer==2.0.6 5 | click==8.0.1 6 | filelock==3.1.0 7 | huggingface-hub==0.0.17 8 | idna==3.2 9 | jmespath==0.10.0 10 | joblib==1.0.1 11 | numpy==1.21.2 12 | packaging==21.0 13 | Pillow==8.3.2 14 | pyparsing==2.4.7 15 | python-dateutil==2.8.2 16 | PyYAML==5.4.1 17 | regex==2021.9.24 18 | requests==2.26.0 19 | s3transfer==0.5.0 20 | sacremoses==0.0.46 21 | six==1.16.0 22 | timm==0.4.12 23 | tokenizers==0.10.3 24 | torch==1.9.1 25 | torchvision==0.10.1 26 | tqdm==4.62.3 27 | typing-extensions==3.10.0.2 28 | urllib3==1.26.7 29 | -------------------------------------------------------------------------------- /deit_pruning/src/deepspeed_config/deepspeed.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 256, 3 | "steps_per_print": 10, 4 | "optimizer": { 5 | "type": "Adam", 6 | "params": { 7 | "lr": 3e-5, 8 | "weight_decay": 0.01, 9 | "bias_correction": false 10 | } 11 | }, 12 | "scheduler": { 13 | "type": "WarmupDecayLR", 14 | "params": { 15 | "warmup_min_lr": 0, 16 | "warmup_max_lr": "auto", 17 | "warmup_num_steps": "auto", 18 | 
"total_num_steps": "auto" 19 | } 20 | }, 21 | "gradient_clipping": 1.0, 22 | "fp16": { 23 | "enabled": false 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /deit_pruning/src/deepspeed_config/deepspeed_deit_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 64, 3 | "steps_per_print": 10, 4 | "optimizer": { 5 | "type": "Adam", 6 | "params": { 7 | "lr": 0.000025, 8 | "weight_decay": 0.01, 9 | "bias_correction": false 10 | } 11 | }, 12 | "scheduler": { 13 | "type": "WarmupDecayLR", 14 | "params": { 15 | "warmup_min_lr": 0, 16 | "warmup_max_lr": "auto", 17 | "warmup_num_steps": "auto", 18 | "total_num_steps": "auto" 19 | } 20 | }, 21 | "gradient_clipping": 1.0, 22 | "fp16": { 23 | "enabled": false 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /deit_pruning/src/deepspeed_config/deepspeed_deit_small.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 128, 3 | "steps_per_print": 10, 4 | "optimizer": { 5 | "type": "Adam", 6 | "params": { 7 | "lr": 5e-5, 8 | "weight_decay": 0.01, 9 | "bias_correction": false 10 | } 11 | }, 12 | "scheduler": { 13 | "type": "WarmupDecayLR", 14 | "params": { 15 | "warmup_min_lr": 0, 16 | "warmup_max_lr": "auto", 17 | "warmup_num_steps": "auto", 18 | "total_num_steps": "auto" 19 | } 20 | }, 21 | "gradient_clipping": 1.0, 22 | "fp16": { 23 | "enabled": false 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /deit_pruning/src/deepspeed_config/deepspeed_deit_tiny.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 256, 3 | "steps_per_print": 10, 4 | "optimizer": { 5 | "type": "Adam", 6 | "params": { 7 | "lr": 1e-4, 8 | "weight_decay": 0.01, 9 | "bias_correction": false 10 | } 11 | }, 12 | "scheduler": { 13 | "type": "WarmupDecayLR", 14 | "params": { 15 | "warmup_min_lr": 0, 16 | "warmup_max_lr": "auto", 17 | "warmup_num_steps": "auto", 18 | "total_num_steps": "auto" 19 | } 20 | }, 21 | "gradient_clipping": 1.0, 22 | "fp16": { 23 | "enabled": false 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /deit_pruning/src/deepspeed_config/deepspeed_finetune_deit_tiny.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 256, 3 | "steps_per_print": 10, 4 | "optimizer": { 5 | "type": "Adam", 6 | "params": { 7 | "lr": 1e-5, 8 | "weight_decay": 0.01, 9 | "bias_correction": false 10 | } 11 | }, 12 | "scheduler": { 13 | "type": "WarmupDecayLR", 14 | "params": { 15 | "warmup_min_lr": 0, 16 | "warmup_max_lr": "auto", 17 | "warmup_num_steps": "auto", 18 | "total_num_steps": "auto" 19 | } 20 | }, 21 | "gradient_clipping": 1.0, 22 | "fp16": { 23 | "enabled": false 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /modeling/save_model.py: -------------------------------------------------------------------------------- 1 | from models.vit import ViT 2 | import tensorflow as tf 3 | 4 | 5 | if __name__ == '__main__': 6 | vit_config = { 7 | "image_size":224, 8 | "patch_size":16, 9 | "num_classes":1000, 10 | "dim":768, 11 | "depth":12, 12 | "heads":12, 13 | "mlp_dim":3072 14 | } 15 | 16 | vit = ViT(**vit_config) 17 | vit = 
tf.keras.Sequential([ 18 | tf.keras.layers.InputLayer(input_shape=(3, vit_config["image_size"], vit_config["image_size"]), batch_size=1), 19 | vit, 20 | ]) 21 | 22 | vit.save(f'/data/v-xudongwang/models/tf_model/vit_test_patch16_224.tf') -------------------------------------------------------------------------------- /are_16_heads/deit_small_head_importance.txt: -------------------------------------------------------------------------------- 1 | 0.18231 0.24750 0.70043 0.29003 0.19202 0.54214 2 | 0.38329 0.25754 0.29344 0.27641 0.44560 0.65245 3 | 0.33424 0.36127 0.50169 0.53243 0.30777 0.35759 4 | 0.43552 0.38621 0.45984 0.28844 0.43485 0.42121 5 | 0.40952 0.40148 0.41213 0.33956 0.40213 0.47356 6 | 0.49885 0.34838 0.41573 0.44428 0.35074 0.36952 7 | 0.36161 0.35020 0.48799 0.38296 0.46031 0.38720 8 | 0.41847 0.36983 0.40848 0.36265 0.50831 0.36244 9 | 0.33773 0.37743 0.40715 0.41210 0.52201 0.36797 10 | 0.36294 0.30509 0.55308 0.42384 0.42007 0.33644 11 | 0.34931 0.60585 0.26035 0.28327 0.31772 0.51182 12 | 0.22166 0.13127 0.15896 0.37551 0.73163 0.48175 -------------------------------------------------------------------------------- /deit_pruning/src/deepspeed_config/deepspeed_finetune_deit_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 64, 3 | "steps_per_print": 10, 4 | "optimizer": { 5 | "type": "Adam", 6 | "params": { 7 | "lr": 0.0000025, 8 | "weight_decay": 0.01, 9 | "bias_correction": false 10 | } 11 | }, 12 | "scheduler": { 13 | "type": "WarmupDecayLR", 14 | "params": { 15 | "warmup_min_lr": 0, 16 | "warmup_max_lr": "auto", 17 | "warmup_num_steps": "auto", 18 | "total_num_steps": "auto" 19 | } 20 | }, 21 | "gradient_clipping": 1.0, 22 | "fp16": { 23 | "enabled": false 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /deit_pruning/src/deepspeed_config/deepspeed_finetune_deit_small.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 128, 3 | "steps_per_print": 10, 4 | "optimizer": { 5 | "type": "Adam", 6 | "params": { 7 | "lr": 5e-6, 8 | "weight_decay": 0.01, 9 | "bias_correction": false 10 | } 11 | }, 12 | "scheduler": { 13 | "type": "WarmupDecayLR", 14 | "params": { 15 | "warmup_min_lr": 0, 16 | "warmup_max_lr": "auto", 17 | "warmup_num_steps": "auto", 18 | "total_num_steps": "auto" 19 | } 20 | }, 21 | "gradient_clipping": 1.0, 22 | "fp16": { 23 | "enabled": false 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /deit_pruning/src/validate.py: -------------------------------------------------------------------------------- 1 | from nn_pruning.inference_model_patcher import optimize_model as nn_optimize 2 | from model import SwiftBERT 3 | from transformers import AutoModelForImageClassification 4 | import sys 5 | model = AutoModelForImageClassification.from_pretrained(sys.argv[1]) 6 | # model = SwiftBERTOutput.from_pretrained('results/playground/swift_bert_final') 7 | original_params = model.num_parameters() 8 | print('=== model before optimize ===') 9 | print(model) 10 | model = nn_optimize(model, "dense") 11 | pruned_params = model.num_parameters() 12 | print("Original params:", original_params) 13 | print("After-pruned params:", pruned_params) 14 | print('=== model after optimize ===') 15 | print(model) 16 | -------------------------------------------------------------------------------- 
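Note: a minimal invocation sketch for src/validate.py above (run from deit_pruning/, assuming nn_pruning is installed and a pruned checkpoint directory exists; the path reuses the example from src/inspector/get_sparsity.py): python src/validate.py results/playground/final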
/modeling/layers/embedding.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | def get_sinusoid_encoding(n_position, d_hid): 5 | ''' Sinusoid position encoding table ''' 6 | 7 | def get_position_angle_vec(position): 8 | return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)] 9 | 10 | sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)]) 11 | sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i 12 | sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 13 | 14 | sinusoid_table = tf.convert_to_tensor(sinusoid_table, dtype=tf.float32) 15 | return tf.expand_dims(sinusoid_table, axis=0) 16 | -------------------------------------------------------------------------------- /deit_pruning/src/analyse.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | from transformers import AutoModelForImageClassification 4 | import torch 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('--model_path', type=Path, required=True, help='pretrained model path to analyse') 10 | args = parser.parse_args() 11 | 12 | model = AutoModelForImageClassification.from_pretrained(args.model_path) 13 | attention = model.vit.encoder.layer[0].attention.attention 14 | qkv_weight = [attention.query.weight, attention.key.weight, attention.value.weight] 15 | qkv_sparsity = [torch.sum(x == 0) / np.prod(x.shape) for x in qkv_weight] 16 | qkv_name = ['query', 'key', 'value'] 17 | 18 | print('Model Layer0 attention analyse summary') 19 | print('qkv sparsity', qkv_sparsity) 20 | 21 | for i in range(3): 22 | plt.imshow(qkv_weight[i] == 0) 23 | plt.savefig(args.model_path / f'{qkv_name[i]}_sparsity.png') -------------------------------------------------------------------------------- /deit_pruning/vendor/nn_pruning_v1/nn_pruning/tests/test_quantization.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from transformers import AutoModelForQuestionAnswering, AutoTokenizer 4 | 5 | from nn_pruning.modules.quantization import ( 6 | prepare_qat, 7 | prepare_static, 8 | quantize, 9 | ) 10 | 11 | 12 | class TestQuantization(unittest.TestCase): 13 | def _test_quantization(self, prepare_fn): 14 | model_name = "bert-base-uncased" 15 | tokenizer = AutoTokenizer.from_pretrained(model_name) 16 | model = AutoModelForQuestionAnswering.from_pretrained(model_name) 17 | prepared_model = prepare_fn( 18 | model, input_names=["input_ids", "attention_mask", "token_type_ids"], qconfig_name="default" 19 | ) 20 | prepared_model(**prepared_model.dummy_inputs) 21 | quantized = quantize(prepared_model) 22 | quantized(**prepared_model.dummy_inputs) 23 | 24 | def test_static_quantization(self): 25 | self._test_quantization(prepare_static) 26 | 27 | def test_qat(self): 28 | self._test_quantization(prepare_qat) 29 | -------------------------------------------------------------------------------- /are_16_heads/deit_base_head_importance.txt: -------------------------------------------------------------------------------- 1 | 0.10633 0.21009 0.17917 0.17396 0.23997 0.03947 0.66133 0.04475 0.20111 0.12547 0.55047 0.15649 2 | 0.85622 0.07048 0.14891 0.13867 0.10301 0.17172 0.11505 0.32608 0.11934 0.07479 0.10796 0.17137 3 | 0.13972 0.13900 0.27455 0.16737 0.13647 0.19076 
0.55112 0.62408 0.22676 0.18713 0.10770 0.10728 4 | 0.33292 0.37048 0.27849 0.43238 0.18705 0.15823 0.30169 0.20417 0.44913 0.18356 0.16674 0.17738 5 | 0.32738 0.28058 0.33533 0.16170 0.26594 0.15998 0.36160 0.35538 0.30818 0.26875 0.21111 0.33221 6 | 0.26823 0.40010 0.35554 0.25859 0.24402 0.26022 0.24198 0.35145 0.20782 0.26094 0.21693 0.32716 7 | 0.20791 0.18577 0.28252 0.25817 0.37873 0.42019 0.27734 0.32148 0.25068 0.30721 0.19980 0.28000 8 | 0.24535 0.35692 0.31715 0.21870 0.22684 0.40151 0.30993 0.28751 0.25410 0.27766 0.29290 0.21250 9 | 0.29598 0.26994 0.34892 0.23472 0.30831 0.25975 0.25006 0.29248 0.31943 0.36282 0.23939 0.24718 10 | 0.23726 0.30566 0.27336 0.22064 0.35475 0.22777 0.28260 0.33208 0.23842 0.41765 0.22885 0.27451 11 | 0.23946 0.23778 0.38542 0.20270 0.24301 0.20889 0.28000 0.24122 0.51742 0.21560 0.30406 0.22484 12 | 0.28876 0.22567 0.17927 0.18561 0.60431 0.09382 0.10180 0.10877 0.27325 0.30052 0.36722 0.32107 -------------------------------------------------------------------------------- /benchmark/bench_utils.py: -------------------------------------------------------------------------------- 1 | def fetech_tf_bench_results(result_str): 2 | if rfind_assign_int(result_str, 'count') >= 2: 3 | std_ms = rfind_assign_float(result_str, 'std') / 1e3 4 | avg_ms = rfind_assign_float(result_str, 'avg') / 1e3 5 | mem_mb = rfind_assign_float(result_str, 'overall') 6 | else: 7 | std_ms = 0 8 | avg_ms = rfind_assign_float(result_str, 'curr') / 1e3 9 | mem_mb = rfind_assign_float(result_str, 'overall') 10 | 11 | return std_ms, avg_ms, mem_mb 12 | 13 | def rfind_assign(s, mark): 14 | mark += "=" 15 | p = s.rfind(mark) 16 | assert p != -1 17 | l_idx = p + len(mark) 18 | r_idx = l_idx 19 | while s[r_idx] not in [' ', '\n']: 20 | r_idx += 1 21 | return s[l_idx: r_idx] 22 | 23 | 24 | def rfind_assign_float(s, mark): 25 | return float(rfind_assign(s, mark)) 26 | 27 | 28 | def rfind_assign_int(s, mark): 29 | return int(rfind_assign(s, mark)) 30 | 31 | 32 | def table_try_float(table): 33 | for i in range(len(table)): 34 | for j in range(len(table[i])): 35 | try: 36 | table[i][j] = float(table[i][j]) 37 | except: 38 | pass 39 | return table 40 | -------------------------------------------------------------------------------- /are_16_heads/itp/run_itp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TASK=$1 4 | OPTIONS="${@:2}" 5 | 6 | function distributed_launch() { 7 | python -m torch.distributed.launch --nproc_per_node 4 ./run_classifier.py \ 8 | --normalize_pruning_by_layer \ 9 | --do_prune \ 10 | --eval_pruned \ 11 | --actually_prune \ 12 | --data_dir /mnt/data/EdgeDL/imagenet2012 \ 13 | --eval_batch_size 500 \ 14 | --at_least_x_heads_per_layer 1 \ 15 | --num_workers 8 \ 16 | --use_huggingface_trainer \ 17 | $OPTIONS 18 | } 19 | 20 | function iterative_pruning_base() { 21 | ./itp/run_itp.sh distributed_launch \ 22 | --deit_type base \ 23 | --prune_number `seq 0 4 132` \ 24 | --exact_pruning \ 25 | --train_batch_size 64 \ 26 | --n_retrain_epochs_after_pruning 3 \ 27 | --retrain_learning_rate 0.000025 \ 28 | --output_dir /mnt/data/EdgeDL/are16heads_results/iterative/base 29 | } 30 | 31 | function finetune_many_base() { 32 | python -m torch.distributed.launch --nproc_per_node 4 finetune_many.py \ 33 | --data_dir /mnt/data/EdgeDL/imagenet2012 \ 34 | --model_path /mnt/data/EdgeDL/are16heads_results/iterative/base \ 35 | --output_dir /mnt/data/EdgeDL/are16heads_results/ \ 36 | --finetune_learning_rate 0.000025 \ 37 | 
--n_finetune_epochs_after_pruning 3 \ 38 | --finetune_batch_size 64 39 | } 40 | $1 "" -------------------------------------------------------------------------------- /modeling/layers/attention.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from einops.layers.tensorflow import Rearrange 3 | 4 | 5 | class Attention(tf.keras.Model): 6 | def __init__(self, dim, num_heads, h_k=None): 7 | if h_k is None: 8 | if dim % num_heads != 0: 9 | raise ValueError(f'hidden_size {dim} must be a multiple of num_heads {num_heads}.') 10 | self.h_k = dim // num_heads 11 | else: 12 | self.h_k = h_k 13 | super().__init__() 14 | self.num_heads = num_heads 15 | self.scale = self.h_k ** -0.5 16 | 17 | self.to_qkv = tf.keras.layers.Dense(self.num_heads * self.h_k * 3, use_bias=False) 18 | self.to_out = tf.keras.layers.Dense(dim) 19 | 20 | self.rearrange_qkv = Rearrange('b n (qkv h d) -> qkv b h n d', qkv = 3, h = self.num_heads) 21 | self.rearrange_out = Rearrange('b h n d -> b n (h d)') 22 | 23 | def call(self, x): 24 | qkv = self.to_qkv(x) 25 | qkv = self.rearrange_qkv(qkv) 26 | q = qkv[0] 27 | k = qkv[1] 28 | v = qkv[2] 29 | 30 | dots = tf.einsum('bhid,bhjd->bhij', q, k) * self.scale 31 | attn = tf.nn.softmax(dots, axis=-1) 32 | 33 | out = tf.einsum('bhij,bhjd->bhid', attn, v) 34 | out = self.rearrange_out(out) 35 | out = self.to_out(out) 36 | return out -------------------------------------------------------------------------------- /benchmark/ADBConnect.py: -------------------------------------------------------------------------------- 1 | import subprocess,re 2 | 3 | class ADBConnect: 4 | def __init__(self, serial=None): 5 | devices = subprocess.check_output(f'adb devices', shell=True).decode('utf-8') 6 | device_list = re.findall(r'([a-zA-Z0-9]+)[^\w]*([a-zA-Z0-9]+)', devices.split('List of devices attached')[-1]) 7 | if serial == None: 8 | if len(device_list) == 0: 9 | raise FileNotFoundError 10 | else: 11 | self.serial = device_list[0][0] 12 | print(f'Device {self.serial} selected.') 13 | else: 14 | for device in device_list: 15 | if serial == device[0]: 16 | self.serial = serial 17 | print(f'Device {self.serial} selected.') 18 | return 19 | raise FileNotFoundError 20 | 21 | def push_files(self, src, dst): 22 | subprocess.check_output(f'adb -s {self.serial} push {src} {dst}', shell=True) 23 | 24 | def pull_files(self, src, dst): 25 | subprocess.check_output(f'adb -s {self.serial} pull {src} {dst}', shell=True) 26 | 27 | def run_cmd(self, cmd, no_root=False): 28 | #print(self.serial) 29 | results = subprocess.check_output(f'adb -s {self.serial} shell {"su -c" if not no_root else ""} {cmd}', shell=True).decode('utf-8') 30 | #print(results) 31 | #latency=get_avg_latency(results) 32 | #print(latency) 33 | 34 | return results -------------------------------------------------------------------------------- /are_16_heads/classifier_scoring.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class BaseClassifierScorer(object): 5 | _name = "base" 6 | 7 | def __call__(self, predictions, labels): 8 | raise NotImplementedError() 9 | 10 | @property 11 | def name(cls): 12 | return cls._name 13 | 14 | 15 | class Accuracy(BaseClassifierScorer): 16 | _name = "Accuracy" 17 | 18 | def __call__(self, predictions, labels): 19 | return (predictions == labels).mean() 20 | 21 | 22 | class F1(BaseClassifierScorer): 23 | _name = "F-1 score" 24 | 25 | def __call__(self, predictions, labels): 26 | # 
True positives 27 | tp = np.logical_and(predictions == 1, labels == 1).sum() 28 | # Precision 29 | P = tp / (predictions == 1).sum() 30 | # Recall 31 | R = tp / (labels == 1).sum() 32 | # F-score 33 | return 2 * P * R / (P + R) 34 | 35 | 36 | class Matthews(BaseClassifierScorer): 37 | _name = "Matthew's correlation" 38 | 39 | def __call__(self, predictions, labels): 40 | # True/False positives/negatives 41 | tp = np.logical_and(predictions == 1, labels == 1).sum() 42 | fp = np.logical_and(predictions == 1, labels == 0).sum() 43 | tn = np.logical_and(predictions == 0, labels == 0).sum() 44 | fn = np.logical_and(predictions == 0, labels == 1).sum() 45 | # Correlation coefficient 46 | m = (tp * tn) - (fp * fn) 47 | m /= np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) + 1e-20 48 | 49 | return m 50 | -------------------------------------------------------------------------------- /deit_pruning/vendor/nn_pruning_v1/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | 4 | def readme(): 5 | with open("README.md") as f: 6 | return f.read() 7 | 8 | extras = { 9 | "tests": ["pytest"], 10 | "examples": ["numpy>=1.2.0", "datasets>=1.4.1", "ipywidgets>=7.6.3", "matplotlib>=3.3.4", "pandas>=1.2.3"], 11 | } 12 | 13 | def combine_requirements(base_keys): 14 | return list(set(k for v in base_keys for k in extras[v])) 15 | 16 | extras["dev"] = combine_requirements([k for k in extras if k != "examples"]) 17 | 18 | 19 | setup( 20 | name="nn_pruning", 21 | version="0.1.2", 22 | description="nn_pruning is a python package for pruning PyTorch models.", 23 | long_description="nn_pruning is a python package for pruning PyTorch models.", 24 | classifiers=[ 25 | "Development Status :: 3 - Alpha", 26 | "License :: OSI Approved :: MIT License", 27 | "Programming Language :: Python :: 3.0", 28 | "Topic :: Text Processing", 29 | ], 30 | keywords="", 31 | url="", 32 | author="", 33 | author_email="", 34 | license="MIT", 35 | packages=["nn_pruning", "nn_pruning.modules"], 36 | install_requires=["click", "transformers>=4.3.0", "torch>=1.6", "scikit-learn>=0.24"], 37 | extras_require=extras, 38 | test_suite="nose.collector", 39 | tests_require=["nose", "nose-cover3"], 40 | entry_points={ 41 | "console_scripts": ["nn_pruning_run_example=examples.command_line:main"], 42 | }, 43 | include_package_data=True, 44 | zip_safe=False, 45 | ) 46 | -------------------------------------------------------------------------------- /deit_pruning/src/preprocessing/random_select.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | from ..utils import set_random 4 | 5 | # import numpy as np 6 | import random 7 | 8 | def output(train_filename, output_filename, idx): 9 | global_idx = 0 10 | idx_ptr = 0 11 | fout = open(output_filename, "w") 12 | with open(train_filename) as f: 13 | for line in f: 14 | if global_idx == idx[idx_ptr]: 15 | fout.write(line) 16 | idx_ptr += 1 17 | global_idx += 1 18 | fout.close() 19 | 20 | 21 | if __name__ == "__main__": 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument("--train_filename", required=True, type=Path) 24 | parser.add_argument("--output_filename", required=True, type=Path) 25 | parser.add_argument("--train_lcnt", required=True, type=int), 26 | parser.add_argument("--ratio", required=True, type=float) 27 | parser.add_argument("--seed", type=int, default=12345) 28 | args = parser.parse_args() 29 | # python -m 
src.preprocessing.random_select --train_filename ../../swiftBertData/data.tsv --output_filename data/train_subset_0.02.tsv --train_lcnt 1573820370 --ratio 0.02 30 | ## new dataset: 2000000007 31 | assert args.train_filename != args.output_filename 32 | set_random(args.seed) 33 | 34 | selected_lcnt = int(args.train_lcnt * args.ratio) 35 | print(f"Select {selected_lcnt} / {args.train_lcnt}") 36 | 37 | selected_idx = sorted(random.sample(range(args.train_lcnt), selected_lcnt)) 38 | selected_idx.append(-1) 39 | 40 | output(train_filename=args.train_filename, output_filename=args.output_filename, idx=selected_idx) 41 | -------------------------------------------------------------------------------- /modeling/models/squeezenet.py: -------------------------------------------------------------------------------- 1 | # from https://medium.com/@sumeetbadgujar/squeezenet-implementation-in-tensorflow-7949d795e84e 2 | 3 | import tensorflow as tf 4 | from tensorflow.keras.layers import Conv2D, ReLU, concatenate, Input, MaxPool2D, Dropout, AvgPool2D 5 | from tensorflow.keras import Model 6 | 7 | 8 | def fire_module(x, s1, e1, e3): 9 | s1x = Conv2D(s1, kernel_size=1, padding='same')(x) 10 | s1x = ReLU()(s1x) 11 | e1x = Conv2D(e1, kernel_size=1, padding='same')(s1x) 12 | e3x = Conv2D(e3, kernel_size=3, padding='same')(s1x) 13 | x = concatenate([e1x, e3x]) 14 | x = ReLU()(x) 15 | return x 16 | 17 | 18 | def SqueezeNet(image_size=[224, 224, 3], nclasses=1000, batch_size=1): 19 | input = Input(image_size, batch_size) 20 | x = Conv2D(96, kernel_size=(7, 7), strides=(2, 2), padding='same')(input) 21 | x = MaxPool2D(pool_size=(3, 3), strides=(2, 2))(x) 22 | x = fire_module(x, s1=16, e1=64, e3=64) # 2 23 | x = fire_module(x, s1=16, e1=64, e3=64) # 3 24 | x = fire_module(x, s1=32, e1=128, e3=128) # 4 25 | x = MaxPool2D(pool_size=(3, 3), strides=(2, 2))(x) 26 | x = fire_module(x, s1=32, e1=128, e3=128) # 5 27 | x = fire_module(x, s1=48, e1=192, e3=192) # 6 28 | x = fire_module(x, s1=48, e1=192, e3=192) # 7 29 | x = fire_module(x, s1=64, e1=256, e3=256) # 8 30 | x = MaxPool2D(pool_size=(3, 3), strides=(2, 2))(x) 31 | x = fire_module(x, s1=64, e1=256, e3=256) # 9 32 | x = Dropout(0.5)(x) 33 | x = Conv2D(nclasses, kernel_size=1)(x) 34 | output = AvgPool2D(pool_size=(13, 13))(x) 35 | model = Model(input, output) 36 | return model 37 | -------------------------------------------------------------------------------- /modeling/layers/tf1_layers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def fc_layer(_input, out_units, opname='',use_bias=False, param_initializer=None): 4 | features_total = int(_input.get_shape()[-1]) 5 | if not param_initializer: 6 | param_initializer = {} 7 | with tf.compat.v1.variable_scope(opname+'.fc'): 8 | init_key = '%s/weight' % tf.get_variable_scope().name 9 | initializer = param_initializer.get(init_key, tf.contrib.layers.xavier_initializer()) 10 | weight = tf.compat.v1.get_variable(name='weight', shape=[features_total, out_units],initializer=initializer) 11 | output = tf.matmul(_input, weight) 12 | if use_bias: 13 | init_key = '%s/bias' % tf.get_variable_scope().name 14 | initializer = param_initializer.get(init_key, tf.constant_initializer([0.0] * out_units)) 15 | bias = tf.get_variable(name='bias', shape=[out_units],initializer=initializer) 16 | output = output + bias 17 | return output 18 | 19 | def gelu(_input, opname=''): 20 | import math 21 | with tf.compat.v1.variable_scope(opname + '.' 
+ 'gelu'): 22 | cdf = 0.5 * (1.0 + tf.tanh( 23 | (math.sqrt(2 / math.pi) * (_input + 0.044715 * tf.pow(_input, 3))))) 24 | return _input * cdf 25 | 26 | def ffn(_input, intermediate_size, opname=''): 27 | h = int(_input.get_shape()[-1]) 28 | with tf.compat.v1.variable_scope(opname + '.' + 'ffn'): 29 | x = fc_layer(_input, intermediate_size, use_bias=True, opname='dense1') 30 | x = gelu(x) 31 | x = fc_layer(x, h, use_bias=True, opname='dense2') 32 | return x 33 | -------------------------------------------------------------------------------- /deit_pruning/src/inspector/get_sparsity.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import argparse 3 | from nn_pruning.inference_model_patcher import optimize_model 4 | 5 | def show(model, skip_embedding=False, skip_layernorm=False, skip_bias=False): 6 | print("Params:", model.num_parameters()) 7 | zero_param_cnt = 0 8 | param_numel = 0 9 | 10 | for k, v in model.named_parameters(): 11 | if skip_embedding and "embedding" in k: 12 | continue 13 | if skip_layernorm and "LayerNorm" in k: 14 | continue 15 | if skip_bias and "bias" in k: 16 | continue 17 | zero_mask = v == 0 18 | 19 | with torch.no_grad(): 20 | print(k, float(zero_mask.sum() / zero_mask.numel()), int(zero_mask.sum()), zero_mask.shape, sep='\t') 21 | zero_param_cnt += zero_mask.sum().item() 22 | param_numel += zero_mask.numel() 23 | 24 | print("Zero params:", zero_param_cnt) 25 | #print("Params (for):", param_numel) 26 | 27 | if __name__ == "__main__": 28 | from ..model import SwiftBERT 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument("--deit_model_name", type=str) 31 | parser.add_argument("--nn_pruning", action='store_true') 32 | parser.add_argument("--skip_embedding", action='store_true') 33 | parser.add_argument("--skip_layernorm", action='store_true') 34 | parser.add_argument("--skip_bias", action='store_true') 35 | 36 | args = parser.parse_args() 37 | # python -m src.inspector.get_sparsity --deit_model_name ./results/playground/final 38 | 39 | model = SwiftBERT.from_pretrained(args.deit_model_name) 40 | if args.nn_pruning: 41 | model = optimize_model(model, "dense") 42 | 43 | show(model, skip_embedding=args.skip_embedding, skip_layernorm=args.skip_layernorm, skip_bias=args.skip_bias) 44 | -------------------------------------------------------------------------------- /benchmark/run_on_device.py: -------------------------------------------------------------------------------- 1 | from .bench_utils import* 2 | import os 3 | 4 | def run_on_android(modelpath, adb, use_gpu=False, num_threads=1, num_runs=10, warmup_runs=10, skip_push=False, 5 | taskset_mask='70', benchmark_binary_dir='/data/local/tmp', bin_name='benchmark_model_plus_flex_r27', no_root=False, use_xnnpack=False, 6 | profiling_output_csv_file=None): 7 | if not skip_push: 8 | #=======Push to device=========== 9 | adb.push_files(modelpath, '/sdcard/') 10 | model_name=modelpath.split('/')[-1] 11 | if benchmark_binary_dir[-1] == '/': 12 | benchmark_binary_dir = benchmark_binary_dir[:-1] 13 | benchmark_binary_path = f'{benchmark_binary_dir}/{bin_name}' 14 | 15 | command = f'taskset {taskset_mask} {benchmark_binary_path} --num_threads={num_threads} {"--use_gpu=true" if use_gpu else ""} ' 16 | command += f'--num_runs={num_runs} --warmup_runs={warmup_runs} {"--use_xnnpack=true" if use_xnnpack else "--use_xnnpack=false"} --graph=/sdcard/{model_name} ' 17 | command += f'--enable_op_profiling=true 
--profiling_output_csv_file=/sdcard/{os.path.basename(profiling_output_csv_file)} ' if profiling_output_csv_file else '' 18 | print(command) 19 | 20 | bench_str = adb.run_cmd(command, no_root=no_root) 21 | std_ms, avg_ms, mem_mb = fetech_tf_bench_results(bench_str) 22 | 23 | if not skip_push: 24 | #=======Clear device files======= 25 | adb.run_cmd(f'rm -rf /sdcard/{model_name}', no_root=no_root) 26 | 27 | if profiling_output_csv_file: 28 | adb.pull_files(src=f'/sdcard/{os.path.basename(profiling_output_csv_file)}', dst=profiling_output_csv_file) 29 | print(f'Save profiling output csv file in {profiling_output_csv_file}') 30 | return std_ms, avg_ms, mem_mb -------------------------------------------------------------------------------- /are_16_heads/prepare_task.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | TASK=$1 4 | EVAL=${EVAL:-"1"} 5 | TRAIN_OPTIONS="${@:2}" 6 | FEATURE_MODE=${FEATURE_MODE:-0} 7 | NODROP_MODE=${NODROP_MODE:-0} 8 | 9 | 10 | prefix="$TASK" 11 | if [ "$FEATURE_MODE" -eq "1" ] 12 | then 13 | prefix="${TASK}-feature" 14 | TRAIN_OPTIONS="$TRAIN_OPTIONS --feature_mode" 15 | fi 16 | if [ "$NODROP_MODE" -eq "1" ] 17 | then 18 | prefix="${TASK}-nodrop" 19 | TRAIN_OPTIONS="$TRAIN_OPTIONS --attn_dropout 0.0" 20 | fi 21 | 22 | mkdir -p models 23 | model_dir=models/$prefix 24 | mkdir -p $model_dir 25 | 26 | function run_train () { 27 | python pytorch-pretrained-BERT/examples/run_classifier.py $TRAIN_OPTIONS \ 28 | --task_name $TASK \ 29 | --do_train \ 30 | --do_lower_case \ 31 | --data_dir glue_data/$TASK/ \ 32 | --bert_model bert-base-uncased \ 33 | --max_seq_length 128 \ 34 | --train_batch_size 32 \ 35 | --eval_batch_size 32 \ 36 | --learning_rate 2e-5 \ 37 | --num_train_epochs 3.0 \ 38 | --output_dir $model_dir 2>&1 39 | } 40 | 41 | function run_eval () { 42 | python pytorch-pretrained-BERT/examples/run_classifier.py \ 43 | --task_name $TASK \ 44 | --do_eval \ 45 | --do_lower_case \ 46 | $1 \ 47 | --data_dir glue_data/$TASK/ \ 48 | --bert_model bert-base-uncased \ 49 | --max_seq_length 128 \ 50 | --eval_batch_size 32 \ 51 | --output_dir $model_dir 2>&1 52 | } 53 | 54 | if [ ! 
-e $model_dir/pytorch_model.bin ] 55 | then 56 | run_train 57 | fi 58 | 59 | metric="eval_accuracy" 60 | if [ $TASK == "CoLA" ] 61 | then 62 | metric="Matthew" 63 | elif [ $TASK == "MRPC" ] 64 | then 65 | metric="F-1" 66 | fi 67 | 68 | if [ "$EVAL" = "1" ] 69 | then 70 | run_eval "" 71 | base_acc=$(run_eval "" | grep $metric | rev | cut -d" " -f1 | rev) 72 | fi 73 | -------------------------------------------------------------------------------- /modeling/torch_layers/attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class Attention(nn.Module): 5 | # reference huggingface ViTSelfAttention 6 | def __init__(self, hidden_size, num_heads, head_size=None): 7 | if head_size is None: 8 | if hidden_size % num_heads != 0: 9 | raise ValueError(f'hidden_size {hidden_size} must be a multiple of num_heads {num_heads}.') 10 | self.head_size = hidden_size // num_heads 11 | else: 12 | self.head_size = head_size 13 | 14 | super().__init__() 15 | self.num_heads = num_heads 16 | self.scale = self.head_size ** -0.5 17 | self.hidden_size = hidden_size 18 | 19 | self.to_query = nn.Linear(in_features=hidden_size, out_features=self.num_heads * self.head_size) 20 | self.to_key = nn.Linear(in_features=hidden_size, out_features=self.num_heads * self.head_size) 21 | self.to_value = nn.Linear(in_features=hidden_size, out_features=self.num_heads * self.head_size) 22 | self.to_out = nn.Linear(in_features=self.num_heads * self.head_size, out_features=hidden_size) 23 | 24 | def transpose_for_scores(self, x): 25 | new_shape = x.size()[:-1] + (self.num_heads, self.head_size) 26 | x = x.view(*new_shape) 27 | return x.permute(0, 2, 1, 3) 28 | 29 | def forward(self, x): 30 | mixed_query = self.to_query(x) 31 | 32 | key = self.transpose_for_scores(self.to_key(x)) 33 | value = self.transpose_for_scores(self.to_value(x)) 34 | query = self.transpose_for_scores(mixed_query) 35 | 36 | attention_scores = torch.matmul(query, key.transpose(-1, -2)) 37 | attention_scores = attention_scores * self.scale 38 | 39 | attention_probs = nn.Softmax(dim=-1)(attention_scores) 40 | 41 | context = torch.matmul(attention_probs, value) 42 | context = context.permute(0, 2, 1, 3).contiguous() 43 | 44 | next_shape = context.size()[:-2] + (self.num_heads * self.head_size,) 45 | context = context.view(*next_shape) 46 | context = self.to_out(context) 47 | 48 | return context -------------------------------------------------------------------------------- /deit_pruning/src/onnx_inference.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
--------------------------------------------------------------------------------
/deit_pruning/src/onnx_inference.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | 
18 | import argparse
19 | import collections
20 | import logging
21 | import json
22 | import math
23 | import os
24 | import random
25 | import pickle
26 | 
27 | import time
28 | import numpy as np
29 | import torch
30 | 
31 | from pathlib import Path
32 | 
33 | def main():
34 |     parser = argparse.ArgumentParser()
35 |     parser.add_argument('--batch_size', type=int, default=1)
36 |     parser.add_argument('--model_file', type=Path)
37 |     parser.add_argument('--profile', required=False, action='store_true', help='Enable layer profiling (JSON output)')
38 |     parser.add_argument('--extra', required=False, type=str, default=None)
39 |     parser.add_argument('--seed', type=int, default=12345)
40 |     parser.add_argument('--seq_len', type=int, default=38)
41 |     # python src/onnx_inference.py --model_file ./results/dummy_mini/final/output.onnx
42 | 
43 |     args = parser.parse_args()
44 |     print(args)
45 | 
46 |     # random.seed(args.seed)
47 |     # np.random.seed(args.seed)
48 |     # torch.manual_seed(args.seed)
49 | 
50 |     profile_arg = "--profile" if args.profile else ""
51 |     extra_arg = args.extra if args.extra else ""
52 | 
53 |     perf_script = os.path.join("vendor/onnx_scripts", "bert_perf_test.py")
54 | 
55 |     os.system(f'python {perf_script} {profile_arg} --model "{args.model_file}" --batch_size {args.batch_size} --sequence_length {args.seq_len} --seed {args.seed} {extra_arg}')
56 | 
57 | 
58 | if __name__ == "__main__":
59 |     main()
60 | 
61 | 
--------------------------------------------------------------------------------
/deit_pruning/src/pytorch_prune/ln_smart.py:
--------------------------------------------------------------------------------
1 | from torch.nn.utils import prune
2 | from torch.nn.utils.prune import (
3 |     _validate_pruning_amount_init,
4 |     _validate_structured_pruning,
5 |     _compute_nparams_toprune,
6 |     _validate_pruning_amount
7 | )
8 | import torch
9 | 
10 | 
11 | class LnSmartStructured(prune.BasePruningMethod):
12 |     PRUNING_TYPE = "1d"
13 | 
14 |     def __init__(self, amount, n=1):
15 |         _validate_pruning_amount_init(amount)
16 |         self.amount = amount
17 |         self.ord = n
18 |         if n != 1:
19 |             print("WARN: LnSmartStructured is only verified in norm ord=1!")
20 | 
21 |     def make_mask(self, t, dim, indices):
22 |         # Modified from pytorch LnStructured.make_mask
23 |         # init mask to 1
24 |         mask = torch.ones_like(t)
25 |         # e.g.: slc = [None, None, None], if len(t.shape) = 3
26 |         slc = [slice(None)] * len(t.shape)
27 |         # replace a None at position=dim with indices
28 |         # e.g.: slc = [None, None, [0, 2, 3]] if dim=2 & indices=[0,2,3]
29 |         slc[dim] = indices
30 |         # use slc to slice mask and replace all its entries with 0s
31 |         # e.g.: mask[:, :, [0, 2, 3]] = 0
32 |         mask[slc] = 0
33 |         return mask
34 | 
35 |     def compute_mask(self, t, default_mask):
36 |         # _validate_structured_pruning(t)
37 |         assert len(t.shape) == 2
38 |         rows = t.shape[0]
39 |         cols = t.shape[1]
40 | 
41 |         # 1. Calculate whether to prune row (dim=0) or col (dim=1)
42 |         prune_row = False
43 |         test_nparams_toprune = _compute_nparams_toprune(self.amount, min(rows, cols))
44 |         _validate_pruning_amount(test_nparams_toprune, min(rows, cols))
45 |         row_norm_sum = torch.topk(torch.linalg.norm(t, dim=1, ord=self.ord), k=test_nparams_toprune, largest=False).values.sum() / (cols ** (1 / self.ord))  # Is it right to avoid bias between the two norm values?
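        # Note: the division by cols ** (1 / ord) here (and by rows ** (1 / ord) below)
        # rescales each norm by the number of entries it aggregates, so the row and
        # column candidate scores stay comparable when the matrix is not square.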
46 |         col_norm_sum = torch.topk(torch.linalg.norm(t, dim=0, ord=self.ord), k=test_nparams_toprune, largest=False).values.sum() / (rows ** (1 / self.ord))
47 |         # print(row_norm_sum, col_norm_sum)
48 |         if col_norm_sum >= row_norm_sum:
49 |             prune_row = True
50 | 
51 |         # 2. Prune (actually)
52 |         bcnt = rows if prune_row else cols
53 |         nparams_toprune = _compute_nparams_toprune(self.amount, bcnt)
54 |         _validate_pruning_amount(nparams_toprune, bcnt)
55 | 
56 |         mask = default_mask.clone()
57 |         if nparams_toprune != 0:
58 |             indices = torch.topk(torch.linalg.norm(t, dim=1 if prune_row else 0, ord=self.ord), k=nparams_toprune, largest=False).indices  # ord=self.ord keeps this consistent with the norm used in step 1
59 |             mask[self.make_mask(t, 0 if prune_row else 1, indices).to(dtype=mask.dtype) == 0] = 0
60 | 
61 |         return mask
62 | 
63 | 
64 | def ln_smart_structured(module, name, amount, n=1):
65 |     LnSmartStructured.apply(module, name, amount=amount, n=n)
66 |     return module
67 | 
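
# Minimal usage sketch (illustrative addition; layer sizes are arbitrary):
if __name__ == "__main__":
    import torch.nn as nn
    layer = nn.Linear(64, 64)
    ln_smart_structured(layer, "weight", amount=0.3)
    # ~30% of the rows (or columns, whichever side scores lower) are now zeroed
    print((layer.weight == 0).float().mean().item())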
--------------------------------------------------------------------------------
/are_16_heads/fetch_results.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import sys
3 | from pathlib import Path
4 | import os
5 | 
6 | def fetch_accuracy(parser: argparse.ArgumentParser):
7 |     parser.add_argument('--file', '-f', type=str, help='input file to fetch accuracy')
8 |     parser.add_argument('--begin_line', type=int, default=0, help='begin line number')
9 |     parser.add_argument('--end_line', type=int, default=None, help='end line number plus one')
10 |     args = parser.parse_args()
11 | 
12 |     f = open(args.file)
13 |     if args.end_line:
14 |         lines = f.readlines()[args.begin_line: args.end_line]
15 |     else:
16 |         lines = f.readlines()[args.begin_line:]
17 | 
18 | 
19 |     acc_list = []
20 |     for i in range(len(lines)):
21 |         line = lines[i]
22 |         if 'Pruning eval results' in line:
23 |             tokens = lines[i + 1].split()
24 |             acc = round(float(tokens[2]) * 100, 2)
25 |             acc_list.append(acc)
26 |         # if 'Finetuning eval results' in line:
27 |         #     tokens = lines[i + 1].split()
28 |         #     acc = round(float(tokens[3]) * 100, 2)
29 |         #     acc_list.append(acc)
30 | 
31 | 
32 |     print(acc_list)
33 |     return acc_list
34 | 
35 | 
36 | def fetch_accuracy_from_path(parser: argparse.ArgumentParser):
37 |     parser.add_argument('--path', type=Path, help='pruned model directory to fetch accuracy')
38 |     parser.add_argument('--finetuned', action='store_true', help='sub_dir is "final_finetuned" instead of "final"')
39 |     args = parser.parse_args()
40 | 
41 |     final_str = 'final' if not args.finetuned else 'final_finetuned'
42 | 
43 |     model_list = sorted(os.listdir(args.path), key=lambda x: int(x[x.find('prune') + len('prune'): ]))
44 |     acc_list = []
45 |     for model_name in model_list:
46 |         final_dir = args.path / model_name / final_str
47 |         acc_file_name = sorted(os.listdir(final_dir))[0]
48 |         if 'accuracy' not in acc_file_name:
49 |             print(f'Please check the contents of this dir {final_dir}: {os.listdir(final_dir)}')
50 |             exit()
51 | 
52 |         acc = round(int(acc_file_name[len('accuracy'): len('accuracy') + 4]) / 100, 2)
53 |         acc_list.append(acc)
54 | 
55 |     print(acc_list)
56 | 
57 | function_dict = dict(
58 |     fetch_accuracy = fetch_accuracy,
59 |     fetch_accuracy_from_path = fetch_accuracy_from_path,
60 | )
61 | 
62 | 
63 | if __name__ == '__main__':
64 |     parser = argparse.ArgumentParser()
65 |     parser.add_argument('func', help='Specify the function to do.')
66 | 
67 |     assert(len(sys.argv) > 1)
68 |     func = sys.argv[1]
69 | 
70 |     if func not in function_dict.keys():
71 |         print('Supported functions: ', list(function_dict.keys()))
72 |         exit()
73 | 
74 |     function_dict[func](parser)
75 | 
--------------------------------------------------------------------------------
/deit_pruning/src/pytorch_prune/block.py:
--------------------------------------------------------------------------------
1 | from torch.nn.utils import prune
2 | from torch.nn.utils.prune import (
3 |     _validate_pruning_amount_init,
4 |     _validate_structured_pruning,
5 |     _compute_nparams_toprune,
6 |     _validate_pruning_amount
7 | )
8 | import torch
9 | 
10 | 
11 | class BlockPruningMethod(prune.BasePruningMethod):
12 |     # Well, pytorch thinks that structured pruning shall have a 'dim' attr
13 |     # and unstructured pruning shall accept 1-d tensors.
14 |     # Block pruning satisfies neither of these conditions, so I set PRUNING_TYPE to this value to avoid misuse.
15 |     PRUNING_TYPE = "block"
16 | 
17 |     def __init__(self, amount, block_row, block_col, n='fro'):
18 |         _validate_pruning_amount_init(amount)
19 |         self.amount = amount
20 |         self.block_row = block_row
21 |         self.block_col = block_col
22 |         self.ord = n
23 | 
24 |     def get_block_view(self, matrix):
25 |         rows = matrix.shape[0]
26 |         cols = matrix.shape[1]
27 | 
28 |         assert rows % self.block_row == 0
29 |         assert cols % self.block_col == 0
30 | 
31 |         brows = rows // self.block_row
32 |         bcols = cols // self.block_col
33 |         bcnt = brows * bcols
34 | 
35 |         def subview(idx):
36 |             rstart = idx // bcols * self.block_row
37 |             rend = (idx // bcols + 1) * self.block_row
38 |             cstart = idx % bcols * self.block_col
39 |             cend = (idx % bcols + 1) * self.block_col
40 | 
41 |             return matrix[rstart:rend, cstart:cend]
42 | 
43 |         blocks = [subview(idx) for idx in range(bcnt)]
44 | 
45 |         return torch.stack(blocks)
46 | 
47 |     def compute_mask(self, t, default_mask):
48 |         # _validate_structured_pruning(t)
49 |         assert len(t.shape) == 2
50 |         rows = t.shape[0]
51 |         cols = t.shape[1]
52 |         assert rows % self.block_row == 0
53 |         assert cols % self.block_col == 0
54 |         brows = rows // self.block_row
55 |         bcols = cols // self.block_col
56 |         bcnt = brows * bcols
57 | 
58 |         nparams_toprune = _compute_nparams_toprune(self.amount, bcnt)
59 |         _validate_pruning_amount(nparams_toprune, bcnt)
60 | 
61 |         mask = default_mask.clone()
62 |         if nparams_toprune != 0:
63 |             block_view = self.get_block_view(t)
64 |             norms = torch.linalg.norm(block_view, ord=self.ord, dim=(1, 2))
65 |             indices = torch.topk(norms, k=nparams_toprune, largest=False).indices
66 |             this_mask = torch.ones((brows, bcols), device=t.device)  # create on t's device so boolean indexing works for GPU tensors
67 |             this_mask.view(-1)[indices] = 0
68 |             this_mask = torch.repeat_interleave(this_mask, self.block_row, dim=0)
69 |             this_mask = torch.repeat_interleave(this_mask, self.block_col, dim=1)
70 |             mask[this_mask == 0] = 0
71 |         return mask
72 | 
73 | 
74 | def block_pruning(module, name, amount, block_row, block_col, n='fro'):
75 |     BlockPruningMethod.apply(module, name, amount=amount, block_row=block_row, block_col=block_col, n=n)
76 |     return module
77 | 
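
# Minimal usage sketch (illustrative addition; sizes are arbitrary): remove the 50%
# of 4x4 blocks with the smallest Frobenius norm from one linear layer.
if __name__ == "__main__":
    import torch.nn as nn
    layer = nn.Linear(64, 64)
    block_pruning(layer, "weight", amount=0.5, block_row=4, block_col=4)
    print((layer.weight == 0).float().mean().item())  # -> 0.5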
sh does not have "source", so use ". ./xxx.sh" here. 20 | setup: 21 | - . ./itp/setup.sh 22 | 23 | code: 24 | # upload the code 25 | local_dir: $CONFIG_DIR/../../ 26 | 27 | storage: 28 | teamdrive: 29 | storage_account_name: hexnas 30 | container_name: teamdrive 31 | mount_dir: /mnt/data 32 | local_dir: $CONFIG_DIR/../../../faketeamdrive/ 33 | 34 | 35 | jobs: 36 | {jobs} 37 | """ 38 | 39 | job_template = \ 40 | """- name: {job_name} 41 | sku: G{gpu_cnt} 42 | command: 43 | - ./itp/run_itp.sh {function} 44 | """ 45 | 46 | func_to_job_name_dict = { 47 | 'iterative_pruning_base': 'D1009_are16heads_iterative_pruning_deit_base', 48 | 'finetune_many_base': 'D1013_are16heads_finetune_pruned_deit_base' 49 | } 50 | def main(mode): 51 | function = sys.argv[2] 52 | assert function in func_to_job_name_dict.keys() 53 | 54 | job_name = func_to_job_name_dict[function] # !! Edit this 55 | jobs = "" 56 | jobs += job_template.format( 57 | job_name=job_name, gpu_cnt=4, function=function 58 | ) 59 | description = f"EdgeDL Are16heads exp ({job_name})" 60 | 61 | # ====================================================================================================== 62 | # Don't need to modify following code 63 | result = template.format( 64 | job_name=job_name, 65 | jobs=jobs, 66 | ) 67 | print(result) 68 | 69 | tmp_name = ''.join(random.choices(string.ascii_lowercase, k=6)) + job_name 70 | tmp_name = os.path.join("./.tmp", tmp_name) 71 | with open(tmp_name, "w") as fout: 72 | fout.write(result) 73 | if mode == 0: 74 | subprocess.run(["amlt", "run", "-t", "local", "--use-sudo", tmp_name, "--devices", "all"]) 75 | input() 76 | elif mode == 1: 77 | subprocess.run(["amlt", "run", "-d", description, tmp_name, job_name]) 78 | 79 | 80 | if __name__ == "__main__": 81 | # example: python xx.py submit tiny 50 82 | # tiny (sys.argv[2]) is deit_type 83 | # 50 (sys.argv[2]) is sparsity 84 | if len(sys.argv) == 2 and sys.argv[1] in ('--help', '-h'): 85 | print('Example cmd: python submit iterative_pruning_base') 86 | exit() 87 | mode = 2 88 | if len(sys.argv) == 3 and sys.argv[1] == 'submit': 89 | print("Submit (pt run)") 90 | mode = 1 91 | elif len(sys.argv) == 3 and sys.argv[1] == 'debug': 92 | print("Debug dry run (pt run -t local)") 93 | mode = 0 94 | else: 95 | print("Print only") 96 | 97 | main(mode) 98 | -------------------------------------------------------------------------------- /experiments/D1130_vino_quant_cnn_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import subprocess 5 | 6 | sys.path.insert(0, f'{os.path.dirname(sys.argv[0])}/..') 7 | from benchmark.openvino.vino_cli import openvino_benchmark 8 | 9 | MODEL_LIST = [ 10 | 'mobilenet-v1-1.0-224-tf', 11 | 'mobilenet-v2-1.0-224', 12 | 'shufflenet-v2-x1.0', 13 | 'inception-resnet-v2-tf', 14 | 'efficientnet-b0', 15 | 'resnet-34-pytorch', 16 | 'resnet-50-tf', 17 | ] 18 | 19 | 20 | class ModelExporter: 21 | def __init__(self, model_list, download_model_dir, ir_model_dir): 22 | self.model_list = model_list 23 | self.download_model_dir = download_model_dir 24 | self.ir_model_dir = ir_model_dir 25 | pass 26 | 27 | def download(self): 28 | subprocess.run( 29 | f'python $VINO_DOWNLOADER --name={",".join(self.model_list)} --precisions=FP32 --output_dir={self.download_model_dir}', shell=True) 30 | 31 | def convert(self): 32 | subprocess.run( 33 | f'python $VINO_CONVERTER --name={",".join(self.model_list)} --precisions=FP32 --download_dir={self.download_model_dir} 
34 | 
35 |     def quantize(self):
36 |         subprocess.run(
37 |             f'python $VINO_QUANTER --name={",".join(self.model_list)} --model_dir={self.ir_model_dir} --dataset_dir=datasets/imagenet2012 --output_dir={self.ir_model_dir} --precisions=FP32-INT8', shell=True)
38 | 
39 |     def benchmark(self):
40 |         print('========== Benchmarking model performance on CPU with 1 thread')
41 |         latency_list_fp32 = []
42 |         latency_list_int8 = []
43 | 
44 |         for name in self.model_list:
45 |             print(f'===== Testing {name} =====')
46 |             model_path_fp32 = os.path.join(self.ir_model_dir, 'public', name, 'FP32', f'{name}.xml')
47 |             model_path_int8 = os.path.join(self.ir_model_dir, 'public', name, 'FP32-INT8', f'{name}.xml')
48 | 
49 |             latency_fp32 = openvino_benchmark('$VINO_BENCHMARK_APP', model_path_fp32, niter=30, num_threads=1, batch_size=1)
50 |             latency_int8 = openvino_benchmark('$VINO_BENCHMARK_APP', model_path_int8, niter=30, num_threads=1, batch_size=1)
51 | 
52 |             latency_list_fp32.append(latency_fp32)
53 |             latency_list_int8.append(latency_int8)
54 | 
55 |         print('== SUMMARY ==')
56 |         print(self.model_list)
57 |         print('FP32 Latency:')
58 |         print([round(v, 2) for v in latency_list_fp32])
59 |         print('INT8 Latency:')
60 |         print([round(v, 2) for v in latency_list_int8])
61 | 
62 | 
63 | if __name__ == '__main__':
64 |     model_exporter = ModelExporter(
65 |         MODEL_LIST, 'models/vino_model/download', 'models/vino_model/ir')
66 |     model_exporter.download()
67 |     model_exporter.convert()
68 |     model_exporter.quantize()
69 |     model_exporter.benchmark()
--------------------------------------------------------------------------------
/benchmark/tensorrt/calibrator.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | import tensorrt as trt
18 | import os
19 | 
20 | import pycuda.driver as cuda
21 | import pycuda.autoinit
22 | from PIL import Image
23 | import numpy as np
24 | 
25 | class DummyCalibrator(trt.IInt8EntropyCalibrator2):
26 |     def __init__(self, training_data, cache_file='tmp.cache', batch_size=64):
27 |         # Whenever you specify a custom constructor for a TensorRT class,
28 |         # you MUST call the constructor of the parent explicitly.
29 |         trt.IInt8EntropyCalibrator2.__init__(self)
30 | 
31 |         self.cache_file = cache_file
32 | 
33 |         # Every time get_batch is called, the next batch of size batch_size will be copied to the device and returned.
34 |         self.data = training_data
35 |         self.batch_size = batch_size
36 |         self.current_index = 0
37 | 
38 |         # Allocate enough memory for a whole batch.
39 |         self.device_input = cuda.mem_alloc(self.data[0].nbytes * self.batch_size)
40 | 
41 |     def get_batch_size(self):
42 |         return self.batch_size
43 | 
44 |     # TensorRT passes along the names of the engine bindings to the get_batch function.
45 |     # You don't necessarily have to use them, but they can be useful to understand the order of
46 |     # the inputs. The bindings list is expected to have the same ordering as 'names'.
47 |     def get_batch(self, names):
48 |         if self.current_index + self.batch_size > self.data.shape[0]:
49 |             return None
50 | 
51 |         current_batch = int(self.current_index / self.batch_size)
52 |         # if current_batch % 10 == 0:
53 |         #     print("Calibrating batch {:}, containing {:} images".format(current_batch, self.batch_size))
54 | 
55 |         batch = self.data[self.current_index:self.current_index + self.batch_size].ravel()
56 |         cuda.memcpy_htod(self.device_input, batch)
57 |         self.current_index += self.batch_size
58 |         return [self.device_input]
59 | 
60 | 
61 |     def read_calibration_cache(self):
62 |         pass
63 |         # # If there is a cache, use it instead of calibrating again. Otherwise, implicitly return None.
64 |         # if os.path.exists(self.cache_file):
65 |         #     with open(self.cache_file, "rb") as f:
66 |         #         return f.read()
67 | 
68 |     def write_calibration_cache(self, cache):
69 |         # with open(self.cache_file, "wb") as f:
70 |         #     f.write(cache)
71 |         pass
72 | 
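# Wiring sketch (illustrative, not part of this file): attaching the calibrator to a
# TensorRT builder config for INT8 calibration. `builder` is assumed to be an existing
# trt.Builder and `data` a float32 numpy array of preprocessed calibration samples.
#
#     config = builder.create_builder_config()
#     config.set_flag(trt.BuilderFlag.INT8)
#     config.int8_calibrator = DummyCalibrator(data, batch_size=64)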
--------------------------------------------------------------------------------
/deit_pruning/vendor/nn_pruning_v1/nn_pruning/modules/gelu2relu.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from nn_pruning.model_patcher import ModelPatcher
5 | 
6 | 
7 | class GeLU2ReLU(nn.Module):
8 |     # There are two ways to specify how the module will move progressively from a GeLU to a ReLU.
9 |     # If you give a non-None schedule_callback, steps won't be used.
10 |     # It must be a function that returns a dictionary containing at least the key:
11 |     # - mix : moving from 1.0 to 0.0, it is the lerp factor between GeLU and ReLU
12 |     # If you don't specify a schedule_callback, each call to forward will count as a step, and in 'steps' steps
13 |     # it will move from a GeLU to a ReLU.
14 |     def __init__(self,
15 |                  steps = 5000,
16 |                  schedule_callback = None):
17 |         super().__init__()
18 |         self.schedule_callback = schedule_callback
19 | 
20 |         if self.schedule_callback is None:
21 |             self.steps = steps
22 |             self.mix_step = 1 / self.steps
23 |             self.mix = 1.0
24 |         else:
25 |             self.steps = None
26 |             self.mix_step = None
27 |             self.mix = None
28 | 
29 |     def forward(self, batch):
30 |         if self.schedule_callback is not None:
31 |             d = self.schedule_callback()
32 |             mix = d["mix"]
33 |         else:
34 |             if self.training:
35 |                 mix = self.mix
36 |             else:
37 |                 mix = 0
38 | 
39 |         if mix == 0:
40 |             ret = F.relu(batch)
41 |         else:
42 |             g = F.gelu(batch)
43 |             r = F.relu(batch)
44 |             ret = torch.lerp(r, g, mix)
45 | 
46 |         if self.training:
47 |             if self.schedule_callback is None:
48 |                 self.mix = max(0.0, self.mix - self.mix_step)
49 | 
50 |         return ret
51 | 
52 | class GeLU2ReLUModelPatcher(ModelPatcher):
53 |     def __init__(self,
54 |                  steps = 5000,
55 |                  schedule_callback = None):
56 |         super().__init__(all_match=True)
57 |         self.steps = steps
58 |         self.schedule_callback = schedule_callback
59 |         self.names = ["intermediate_act_fn", "transform_act_fn", "activation_fn"]
60 | 
61 |     def is_patchable(self, module_name, module, raiseError):
62 |         for name in self.names:
63 |             if hasattr(module, name):
64 |                 return True
65 |         return False
66 | 
67 |     def new_child_module(self, child_module_name, child_module, patch_info):
68 |         fn = GeLU2ReLU(steps=self.steps,
69 |                        schedule_callback=self.schedule_callback)
70 |         patched = False  # only flipped to True once an activation attribute is actually replaced, so the assert below is meaningful
71 |         for name in 
self.names: 72 | if hasattr(child_module, name): 73 | setattr(child_module, name, fn) 74 | patched = True 75 | 76 | assert(patched) 77 | # We don't return a new child module, we change it in place, we return the module itself to let the patcher update stats 78 | return child_module 79 | 80 | 81 | -------------------------------------------------------------------------------- /deit_pruning/vendor/nn_pruning_v1/nn_pruning/modules/AmpereRework.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 79, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "tensor([[0., 0., 1., 1., 0., 0., 1., 1., 1., 1., 0., 0.],\n", 13 | " [0., 1., 1., 0., 0., 0., 1., 1., 1., 0., 0., 1.],\n", 14 | " [1., 1., 0., 0., 1., 0., 1., 0., 0., 1., 0., 1.],\n", 15 | " [1., 1., 0., 0., 1., 0., 1., 0., 1., 0., 1., 0.],\n", 16 | " [0., 0., 1., 1., 0., 1., 1., 0., 1., 0., 0., 1.],\n", 17 | " [0., 1., 1., 0., 1., 0., 1., 0., 0., 1., 1., 0.],\n", 18 | " [0., 0., 1., 1., 0., 0., 1., 1., 1., 1., 0., 0.],\n", 19 | " [1., 0., 0., 1., 1., 0., 1., 0., 0., 1., 0., 1.],\n", 20 | " [1., 1., 0., 0., 0., 0., 1., 1., 0., 0., 1., 1.],\n", 21 | " [0., 1., 0., 1., 0., 1., 0., 1., 1., 0., 1., 0.],\n", 22 | " [1., 0., 0., 1., 0., 1., 1., 0., 1., 0., 1., 0.]])\n" 23 | ] 24 | } 25 | ], 26 | "source": [ 27 | "import torch\n", 28 | "\n", 29 | "from nn_pruning.modules.binarizer import MagnitudeBinarizer, ThresholdBinarizer, TopKBinarizer\n", 30 | "\n", 31 | "def ampere_mask_threshold(mask_scores, threshold, sigmoid, train):\n", 32 | " assert((mask_scores.shape[1] % 4) == 0)\n", 33 | " \n", 34 | " mask_scores_4 = mask_scores.reshape(mask_scores.shape[0], mask_scores.shape[1] // 4, 4)\n", 35 | " top = torch.topk(mask_scores_4, 2, dim=2, largest=True) \n", 36 | " top_mask = torch.zeros_like(mask_scores_4, device=mask_scores.device) \n", 37 | " top_mask = top_mask.scatter(2, top.indices, True) \n", 38 | " top_mask = top_mask.reshape_as(mask_scores)\n", 39 | " \n", 40 | " if train:\n", 41 | " mask = ThresholdBinarizer.apply(mask_scores, threshold, sigmoid) \n", 42 | " ret = torch.max(mask, top_mask)\n", 43 | " else:\n", 44 | " ret = top_mask\n", 45 | " \n", 46 | " return ret\n", 47 | " \n", 48 | " \n", 49 | "scores = torch.randn(11,12)\n", 50 | "\n", 51 | "mask = ampere_mask_threshold(scores, 0.9, True, False)\n", 52 | "\n", 53 | "assert(mask.shape == scores.shape)\n", 54 | "\n", 55 | "print(mask)\n" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [] 64 | } 65 | ], 66 | "metadata": { 67 | "kernelspec": { 68 | "display_name": "Python 3", 69 | "language": "python", 70 | "name": "python3" 71 | }, 72 | "language_info": { 73 | "codemirror_mode": { 74 | "name": "ipython", 75 | "version": 3 76 | }, 77 | "file_extension": ".py", 78 | "mimetype": "text/x-python", 79 | "name": "python", 80 | "nbconvert_exporter": "python", 81 | "pygments_lexer": "ipython3", 82 | "version": "3.8.5" 83 | } 84 | }, 85 | "nbformat": 4, 86 | "nbformat_minor": 4 87 | } 88 | -------------------------------------------------------------------------------- /deit_pruning/src/latency_model.py: -------------------------------------------------------------------------------- 1 | import json 2 | from sklearn.metrics import mean_squared_error 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.ensemble import 
RandomForestRegressor
5 | import numpy as np
6 | import pickle
7 | 
8 | 
9 | def get_accuracy(y_pred, y_true, threshold=0.01):
10 |     a = (y_true - y_pred) / y_true
11 | 
12 | 
13 |     b = np.where(abs(a) <= threshold)
14 |     return len(b[0]) / len(y_true)
15 | 
16 | 
17 | 
18 | def lat_metrics(y_pred, y_true):
19 |     rmspe = (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true)))) * 100
20 |     rmse = np.sqrt(mean_squared_error(y_pred, y_true))
21 |     acc5 = get_accuracy(y_pred, y_true, threshold=0.05)
22 |     acc10 = get_accuracy(y_pred, y_true, threshold=0.10)
23 |     acc15 = get_accuracy(y_pred, y_true, threshold=0.15)
24 | 
25 | 
26 |     return rmse, rmspe, rmse / np.mean(y_true), acc5, acc10, acc15
27 | 
28 | def get_feature(fe):
29 |     layers = fe.split('-')
30 |     X = []
31 |     for layer in layers:
32 |         items = layer.split('_')
33 |         h = float(items[1])
34 |         d = float(items[-1])
35 |         X.append(h)
36 |         X.append(d)
37 |     return X
38 | 
39 | 
40 | def get_latency(filename):
41 |     X = []
42 |     Y = []
43 |     f1 = open("latency.csv", 'w')
44 |     with open(filename, 'r') as fw:
45 |         dicts = json.load(fw)
46 |     for mid in dicts:
47 |         # print(mid, dicts[mid])
48 |         fe = mid.split('\\')[-1].replace(".onnx", "")
49 |         data = dicts[mid]
50 |         items = data.split('\r\n')
51 |         # print(items)
52 |         avg = float(items[-3].split(': ')[-1].replace(" us", ""))
53 |         x = get_feature(fe)
54 |         print(fe, avg)
55 |         X.append(x)
56 |         Y.append(avg)
57 |         f1.write(fe + ',' + str(avg) + '\n')
58 |     return X, Y
59 | 
60 | def get_model(filename):
61 |     X, Y = get_latency(filename)
62 |     print(len(X))
63 |     trainx, testx, trainy, testy = train_test_split(
64 |         X, Y, test_size=0.2, random_state=10
65 |     )
66 | 
67 |     print(min(Y), max(Y), np.average(Y))
68 | 
69 | 
70 |     model = RandomForestRegressor(max_depth=70, n_estimators=320, min_samples_leaf=1, min_samples_split=2,
71 |                                   max_features=8, oob_score=True, random_state=10)
72 | 
73 |     model.fit(trainx, trainy)
74 |     predicts = model.predict(testx)
75 |     rmse, rmspe, error, acc5, acc10, _ = lat_metrics(predicts, testy)
76 |     print(rmse, rmspe, error, acc5, acc10)
77 |     for i in range(len(testy)):
78 |         print(testy[i], predicts[i], (testy[i] - predicts[i]) / testy[i])
79 | 
80 |     model.fit(X, Y)
81 |     with open("latency/latency_model.pkl", "wb") as f:
82 |         pickle.dump(model, f)
83 | 
84 | def predict(feature):
85 |     with open('latency/latency_model.pkl', "rb") as f:
86 |         model = pickle.load(f)
87 |     return model.predict(feature)[0]
88 | 
89 | # get_model("latency/latency_bench_newt13.json")
90 | if __name__ == '__main__':
91 |     fe = get_feature('h_4_d_0.4-h_4_d_0.4-h_4_d_0.4-h_4_d_0.4')
92 |     print(fe)
93 |     latency = predict([fe])
94 |     print(latency)
--------------------------------------------------------------------------------
/deit_pruning/vendor/nn_pruning_v1/nn_pruning/model_patcher.py:
--------------------------------------------------------------------------------
1 | import re
2 | 
3 | class ModelPatcher:
4 |     def __init__(self, all_match=False):
5 |         self.patterns = []
6 |         self.stats = {"patched": 0, "patched_names": set()}
7 |         self.all_match = all_match
8 | 
9 |     def is_patchable(self, child_module_name, child_module, raiseError):
10 |         # Implement in subclass if needed
11 |         return True
12 | 
13 |     def new_child_module(self, child_module_name, child_module, patch_info):
14 |         # Just return None if the module does not need to be patched, or a new module.
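        # A minimal (hypothetical) override could look like:
        #     class Gelu2IdentityPatcher(ModelPatcher):
        #         def new_child_module(self, child_module_name, child_module, patch_info):
        #             import torch.nn as nn
        #             return nn.Identity() if isinstance(child_module, nn.GELU) else None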
15 |         raise NotImplementedError("Implement this in subclasses")
16 | 
17 |     def get_patchable_layers(self, model, number_rewrite=True):
18 |         # Layer names (displayed as regexps)
19 |         ret = {}
20 |         for k, v in model.named_modules():
21 |             if self.is_patchable(k, v, raiseError=False):
22 |                 r = re.escape(k)
23 |                 r = re.sub(r"[0-9]+", "[0-9]+", r)
24 |                 if r not in ret:
25 |                     ret[r] = []
26 |                 ret[r].append(v)
27 | 
28 |         return ret
29 | 
30 |     def add_pattern(self, pattern, patch_info):
31 |         self.patterns.append(dict(pattern=pattern, patch_info=patch_info))
32 | 
33 |     def pattern_match(self, module_name):
34 |         if self.all_match:
35 |             return True, -1
36 |         for pattern_def in self.patterns:
37 |             if re.match(pattern_def["pattern"], module_name):
38 |                 return True, pattern_def["patch_info"]
39 |         return False, -1
40 | 
41 | 
42 | 
43 |     def replace_module(self, father, child_module_name, child_name, child_module, patch_info):
44 |         new_child_module = self.new_child_module(child_module_name, child_module, patch_info)
45 |         if new_child_module is not None:
46 |             self.stats["patched"] += 1
47 |             self.stats["patched_names"].add(child_module_name)
48 |             setattr(father, child_name, new_child_module)
49 | 
50 |     def needs_patch(self, model):
51 |         for k, v in model.named_modules():
52 |             if self.is_patchable(k, v, raiseError=True):
53 |                 return True
54 | 
55 |         return False
56 | 
57 |     def patch(self, model):
58 |         modules = {}
59 |         modified = False
60 |         if self.all_match and len(self.patterns) != 0:
61 |             print("Warning: all_match is true, but there are some defined patterns, those will be ignored")
62 |         for k, v in model.named_modules():
63 |             modules[k] = v
64 |             match, patch_info = self.pattern_match(k)
65 |             if match and self.is_patchable(k, v, raiseError=True):
66 |                 parts = k.split(".")
67 |                 father_module_name = ".".join(parts[:-1])
68 |                 child_name = parts[-1]
69 |                 father = modules[father_module_name]
70 |                 self.replace_module(father, k, child_name, v, patch_info)
71 |                 modified = True
72 | 
73 |         if not modified:
74 |             print(
75 |                 "Warning: the patcher did not patch anything!"
76 |                 " Check patchable layers with `mp.get_patchable_layers(model)`"
77 |             )
78 | 
79 | 
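# Usage sketch (illustrative, building on the hypothetical Gelu2IdentityPatcher from
# the comment in new_child_module above; `model` is any torch model):
#
#     mp = Gelu2IdentityPatcher()
#     mp.add_pattern(r".*encoder\.layer\.[0-9]+.*", "gelu")
#     mp.patch(model)
#     print(mp.stats["patched"], "modules replaced")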
76 | " Check patchable layers with `mp.get_patchable_layers(model)`" 77 | ) 78 | 79 | -------------------------------------------------------------------------------- /experiments/D1130_tflite_gpu_r21_benchmark.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import subprocess 3 | import os 4 | import re 5 | class ADB: 6 | def __init__(self, serino): 7 | self.serino = serino 8 | 9 | def push(self, src, dst): 10 | subprocess.check_output(f'adb -s {self.serino} push {src} {dst}', shell=True) 11 | 12 | def pull(self, src, dst): 13 | subprocess.check_output(f'adb -s {self.serino} pull {src} {dst}', shell=True) 14 | 15 | def remove(self, dst): 16 | subprocess.check_output(f'adb -s {self.serino} shell rm {dst}', shell=True) 17 | 18 | def run_cmd(self, cmd): 19 | result = subprocess.check_output(f'adb -s {self.serino} shell {cmd}', shell=True).decode('utf-8') 20 | return result 21 | 22 | def fetch_number(text: str, marker: str): 23 | result = re.findall(f'{marker}\d+\.\d+|{marker}\d+', text)[0] 24 | return float(result[len(marker):]) 25 | 26 | def main(): 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('--model_dir', type=str, required=True, help='tflite model dir to test') 29 | parser.add_argument('--serino', default='98281FFAZ009SV', type=str, help='phone serial number to test') 30 | parser.add_argument('--precision', default=3, type=int, help='precision to print latency') 31 | parser.add_argument('--dump_csv', action='store_true', dest='dump_csv', help='dump result to csv file') 32 | args = parser.parse_args() 33 | 34 | adb = ADB(args.serino) 35 | name_list = [] 36 | latency_list_f32 = [] 37 | latency_list_f16 = [] 38 | 39 | if args.dump_csv: 40 | with open('result_gpu_fp32.csv', 'a') as f: 41 | f.write('model_name, fp32_ms\n') 42 | with open('result_gpu_fp16.csv', 'a') as f: 43 | f.write('model_name fp16_ms\n') 44 | 45 | for name in sorted(os.listdir(args.model_dir)): 46 | f32_ms = 0 47 | f16_ms = 0 48 | try: 49 | name_list.append(os.path.splitext(os.path.basename(name))[0]) 50 | model_path = os.path.join(args.model_dir, name) 51 | dst_path = f'/sdcard/{name}' 52 | adb.push(model_path, dst_path) 53 | result_f32 = adb.run_cmd(f'"cd /data/local/tmp && ./benchmark_model_fixed_group_size --graph={dst_path} --use_gpu=true --precision=F32"') 54 | result_f16 = adb.run_cmd(f'"cd /data/local/tmp && ./benchmark_model_fixed_group_size --graph={dst_path} --use_gpu=true --precision=F16"') 55 | adb.remove(dst_path) 56 | f32_ms = fetch_number(result_f32, 'comp_avg_ms=') 57 | f16_ms = fetch_number(result_f16, 'comp_avg_ms=') 58 | except: 59 | pass 60 | latency_list_f32.append(round(f32_ms, args.precision)) 61 | latency_list_f16.append(round(f16_ms, args.precision)) 62 | 63 | print(name_list[-1], f32_ms, f16_ms) 64 | if args.dump_csv: 65 | with open('result_gpu_fp32.csv', 'a') as f: 66 | f.write(f'{name_list[-1]}, {latency_list_f32[-1]}\n') 67 | with open('result_gpu_fp16.csv', 'a') as f: 68 | f.write(f'{name_list[-1]}, {latency_list_f16[-1]}\n') 69 | 70 | print('==== LATENCY SUMMARY ====') 71 | print(name_list) 72 | print('[F32 Latency]') 73 | print(latency_list_f32) 74 | print('[F16 Latency]') 75 | print(latency_list_f16) 76 | 77 | if __name__ == '__main__': 78 | main() -------------------------------------------------------------------------------- /are_16_heads/evaluate_iterative_pruned_deit.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import 
3 | import torch.distributed as dist
4 | from classifier_eval import evaluate
5 | from util import build_dataset
6 | from classifier_scoring import Accuracy
7 | from pathlib import Path
8 | from transformers import AutoModelForImageClassification
9 | import os
10 | from logger import logger
11 | 
12 | def evaluate_one_model(model_path, dataset, batch_size, local_rank, num_workers):
13 |     is_main = local_rank == -1 or local_rank == 0
14 |     if is_main:
15 |         logger.info(f'*** Evaluating {model_path} ***')
16 | 
17 |     model = AutoModelForImageClassification.from_pretrained(model_path)
18 |     if local_rank == -1:
19 |         device = torch.device('cuda')
20 |     else:
21 |         device = torch.device('cuda', local_rank)
22 |     model.to(device)
23 | 
24 |     scorer = Accuracy()
25 |     accuracy = evaluate(
26 |         dataset,
27 |         model,
28 |         batch_size,
29 |         save_attention_probs=False,
30 |         print_head_entropy=False,
31 |         verbose=False,
32 |         scorer=scorer,
33 |         distributed=local_rank != -1,
34 |         num_workers=num_workers
35 |     )[scorer.name]
36 | 
37 |     if is_main:
38 |         logger.info("***** Pruning eval results *****")
39 |         logger.info(f"Accuracy\t{accuracy}")
40 |         accuracy_file_name = f'accuracy{int(accuracy * 10000)}.txt'
41 |         os.system(f'touch {os.path.join(model_path, accuracy_file_name)}')
42 | 
43 | 
44 | def main():
45 |     parser = argparse.ArgumentParser()
46 |     parser.add_argument('--local_rank', type=int, default=-1, help='local rank for distributed evaluation')
47 |     parser.add_argument('--data_path', type=Path, default='/data/data1/v-xudongwang/imagenet', help='imagenet2012 dataset path')
48 |     parser.add_argument('--model_path', type=Path, required=True, help='directory of models or a model to evaluate')
49 |     parser.add_argument('--batch_size', type=int, default=500, help='evaluate batch size per gpu')
50 |     parser.add_argument('--eval_dir_of_models', action='store_true', help='evaluate all models in model_path')
51 |     parser.add_argument('--num_workers', default=8, type=int, help='dataloader number of workers')
52 |     args = parser.parse_args()
53 | 
54 |     dataset, _ = build_dataset(args.data_path, is_train=False, shuffle=False, return_dict=False)
55 | 
56 |     if args.local_rank != -1:
57 |         dist.init_process_group("nccl", rank=args.local_rank, world_size=torch.cuda.device_count())
58 | 
59 |     if not args.eval_dir_of_models:
60 |         evaluate_one_model(model_path=args.model_path / 'final', dataset=dataset, batch_size=args.batch_size, local_rank=args.local_rank, num_workers=args.num_workers)
61 |     else:
62 |         for model_name in sorted(os.listdir(args.model_path)):
63 |             model_path = args.model_path / model_name
64 |             if 'final' in os.listdir(model_path):
65 |                 model_path = model_path / 'final'
66 |             if len(os.listdir(model_path)) < 3:
67 |                 evaluate_one_model(model_path=model_path, dataset=dataset, batch_size=args.batch_size, local_rank=args.local_rank, num_workers=args.num_workers)
68 |             else:
69 |                 logger.info(os.listdir(model_path))
70 |                 logger.info(f"{model_name} already evaluated. 
Skip.") 71 | 72 | if __name__ == '__main__': 73 | main() -------------------------------------------------------------------------------- /deit_pruning/vendor/nn_pruning_v1/nn_pruning/tests/test_patch2.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest import TestCase 3 | from nn_pruning.patch_coordinator import SparseTrainingArguments, ModelPatchingCoordinator 4 | from nn_pruning.modules.masked_nn import ( 5 | GenericLinearPruningContextModule, 6 | BlockLinearPruningContextModule, 7 | SingleDimensionLinearPruningContextModule, 8 | ) 9 | from transformers import AutoConfig, AutoModelForQuestionAnswering 10 | 11 | import copy 12 | 13 | class TestFun(TestCase): 14 | def helper(self, sparse_args, model_name_or_path): 15 | config = AutoConfig.from_pretrained(model_name_or_path) 16 | model = AutoModelForQuestionAnswering.from_pretrained(model_name_or_path) 17 | 18 | device = "cuda" 19 | cache_dir = None 20 | logit_names = ["start_logits", "end_logits"] 21 | teacher_constructor = AutoModelForQuestionAnswering 22 | 23 | coordinator = ModelPatchingCoordinator(sparse_args, device, cache_dir, model_name_or_path, logit_names, teacher_constructor) 24 | 25 | return config, model, coordinator 26 | 27 | def test_base(self): 28 | sparse_args = SparseTrainingArguments.hybrid(20.0) 29 | sparse_args.layer_norm_patch = True 30 | sparse_args.gelu_patch = True 31 | 32 | ref_stats = { 33 | "bert-base-uncased": {"main": {"patched": 72}, "layer_norm": {"patched": 25}, "gelu": {"patched": 12}}, 34 | "bert-large-uncased": {"main": {"patched": 144}, "layer_norm": {"patched": 49}, "gelu": {"patched": 24}}, 35 | "facebook/bart-base": {"main": {"patched": 96}, "layer_norm": {"patched": 32}, "gelu": {"patched": 12}} 36 | } 37 | 38 | for model_name_or_path in ref_stats.keys(): 39 | config, model, coordinator = self.helper(sparse_args, model_name_or_path) 40 | 41 | coordinator.patch_model(model) 42 | 43 | stats = copy.deepcopy(coordinator.stats) 44 | 45 | print(stats["main"]) 46 | for k in stats: 47 | del stats[k]["patched_names"] 48 | 49 | self.assertEqual(stats, ref_stats[model_name_or_path]) 50 | 51 | def test_context_module(self): 52 | sparse_args = SparseTrainingArguments.hybrid(20.0) 53 | sparse_args.layer_norm_patch = True 54 | sparse_args.gelu_patch = True 55 | 56 | ref_context_module = { 57 | "bert-base-uncased": {"generic": 60, "block": 48, "single": 12}, 58 | "bert-large-uncased": {"generic": 120, "block": 96, "single": 24}, 59 | "facebook/bart-base": {"generic": 84, "block": 72, "single": 12}, 60 | } 61 | 62 | for model_name_or_path in ref_context_module.keys(): 63 | config, model, coordinator = self.helper(sparse_args, model_name_or_path) 64 | 65 | coordinator.patch_model(model) 66 | 67 | context_module = {"generic": 0, "block": 0, "single": 0} 68 | 69 | for name, module in model.named_modules(): 70 | if isinstance(module, GenericLinearPruningContextModule): 71 | context_module["generic"] += 1 72 | if isinstance(module, BlockLinearPruningContextModule): 73 | context_module["block"] += 1 74 | elif isinstance(module, SingleDimensionLinearPruningContextModule): 75 | context_module["single"] += 1 76 | 77 | self.assertEqual(context_module, ref_context_module[model_name_or_path]) 78 | 79 | 80 | if __name__ == "__main__": 81 | unittest.main() 82 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TASK=$1 
4 | OPTIONS="${@:2}" 5 | function mobile_benchmark_vivo() { 6 | for model in `ls ${MODEL_DIR}` 7 | do 8 | echo "********************" && echo $model && 9 | python tools.py mobile_benchmark \ 10 | --model "${MODEL_DIR}/$model" \ 11 | --num_runs=30 \ 12 | --warmup_runs=30 \ 13 | --num_threads=1 \ 14 | --taskset_mask=c0 \ 15 | --serial_number=0000028e2c780e4e \ 16 | --benchmark_binary_dir="/data/local/tmp" \ 17 | --no_root \ 18 | $OPTIONS # --no_xnnpack 19 | done 20 | } 21 | 22 | function mobile_benchmark_pixel1() { 23 | for model in `ls ${MODEL_DIR}` 24 | do 25 | echo "********************" && echo $model && 26 | python tools.py mobile_benchmark \ 27 | --model "${MODEL_DIR}/$model" \ 28 | --num_runs=20 \ 29 | --warmup_runs=10 \ 30 | --num_threads=1 \ 31 | --taskset_mask=c \ 32 | --serial_number=FA6A70311471 \ 33 | --benchmark_binary_dir="/data/local/tmp" \ 34 | --no_root \ 35 | $OPTIONS 36 | done 37 | } 38 | 39 | function mobile_benchmark_mi() { 40 | for model in `ls ${MODEL_DIR}` 41 | do 42 | echo "********************" && echo $model && 43 | python tools.py mobile_benchmark \ 44 | --model "${MODEL_DIR}/$model" \ 45 | --num_runs=30 \ 46 | --warmup_runs=30 \ 47 | --num_threads=1 \ 48 | --taskset_mask=70 \ 49 | --serial_number=2458c476 \ 50 | --benchmark_binary_dir="/data/local/tmp" \ 51 | --no_root \ 52 | $OPTIONS # --no_xnnpack 53 | done 54 | } 55 | 56 | function mobile_benchmark_pixel4() { 57 | for model in `ls ${MODEL_DIR}` 58 | do 59 | echo "********************" && echo $model && 60 | python tools.py mobile_benchmark \ 61 | --model "${MODEL_DIR}/$model" \ 62 | --num_runs=20 \ 63 | --warmup_runs=15 \ 64 | --num_threads=1 \ 65 | --taskset_mask=70 \ 66 | $OPTIONS 67 | done 68 | } 69 | 70 | function mobile_benchmark_pixel4_thread4() { 71 | for model in `ls ${MODEL_DIR}` 72 | do 73 | echo "********************" && echo $model && 74 | python tools.py mobile_benchmark \ 75 | --model "${MODEL_DIR}/$model" \ 76 | --num_runs=30 \ 77 | --warmup_runs=30 \ 78 | --num_threads=4 \ 79 | --taskset_mask=f0 80 | done 81 | } 82 | 83 | function mobile_benchmark_pixel4_thread8() { 84 | for model in `ls ${MODEL_DIR}` 85 | do 86 | echo "********************" && echo $model && 87 | python tools.py mobile_benchmark \ 88 | --model "${MODEL_DIR}/$model" \ 89 | --num_runs=30 \ 90 | --warmup_runs=30 \ 91 | --num_threads=8 \ 92 | --taskset_mask=ff 93 | done 94 | } 95 | 96 | function onnx_benchmark() { 97 | for model in `ls ${MODEL_DIR}` 98 | do 99 | python tools.py server_benchmark \ 100 | --model "${MODEL_DIR}/$model" \ 101 | --num_runs=200 \ 102 | --top=30 \ 103 | --warmup_runs=30 \ 104 | --precision=3 \ 105 | $OPTIONS 106 | done 107 | } 108 | 109 | 110 | function trt_benchmark() { 111 | for model in `ls ${MODEL_DIR}` 112 | do 113 | python benchmark/tensorrt/onnx_trt_test.py \ 114 | --model "${MODEL_DIR}/$model" \ 115 | --num_runs=100 \ 116 | --warmup_runs=30 \ 117 | --top=20 \ 118 | $OPTIONS 119 | done 120 | } 121 | $TASK "" 122 | -------------------------------------------------------------------------------- /deit_pruning/vendor/nn_pruning_v1/nn_pruning/modules/quantization_config.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | import torch 4 | from torch.quantization import ( 5 | FakeQuantize, 6 | MinMaxObserver, 7 | MovingAverageMinMaxObserver, 8 | QConfig, 9 | float_qparams_weight_only_qconfig, 10 | get_default_qat_qconfig, 11 | get_default_qconfig, 12 | ) 13 | 14 | # TensorFlow Lite Quantization Specs 15 | # 
https://www.tensorflow.org/lite/performance/quantization_spec?hl=en
16 | # For activations: int8 asymmetric per-tensor [-128, 127] range
17 | # For weights: int8 symmetric per-tensor [-127, 127] range
18 | _TFLITE_QCONFIG = QConfig(
19 |     activation=MovingAverageMinMaxObserver.with_args(
20 |         dtype=torch.qint8,
21 |         quant_min=-128,
22 |         quant_max=127,
23 |         qscheme=torch.per_tensor_affine,
24 |     ),
25 |     weight=MinMaxObserver.with_args(
26 |         dtype=torch.qint8, quant_min=-127, quant_max=127, qscheme=torch.per_tensor_symmetric
27 |     ),
28 | )
29 | _TFLITE_QAT_QCONFIG = QConfig(
30 |     activation=FakeQuantize.with_args(
31 |         observer=MovingAverageMinMaxObserver,
32 |         dtype=torch.qint8,
33 |         quant_min=-128,
34 |         quant_max=127,
35 |         qscheme=torch.per_tensor_affine,
36 |     ),
37 |     weight=FakeQuantize.with_args(
38 |         observer=MinMaxObserver, dtype=torch.qint8, quant_min=-127, quant_max=127, qscheme=torch.per_tensor_symmetric
39 |     ),
40 | )
41 | _ONNX_QCONFIG = QConfig(
42 |     activation=MinMaxObserver.with_args(
43 |         quant_min=0,
44 |         quant_max=255,
45 |         reduce_range=True,
46 |     ),
47 |     weight=MinMaxObserver.with_args(
48 |         quant_min=-128, quant_max=127, dtype=torch.qint8, reduce_range=False, qscheme=torch.per_tensor_symmetric
49 |     ),
50 | )
51 | _ONNX_QAT_QCONFIG = QConfig(
52 |     activation=FakeQuantize.with_args(
53 |         observer=MinMaxObserver,
54 |         quant_min=0,
55 |         quant_max=255,
56 |         reduce_range=True,
57 |     ),
58 |     weight=FakeQuantize.with_args(
59 |         observer=MinMaxObserver,
60 |         quant_min=-128,
61 |         quant_max=127,
62 |         dtype=torch.qint8,
63 |         reduce_range=False,
64 |         qscheme=torch.per_tensor_symmetric,
65 |     ),
66 | )
67 | 
68 | _QCONFIG_DICT = {"object_type": [(torch.nn.Embedding, float_qparams_weight_only_qconfig)]}
69 | 
70 | _QAT_QCONFIG_DICT = {"object_type": [(torch.nn.Embedding, float_qparams_weight_only_qconfig)]}
71 | 
72 | _QCONFIG_MAPPING = {
73 |     "default": "fbgemm",
74 |     "mobile": "qnnpack",
75 |     "fbgemm": "fbgemm",
76 |     "qnnpack": "qnnpack",
77 |     "tflite": _TFLITE_QCONFIG,
78 |     "onnx": _ONNX_QCONFIG,
79 | }
80 | 
81 | _QAT_QCONFIG_MAPPING = {
82 |     "default": "fbgemm",
83 |     "mobile": "qnnpack",
84 |     "fbgemm": "fbgemm",
85 |     "qnnpack": "qnnpack",
86 |     "tflite": _TFLITE_QAT_QCONFIG,
87 |     "onnx": _ONNX_QAT_QCONFIG,
88 | }
89 | 
90 | 
91 | def create_qconfig(qconfig_name, mode):
92 |     mode = mode.lower()
93 |     if mode not in ["static", "qat"]:
94 |         raise ValueError(f"mode must either be static or qat, here: {mode}")
95 | 
96 |     mapping = _QCONFIG_MAPPING if mode == "static" else _QAT_QCONFIG_MAPPING
97 |     qconfig = mapping.get(qconfig_name, None)
98 |     if isinstance(qconfig, str):
99 |         qconfig_function = get_default_qconfig if mode == "static" else get_default_qat_qconfig
100 |         qconfig = qconfig_function(qconfig)
101 |     if qconfig is None:
102 |         raise ValueError(f"qconfig name must be in {mapping.keys()}, but {qconfig_name} was provided")
103 |     qconfig_dict = _QCONFIG_DICT if mode == "static" else _QAT_QCONFIG_DICT
104 |     qconfig_dict = copy.deepcopy(qconfig_dict)
105 |     qconfig_dict[""] = qconfig
106 |     return qconfig_dict
107 | 
--------------------------------------------------------------------------------
/modeling/models/cnn_zoo.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import tensorflow_hub as hub
3 | from .squeezenet import SqueezeNet
4 | from . import shufflenet
5 | from . 
import shufflenetv2 6 | from .proxylessnas import get_proxylessnas 7 | from .mnasnet import mnasnet_a1 8 | import os 9 | 10 | 11 | def add_keras_input_layer(model, input_shape, batch_size=1): 12 | import tensorflow as tf 13 | return tf.keras.Sequential([ 14 | tf.keras.layers.InputLayer(input_shape, batch_size=batch_size), 15 | model 16 | ]) 17 | 18 | def get_mobilenetv1(): 19 | model = tf.keras.applications.MobileNet() 20 | return add_keras_input_layer(model, [224, 224, 3], 1) 21 | 22 | def get_mobilenetv2(): 23 | model = tf.keras.applications.MobileNetV2() 24 | return add_keras_input_layer(model, [224, 224, 3], 1) 25 | 26 | def get_mobilenetv3small(): 27 | model = tf.keras.applications.MobileNetV3Small() 28 | return add_keras_input_layer(model, [224, 224, 3], 1) 29 | 30 | def get_mobilenetv3large(): 31 | model = tf.keras.applications.MobileNetV3Large() 32 | return add_keras_input_layer(model, [224, 224, 3], 1) 33 | 34 | def get_squeezenet(): 35 | model = SqueezeNet(image_size=[224, 224, 3], batch_size=1) 36 | return model 37 | 38 | def get_inception_resnetv2(): 39 | model = tf.keras.applications.InceptionResNetV2() # input_shape=[299, 299, 3] 40 | return add_keras_input_layer(model, [299, 299, 3], 1) 41 | 42 | def get_inceptionv3(): 43 | model = tf.keras.applications.InceptionV3() # input_shape=[299, 299, 3] 44 | return add_keras_input_layer(model, [299, 299, 3], 1) 45 | 46 | def get_efficientnetb0(): 47 | model = tf.keras.applications.EfficientNetB0() 48 | return add_keras_input_layer(model, [224, 224, 3], 1) 49 | 50 | def get_efficientnetb0_lite(): 51 | model = hub.KerasLayer("https://tfhub.dev/tensorflow/efficientnet/lite0/classification/2") 52 | return add_keras_input_layer(model, [224, 224, 3], 1) 53 | 54 | def get_resnet50(): 55 | model = tf.keras.applications.ResNet50() 56 | return add_keras_input_layer(model, [224, 224, 3], 1) 57 | 58 | def get_resnet50v2(): 59 | model = tf.keras.applications.ResNet50V2() 60 | return add_keras_input_layer(model, [224, 224, 3], 1) 61 | 62 | def get_shufflenet(): 63 | model = shufflenet.shufflenet_g1_w1() 64 | return add_keras_input_layer(model, [224, 224, 3], 1) 65 | 66 | def get_shufflenetv2(): 67 | model = shufflenetv2.shufflenetv2_w1() 68 | return add_keras_input_layer(model, [224, 224, 3], 1) 69 | 70 | def get_proxyless_mobile(): 71 | model = get_proxylessnas('mobile') 72 | return add_keras_input_layer(model, [224, 224, 3], 1) 73 | 74 | def get_mnasneta1(): 75 | model = mnasnet_a1() 76 | return add_keras_input_layer(model, [224, 224, 3], 1) 77 | 78 | cnn_zoo_dict = { 79 | 'mobilenetv1': get_mobilenetv1, 80 | 'mobilenetv2': get_mobilenetv2, 81 | 'mobilenetv3small': get_mobilenetv3small, 82 | 'mobilenetv3large': get_mobilenetv3large, 83 | 'squeezenet': get_squeezenet, 84 | 'inception_resnetv2': get_inception_resnetv2, 85 | 'inceptionv3': get_inceptionv3, 86 | 'efficientnetb0': get_efficientnetb0, 87 | 'efficientnetb0_lite': get_efficientnetb0_lite, 88 | 'resnet50': get_resnet50, 89 | 'resnet50v2': get_resnet50v2, 90 | 'shufflenet': get_shufflenet, 91 | 'shufflenetv2': get_shufflenetv2, 92 | 'proxyless_mobile': get_proxyless_mobile, 93 | 'mnasneta1': get_mnasneta1 94 | } 95 | 96 | def get_model(model_name): 97 | if model_name not in cnn_zoo_dict.keys(): 98 | raise ValueError(f'{model_name} not supported') 99 | return cnn_zoo_dict[model_name]() 100 | 101 | def save_all(output_dir): 102 | for model_name in cnn_zoo_dict.keys(): 103 | model = cnn_zoo_dict[model_name]() 104 | model.save(os.path.join(output_dir, model_name + '.tf')) 
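
# Usage sketch (illustrative addition; the output directory is an assumed example path):
if __name__ == '__main__':
    model = get_model('mobilenetv2')  # build a single zoo model
    save_all('models/tf_model')       # or export every zoo model as a SavedModel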
-------------------------------------------------------------------------------- /are_16_heads/finetune.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | import torch 3 | from torch.nn.parallel import DistributedDataParallel as DDP 4 | from logger import logger 5 | import os 6 | import random 7 | import numpy as np 8 | 9 | import classifier_args 10 | import classifier_training as training 11 | from classifier_scoring import Accuracy 12 | from classifier_eval import evaluate 13 | from util import build_dataset 14 | 15 | 16 | 17 | def main(): 18 | parser = classifier_args.get_base_parser() 19 | classifier_args.training_args(parser) 20 | classifier_args.eval_args(parser) 21 | classifier_args.finetune_args(parser) 22 | 23 | args = parser.parse_args() 24 | 25 | # ==== SETUP DEVICE ==== 26 | # This code only support distributed data parallel & gpu training 27 | torch.cuda.set_device(args.local_rank) 28 | device = torch.device("cuda", args.local_rank) 29 | n_gpu = 1 30 | logger.info( 31 | f"device: {device} n_gpu: {n_gpu}, " 32 | f"distributed training: {bool(args.local_rank != -1)}, " 33 | ) 34 | 35 | # ==== SETUP EXPERIMENT ==== 36 | 37 | def set_seeds(seed, n_gpu): 38 | random.seed(seed) 39 | np.random.seed(seed) 40 | torch.manual_seed(seed) 41 | if n_gpu > 0: 42 | torch.cuda.manual_seed_all(seed) 43 | 44 | set_seeds(args.seed, n_gpu) 45 | 46 | os.makedirs(args.output_dir, exist_ok=True) 47 | 48 | # ==== PREPARE DATA ==== 49 | train_dataset, _ = build_dataset(args.data_dir, is_train=True, shuffle=True, return_dict=True) 50 | eval_dataset, _ = build_dataset(args.data_dir, is_train=False, shuffle=False, return_dict=False) 51 | 52 | # ==== PREPARE TRAINING ==== 53 | training_args = training.get_training_args( 54 | learning_rate=args.finetune_learning_rate, 55 | micro_batch_size=args.train_batch_size, 56 | n_steps=args.n_finetune_steps_after_pruning, 57 | n_epochs=args.n_finetune_epochs_after_pruning, 58 | local_rank=args.local_rank, 59 | num_workers=args.num_workers, 60 | output_dir=args.output_dir, 61 | ) 62 | is_main = args.local_rank == -1 or args.local_rank == 0 63 | 64 | # ==== PREPARE MODEL ==== 65 | def get_deit_model(model_path): 66 | from transformers import ViTForImageClassification 67 | model = ViTForImageClassification.from_pretrained(model_path) 68 | model.to(device) 69 | return model 70 | 71 | model = get_deit_model(args.finetune_model_path) 72 | 73 | # ==== START TRAINING ==== 74 | training.huggingface_trainer_train( 75 | train_dataset=train_dataset, 76 | model=model, 77 | args=training_args 78 | ) 79 | 80 | if is_main: 81 | model.save_pretrained(os.path.join(args.output_dir, 'final_finetuned')) 82 | 83 | # ==== EVALUATE ==== 84 | if args.eval_finetuned: 85 | # Print the pruning descriptor 86 | if is_main: 87 | logger.info("*** Evaluating ***") 88 | # Eval accuracy 89 | scorer = Accuracy() 90 | accuracy = evaluate( 91 | eval_dataset, 92 | model, 93 | args.eval_batch_size, 94 | save_attention_probs=args.save_attention_probs, 95 | print_head_entropy=False, 96 | device=device, 97 | verbose=False, 98 | disable_progress_bar=args.no_progress_bars, 99 | scorer=scorer, 100 | distributed=args.local_rank != -1, 101 | num_workers=args.num_workers 102 | )[scorer.name] 103 | 104 | if is_main: 105 | logger.info("***** Pruning eval results *****") 106 | logger.info(f"Accuracy\t{accuracy}") 107 | accuracy_file_name = f'accuracy{int(accuracy * 10000)}.txt' 108 | os.system(f'touch {os.path.join(args.output_dir, 
"final_finetuned", accuracy_file_name)}') 109 | 110 | 111 | if __name__ == '__main__': 112 | main() -------------------------------------------------------------------------------- /deit_pruning/src/onnx_export.py: -------------------------------------------------------------------------------- 1 | # from transformers.convert_graph_to_onnx import convert 2 | from pathlib import Path 3 | 4 | # missing tokenizer, so just cannot use convert directly 5 | # convert(framework="pt", model="results/playground/final/", output=Path("results/playground/final/output.onnx"), opset=13) 6 | 7 | import torch 8 | import argparse 9 | from model import SwiftBERTOutput 10 | 11 | from nn_pruning.inference_model_patcher import optimize_model as nn_optimize 12 | 13 | from onnxruntime.transformers.optimizer import optimize_model 14 | from onnxruntime.transformers.onnx_model_bert import BertOptimizationOptions 15 | from onnxruntime.quantization import QuantizationMode, quantize, quantize_dynamic, QuantType 16 | 17 | def quant(use_original=False): 18 | if use_original: 19 | input_model = f'{output_name}.onnx' 20 | output_model = f'{output_name}-quant.onnx' 21 | else: 22 | input_model = f'{output_name}-opt.onnx' 23 | output_model = f'{output_name}-opt-quant.onnx' 24 | quantize_dynamic(str(args.model_dir / input_model), 25 | str(args.model_dir / output_model), 26 | weight_type=QuantType.QUInt8, 27 | # optimize_model=False, # onnxruntime 1.8.x requires this arg 28 | ) 29 | 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument("--model_dir", type=Path, default='./results/playground/final') 32 | parser.add_argument("--nn_pruning", action='store_true') 33 | parser.add_argument("--no_opt", action='store_true') 34 | parser.add_argument("--force_opt", action='store_true') 35 | parser.add_argument("--max_ad_length", type=int, default=38) 36 | parser.add_argument("--output_name", type=str, default="output") 37 | parser.add_argument("--opset_version", type=int, default=13) 38 | 39 | args = parser.parse_args() 40 | assert not (args.no_opt and args.force_opt), "no_opt and force_opt cannot be set together." 41 | # python src/onnx_export.py --model_dir ./results/dummy_mini/final/ 42 | 43 | model = SwiftBERTOutput.from_pretrained(args.model_dir) 44 | original_params = model.num_parameters() 45 | model = nn_optimize(model, "dense") 46 | pruned_params = model.num_parameters() 47 | print("Original params:", original_params) 48 | print("After-pruned params:", pruned_params) 49 | print(model) 50 | 51 | bert_config = model.config 52 | 53 | max_ad_length = args.max_ad_length 54 | 55 | print("==== export ====") 56 | output_name = args.output_name 57 | if args.nn_pruning: 58 | model = nn_optimize(model, "dense") 59 | print(model) 60 | output_name += "_removepruned" 61 | torch.onnx.export( 62 | model, 63 | (torch.tensor([1] * (max_ad_length)).view(-1, max_ad_length), 64 | torch.tensor([1] * (max_ad_length)).view(-1, max_ad_length), 65 | torch.tensor([1] * (max_ad_length)).view(-1, max_ad_length)), 66 | args.model_dir / f'{output_name}.onnx', 67 | input_names=['input_ids', 'attention_mask', 'token_type_ids'], 68 | output_names=['score'], 69 | verbose=False, 70 | export_params=True, 71 | opset_version=args.opset_version, 72 | do_constant_folding=True 73 | ) 74 | 75 | print("==== optimization ====") 76 | if args.nn_pruning or args.no_opt: 77 | # TODO: how to fix that? 78 | if not args.force_opt: 79 | print("No optimization (nn_pruning or set no_opt). 
Doing quanting only") 80 | quant(use_original=True) 81 | exit(0) 82 | if args.nn_pruning: 83 | print("Warn: num_heads & hidden_size may be changed during nn_pruning! The optimized result can be incorrect.") 84 | 85 | optimization_options = BertOptimizationOptions('bert') 86 | optimization_options.embed_layer_norm = True 87 | optimization_options.enable_layer_norm = True 88 | optimization_options.enable_skip_layer_norm = True 89 | optimization_options.enable_bias_gelu = True 90 | optimization_options.enable_attention = True 91 | 92 | optimized_model = optimize_model( 93 | str(args.model_dir / f'{output_name}.onnx'), 94 | model_type='bert', 95 | num_heads=bert_config.num_attention_heads, 96 | hidden_size=bert_config.hidden_size, 97 | optimization_options=optimization_options) 98 | optimized_model.save_model_to_file(str(args.model_dir / f'{output_name}-opt.onnx')) 99 | 100 | print("==== quantize ====") 101 | quant(use_original=True) 102 | quant(use_original=False) 103 | -------------------------------------------------------------------------------- /experiments/D0104_tvm_fusion_test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from pathlib import Path 4 | import re 5 | import subprocess 6 | 7 | 8 | OUTPUT_CSV_PATH = 'D0105_tvm_fusion_test.csv' 9 | 10 | 11 | class TvmFusionTester: 12 | def __init__(self, tflite_dir, tvm_dir, tracker_host, tracker_port, rpc_key, cross_compiler_path): 13 | self.tflite_dir = tflite_dir 14 | self.tvm_dir=tvm_dir 15 | self.tracker_host = tracker_host 16 | self.tracker_port = tracker_port 17 | self.rpc_key = rpc_key 18 | self.cross_compiler_path = cross_compiler_path 19 | 20 | def _tune_single(self, input_path, output_path): 21 | pass # TODO 22 | 23 | def _compile_single(self, input_path, output_path): 24 | Path(os.path.dirname(output_path)).mkdir(parents=True, exist_ok=True) 25 | cmd = f'tvmc compile {input_path} -o {output_path} --target "llvm -mtriple=arm64-linux-android -mattr=+neon" --cross-compiler {self.cross_compiler_path}' 26 | result = subprocess.check_output(cmd, shell=True).decode('utf-8') 27 | print(result) 28 | 29 | # unzip 30 | tar_output_dir = output_path.replace('.tar', '') 31 | Path(tar_output_dir).mkdir(parents=True, exist_ok=True) 32 | subprocess.check_output(f'tar -xvf {output_path} -C {tar_output_dir}', shell=True) 33 | 34 | def _benchmark_single(self, model_path): 35 | print(os.path.basename(model_path)) 36 | result = subprocess.run( 37 | f'tvmc run {model_path} --rpc-key {self.rpc_key} --rpc-tracker {self.tracker_host}:{self.tracker_port} --print-time --device=cpu --repeat 100', 38 | shell=True, capture_output=True).stdout.decode('utf-8') 39 | print(result) 40 | numbers = re.findall(r'\d*\.?\d+', result) 41 | return [float(x) for x in numbers] 42 | 43 | def _compile(self): 44 | print('==== Compiling ====') 45 | for root, dirs, files in os.walk(self.tflite_dir): 46 | for file in sorted(files): 47 | if file.endswith('.tflite'): 48 | input_path = os.path.join(root, file) 49 | output_path = os.path.join(self.tvm_dir, input_path.replace(self.tflite_dir + '/', '').replace('.tflite', '.tar')) 50 | self._compile_single(input_path, output_path) 51 | 52 | def _tune(self): 53 | print('==== Tuning ====') 54 | for root, dirs, files in os.walk(self.tflite_dir): 55 | for file in sorted(files): 56 | if file.endswith('.tflite'): 57 | input_path = os.path.join(root, file) 58 | output_path = os.path.join(self.tvm_dir, input_path.replace(self.tflite_dir + '/', '').replace('.tflite', 
'_autotuner_records.json')) 59 | self._tune_single(input_path, output_path) 60 | 61 | def _benchmark(self): 62 | with open(OUTPUT_CSV_PATH, 'a') as f: 63 | f.write('model_name,mean,median,max,min,std\n') 64 | 65 | print('==== benchmarking ====') 66 | for root, dirs, files in os.walk(self.tvm_dir): 67 | for file in sorted(files): 68 | if file.endswith('.tar'): 69 | model_path = os.path.join(root, file) 70 | mean_median_max_min_std_ms = self._benchmark_single(model_path) 71 | with open(OUTPUT_CSV_PATH, 'a') as f: 72 | f.write(f'{os.path.basename(model_path)},{",".join([str(round(ms, 2)) for ms in mean_median_max_min_std_ms])}\n') 73 | 74 | def run(self): 75 | self._compile() 76 | self._benchmark() 77 | 78 | 79 | def main(): 80 | parser = argparse.ArgumentParser() 81 | parser.add_argument('--tflite_dir', default='models/tflite_model/fusion', type=str) 82 | parser.add_argument('--tvm_dir', default='models/tvm_model/fusion_test', type=str) 83 | parser.add_argument('--tracker_host', default='127.0.0.1', type=str) 84 | parser.add_argument('--tracker_port', default=9090, type=int) 85 | parser.add_argument('--rpc_key', default='android', type=str) 86 | parser.add_argument('--cross_compiler_path', default=os.environ['TVM_NDK_CC'], type=str) 87 | args = parser.parse_args() 88 | 89 | tester = TvmFusionTester(args.tflite_dir, args.tvm_dir, args.tracker_host, args.tracker_port, args.rpc_key, args.cross_compiler_path) 90 | tester.run() 91 | 92 | if __name__ == '__main__': 93 | main() 94 | -------------------------------------------------------------------------------- /are_16_heads/util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchvision import datasets 3 | from torchvision.datasets.folder import ImageFolder 4 | import torch.distributed as dist 5 | from typing import Dict 6 | import numpy as np 7 | import torch 8 | 9 | def head_entropy(p): 10 | plogp = p * torch.log(p) 11 | plogp[p == 0] = 0 12 | return - plogp.sum(dim=-1) 13 | 14 | 15 | def head_pairwise_kl(p): 16 | # p has shape bsz x nheads x L x L and is normalized in the last 17 | # dim 18 | logp = torch.log(p) 19 | logp[p == 0] = -40 20 | H_pq = -torch.einsum("bilk,bjlk->bijl", [p, logp]) 21 | H_p = head_entropy(p).unsqueeze(-2) 22 | KL = H_pq - H_p 23 | KL.masked_fill_(p.sum(-1).eq(0).unsqueeze(1), 0.0) 24 | KL.masked_fill_(p.sum(-1).eq(0).unsqueeze(2), 0.0) 25 | return KL 26 | 27 | 28 | def attn_disagreement(p): 29 | # p has shape bsz x nheads x L x L and is normalized in the last 30 | # dim 31 | n_heads = p.size(1) 32 | return torch.einsum("bilk,bjlk->b", [p, p]) / n_heads ** 2 33 | 34 | 35 | def out_disagreement(out): 36 | # out has shape bsz x nheads x L x d 37 | n_heads = out.size(1) 38 | # Normalize 39 | out /= torch.sqrt((out ** 2).sum(-1)).unsqueeze(-1) + 1e-20 40 | cosine = torch.einsum("bild,bjld->b", [out, out]) 41 | return cosine / n_heads ** 2 42 | 43 | 44 | def print_1d_tensor(tensor): 45 | print("\t".join(f"{x:.5f}" for x in tensor.cpu().data)) 46 | 47 | 48 | def print_2d_tensor(tensor): 49 | for row in range(len(tensor)): 50 | print_1d_tensor(tensor[row]) 51 | 52 | 53 | def none_if_empty(string): 54 | return string if string != "" else None 55 | 56 | 57 | def get_vit_encoder(model): 58 | if hasattr(model, 'vit'): 59 | return model.vit.encoder 60 | if hasattr(model, 'module'): 61 | return model.module.vit.encoder 62 | else: 63 | raise RuntimeError('Model neither has attribute "vit" or "module".') 64 | 65 | def get_vit_config(model): 66 | if hasattr(model, 'vit'): 
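        # (added note) A bare classifier exposes the backbone directly as `.vit`;
        # a DDP-wrapped one nests it under `.module` (handled by the next branch).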
67 |         return model.vit.config
68 |     if hasattr(model, 'module'):
69 |         return model.module.vit.config
70 |     else:
71 |         raise RuntimeError('Model has neither attribute "vit" nor "module".')
72 | 
73 | 
74 | 
75 | '''============================================================
76 | load data
77 | ================================================================='''
78 | class DictImageFolder(datasets.ImageFolder):
79 |     def __init__(self, shuffle, *args, **kwargs):
80 |         super().__init__(*args, **kwargs)
81 |         self.shuffle = shuffle
82 |         self.idx_list = np.arange(super().__len__())
83 |         if self.shuffle:
84 |             np.random.shuffle(self.idx_list)
85 | 
86 |     def __getitem__(self, index: int) -> Dict:
87 |         index = self.idx_list[index]
88 |         item = super().__getitem__(index)
89 |         return dict(
90 |             pixel_values=item[0],
91 |             label=item[1]
92 |         )
93 | 
94 | def build_dataset(data_path, input_size=224, is_train=False, shuffle=False, return_dict=True):
95 |     def build_transform(input_size):
96 |         from torchvision import transforms
97 |         from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
98 |         t = []
99 |         if input_size > 32:
100 |             size = int((256 / 224) * input_size)
101 |             t.append(
102 |                 transforms.Resize(size, interpolation=3),  # to maintain same ratio w.r.t. 224 images
103 |             )
104 |             t.append(transforms.CenterCrop(input_size))
105 | 
106 |         t.append(transforms.ToTensor())
107 |         t.append(transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD))
108 |         return transforms.Compose(t)
109 | 
110 |     import os
111 |     from torchvision import datasets
112 |     transform = build_transform(input_size)
113 |     root = os.path.join(data_path, 'val' if not is_train else 'train')
114 |     if return_dict:
115 |         dataset = DictImageFolder(shuffle, root, transform=transform)
116 |     else:
117 |         dataset = ImageFolder(root, transform=transform)
118 |     num_classes = 1000
119 |     return dataset, num_classes
120 | 
121 | 
122 | def to_data_loader(dataset, batch_size, num_workers):
123 |     import torch
124 |     sampler = torch.utils.data.SequentialSampler(dataset)
125 |     data_loader = torch.utils.data.DataLoader(
126 |         dataset, sampler=sampler,
127 |         batch_size=batch_size,
128 |         num_workers=num_workers,
129 |         pin_memory=True,
130 |         drop_last=False
131 |     )
132 |     return data_loader
--------------------------------------------------------------------------------
/deit_pruning/src/layers/super_bertlayers.py:
--------------------------------------------------------------------------------
1 | import transformers
2 | from transformers import BertModel, BertLayer
3 | from transformers.models.bert.modeling_bert import BertEncoder, BertSelfAttention, BertAttention, BertIntermediate, BertEmbeddings, BertPooler, BertOutput, BertSelfOutput
4 | from transformers.modeling_outputs import SequenceClassifierOutput
5 | from transformers.activations import ACT2FN
6 | import torch
7 | from torch import nn
8 | from torch.nn import BCEWithLogitsLoss
9 | class VA_BertIntermediate(BertIntermediate):
10 |     def __init__(self, config,layerconfig):
11 |         super().__init__(config)
12 |         print(layerconfig['intermediate_size'])
13 |         self.dense = nn.Linear(config.hidden_size, layerconfig['intermediate_size'])
14 |         if isinstance(config.hidden_act, str):
15 |             self.intermediate_act_fn = ACT2FN[config.hidden_act]
16 |         else:
17 |             self.intermediate_act_fn = config.hidden_act
18 | class VA_BertOutput(BertOutput):
19 |     def __init__(self, config,layerconfig):
20 |         super().__init__(config)
21 |         self.dense = nn.Linear(layerconfig['intermediate_size'], config.hidden_size)
22 |         self.LayerNorm =
nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) 23 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 24 | class VA_BertSelfAttention(BertSelfAttention): 25 | def __init__(self, config,heads_num): 26 | super().__init__(config) 27 | if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): 28 | raise ValueError( 29 | "The hidden size (%d) is not a multiple of the number of attention " 30 | "heads (%d)" % (config.hidden_size, config.num_attention_heads) 31 | ) 32 | 33 | self.num_attention_heads = heads_num 34 | self.attention_head_size = int(config.hidden_size / config.num_attention_heads) ##original head size 35 | self.all_head_size = heads_num * self.attention_head_size 36 | print('here',heads_num,self.all_head_size) 37 | 38 | self.query = nn.Linear(config.hidden_size, self.all_head_size) 39 | self.key = nn.Linear(config.hidden_size, self.all_head_size) 40 | self.value = nn.Linear(config.hidden_size, self.all_head_size) 41 | 42 | self.dropout = nn.Dropout(config.attention_probs_dropout_prob) 43 | class VA_BertSelfOutput(BertSelfOutput): 44 | def __init__(self, config,head_num): 45 | super().__init__(config) 46 | attention_head_size = int(config.hidden_size / config.num_attention_heads) ##original head size 47 | all_head_size = head_num * attention_head_size 48 | self.dense = nn.Linear(all_head_size, config.hidden_size) 49 | self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) 50 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 51 | 52 | 53 | class VA_BertAttention(BertAttention): 54 | def __init__(self, config,layerconfig): 55 | super().__init__(config) 56 | self.self = VA_BertSelfAttention(config,layerconfig['heads']) 57 | self.output = VA_BertSelfOutput(config,layerconfig['heads']) 58 | self.pruned_heads = set() 59 | 60 | 61 | 62 | class VA_BertLayer(BertLayer): 63 | def __init__(self, config,layerconfig): 64 | super().__init__(config) 65 | self.chunk_size_feed_forward = config.chunk_size_feed_forward 66 | self.seq_len_dim = 1 67 | self.attention = VA_BertAttention(config,layerconfig) 68 | self.is_decoder = config.is_decoder 69 | self.add_cross_attention = config.add_cross_attention 70 | if self.add_cross_attention: 71 | assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added" 72 | self.crossattention = VA_BertAttention(config,layerconfig) 73 | self.intermediate = VA_BertIntermediate(config,layerconfig) 74 | self.output = VA_BertOutput(config,layerconfig) 75 | 76 | class VA_BertEncoder(BertEncoder): 77 | def __init__(self, config): 78 | super().__init__(config) 79 | self.config = config 80 | print(self.config) 81 | self.layer = nn.ModuleList([VA_BertLayer(config,config.layers[str(i)]) for i in range(config.num_hidden_layers)]) 82 | 83 | class VA_BertModel(BertModel): 84 | 85 | def __init__(self, config, add_pooling_layer=True): 86 | super().__init__(config) 87 | self.config = config 88 | 89 | self.embeddings = BertEmbeddings(config) 90 | self.encoder = VA_BertEncoder(config) 91 | 92 | self.pooler = BertPooler(config) if add_pooling_layer else None 93 | 94 | self.init_weights() 95 | -------------------------------------------------------------------------------- /modeling/layers/transformer_encoder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import math 4 | from .residual import Residual 5 | from .norm import LayerNorm 6 | from .attention import Attention 7 | from .ffn 
import FeedForward 8 | 9 | class TransformerEncoderBlock(tf.keras.Model): 10 | def __init__(self, hidden_size, num_layers, num_heads, intermediate_size, norm_first=True): 11 | super().__init__() 12 | layers = [] 13 | for _ in range(num_layers): 14 | layers.extend([ 15 | LayerNorm(Residual(Attention(hidden_size, num_heads=num_heads)), pre=norm_first), 16 | LayerNorm(Residual(FeedForward(hidden_size, intermediate_size)), pre=norm_first) 17 | ]) 18 | self.net = tf.keras.Sequential(layers) 19 | 20 | def call(self, x): 21 | return self.net(x) 22 | 23 | 24 | class TransformerEncoderBlock_Pruned(tf.keras.Model): 25 | def __init__(self, hidden_size, num_layers, num_remain_heads_list, intermediate_size_list, head_size=64, norm_first=True): 26 | super().__init__() 27 | layers = [] 28 | for i in range(num_layers): 29 | layers.extend([ 30 | LayerNorm(Residual(Attention(hidden_size, num_heads=num_remain_heads_list[i], h_k=head_size)), pre=norm_first), 31 | LayerNorm(Residual(FeedForward(hidden_size, intermediate_size_list[i])), pre=norm_first) 32 | ]) 33 | self.net = tf.keras.Sequential(layers) 34 | 35 | def call(self, x): 36 | return self.net(x) 37 | 38 | 39 | class TokenPerformer(tf.keras.Model): 40 | ''' 41 | T2T-Module performer for T2T-ViT 42 | ''' 43 | def __init__(self, head_size, num_heads, kernel_ratio=0.5, dp1=0.1, dp2=0.1): 44 | super().__init__() 45 | self.hidden_size = head_size * num_heads 46 | self.kqv = tf.keras.layers.Dense(self.hidden_size * 3) 47 | self.dp = tf.keras.layers.Dropout(dp1) 48 | self.attn_output = tf.keras.layers.Dense(self.hidden_size) 49 | self.num_heads = num_heads 50 | self.norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-5) 51 | self.norm2 = tf.keras.layers.LayerNormalization(epsilon=1e-5) 52 | self.epsilon = 1e-8 # for stable in division 53 | 54 | self.mlp = tf.keras.Sequential([ 55 | FeedForward(self.hidden_size, self.hidden_size), 56 | tf.keras.layers.Dropout(dp2) 57 | ]) 58 | 59 | self.m = int(self.hidden_size * kernel_ratio) 60 | self.w = self.add_weight('w', 61 | shape=[self.m, self.hidden_size], 62 | initializer=tf.keras.initializers.Orthogonal(), 63 | dtype=tf.float32, 64 | trainable=False) 65 | self.w = self.w * math.sqrt(self.m) 66 | 67 | def prm_exp(self, x): 68 | # part of the function is borrow from https://github.com/lucidrains/performer-pytorch 69 | # and Simo Ryu (https://github.com/cloneofsimo) 70 | # ==== positive random features for gaussian kernels ==== 71 | # x = (B, T, hs) 72 | # w = (m, hs) 73 | # return : x : B, T, m 74 | # SM(x, y) = E_w[exp(w^T x - |x|/2) exp(w^T y - |y|/2)] 75 | # therefore return exp(w^Tx - |x|/2)/sqrt(m) 76 | xd = tf.math.reduce_sum(x * x, axis=-1, keepdims=True) 77 | broadcast_shape = tf.where([True, True, False], tf.shape(xd), [0, 0, self.m]) 78 | xd = tf.broadcast_to(xd, broadcast_shape) / 2 79 | wtd = tf.einsum('bti,mi->btm', tf.convert_to_tensor(x, dtype=tf.float32), self.w) 80 | 81 | return tf.exp(wtd - xd) / math.sqrt(self.m) 82 | 83 | def single_attn(self, x): 84 | k, q, v = tf.split(self.kqv(x), 3, axis=-1) 85 | kp, qp = self.prm_exp(k), self.prm_exp(q) # (B, T, m), (B, T, m) 86 | D = tf.einsum('bti,bi->bt', qp, tf.math.reduce_sum(kp, axis=1)) # (B, T, m) * (B, m) -> (B, T, 1) 87 | D = tf.expand_dims(D, axis=2) 88 | kptv = tf.einsum('bin,bim->bnm', tf.convert_to_tensor(v, dtype=tf.float32), kp) # (B, emb, m) 89 | broadcast_shape = tf.where([True, True, False], tf.shape(D), [0, 0, self.hidden_size]) 90 | y = tf.einsum('bti,bni->btn', qp, kptv) / (tf.broadcast_to(D, broadcast_shape) + self.epsilon) # (B, T, 
emb) / Diag 91 | 92 | # skip connection 93 | y = v + self.dp(self.attn_output(y)) 94 | return y 95 | 96 | def call(self, x): 97 | x = self.norm1(x) 98 | x = self.single_attn(x) 99 | x = x + self.mlp(self.norm2(x)) 100 | 101 | return x 102 | -------------------------------------------------------------------------------- /benchmark/tensorrt/onnx_trt_test.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import os 18 | import sys 19 | import argparse 20 | import numpy as np 21 | # This import causes pycuda to automatically manage CUDA context creation and cleanup. 22 | import pycuda.autoinit 23 | import tensorrt as trt 24 | import onnx 25 | import torch 26 | 27 | import common 28 | from calibrator import DummyCalibrator 29 | 30 | 31 | # You can set the logger severity higher to suppress messages (or lower to display more messages). 32 | TRT_LOGGER = trt.Logger(trt.Logger.WARNING) 33 | 34 | 35 | def get_onnx_input_shape(model_path): 36 | model = onnx.load(model_path) 37 | input0 = model.graph.input[0] 38 | tensor_type = input0.type.tensor_type 39 | input_shape = [] 40 | for d in tensor_type.shape.dim: 41 | if d.HasField('dim_value'): 42 | input_shape.append(d.dim_value) 43 | else: 44 | input_shape.append(1) 45 | return input_shape 46 | 47 | # The Onnx path is used for Onnx models. 48 | def build_engine_onnx(model_file, quant=None): 49 | builder = trt.Builder(TRT_LOGGER) 50 | network = builder.create_network(common.EXPLICIT_BATCH) 51 | config = builder.create_builder_config() 52 | runtime = trt.Runtime(TRT_LOGGER) 53 | parser = trt.OnnxParser(network, TRT_LOGGER) 54 | 55 | config.max_workspace_size = common.GiB(1) 56 | 57 | if quant == 'int8' or quant == 'both': 58 | config.set_flag(trt.BuilderFlag.INT8) 59 | input_shape = get_onnx_input_shape(model_file) 60 | dummy_input = torch.rand(input_shape).numpy() 61 | config.int8_calibrator = DummyCalibrator(dummy_input, batch_size=1) 62 | if quant == 'fp16' or quant == 'both': 63 | config.set_flag(trt.BuilderFlag.FP16) 64 | 65 | # Load the Onnx model and parse it in order to populate the TensorRT network. 
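    # Note (added): assuming common.EXPLICIT_BATCH sets
    # trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH, the batch size is taken
    # from the ONNX graph itself, so a model exported with a dynamic batch
    # dimension would additionally need an optimization profile here.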
66 |     with open(model_file, 'rb') as model:
67 |         if not parser.parse(model.read()):
68 |             print('ERROR: Failed to parse the ONNX file.')
69 |             for error in range(parser.num_errors):
70 |                 print(parser.get_error(error))
71 |             return None
72 | 
73 |     plan = builder.build_serialized_network(network, config)
74 |     return runtime.deserialize_cuda_engine(plan)
75 | 
76 | 
77 | def main():
78 |     parser = argparse.ArgumentParser()
79 |     parser.add_argument('--model', required=True, type=str, help="ONNX model path")
80 |     parser.add_argument('--quant', default=None, choices=['int8', 'fp16', 'both'], help='quantization mode for inference')
81 |     parser.add_argument('--num_runs', default=50, type=int, help='number of inference runs')
82 |     parser.add_argument('--warmup_runs', default=20, type=int, help='number of warmup runs')
83 |     parser.add_argument('--topk', default=None, type=int, help='take the avg of top k latency to reduce variance')
84 |     parser.add_argument('--precision', default=3, type=int, help='the precision of latency result')
85 |     args = parser.parse_args()
86 | 
87 | 
88 |     # Build a TensorRT engine.
89 |     engine = build_engine_onnx(args.model, quant=args.quant)
90 |     # Inference is the same regardless of which parser is used to build the engine, since the model architecture is the same.
91 |     # Allocate buffers and create a CUDA stream.
92 |     inputs, outputs, bindings, stream = common.allocate_buffers(engine)
93 |     # Contexts are used to perform inference.
94 |     context = engine.create_execution_context()
95 | 
96 |     latency_list = []
97 |     for _ in range(args.warmup_runs):
98 |         common.do_inference_v2_with_timer(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
99 |     for _ in range(args.num_runs):
100 |         latency_ms = common.do_inference_v2_with_timer(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
101 |         latency_list.append(latency_ms)
102 | 
103 |     if args.topk:
104 |         latency_list.sort()
105 |         latency_list = latency_list[:args.topk]
106 | 
107 |     avg_ms = np.average(latency_list)
108 |     std_ms = np.std(latency_list)
109 |     print(f'{os.path.basename(args.model)} Avg latency {avg_ms:.{args.precision}f} ms Std {std_ms:.{args.precision}f} ms')
110 | 
111 | 
112 | if __name__ == '__main__':
113 |     main()
114 | 
--------------------------------------------------------------------------------
/deit_pruning/vendor/nn_pruning_v1/nn_pruning/training_patcher.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict
2 | 
3 | import torch.nn as nn
4 | 
5 | from .model_patcher import ModelPatcher
6 | from nn_pruning.model_structure import ModelStructure
7 | 
8 | class PatcherContextModule(nn.Module):
9 |     pass
10 | 
11 | class PatcherContext:
12 |     def __init__(self):
13 |         self.context_modules: Dict = {}
14 |         self.context_data = {}
15 | 
16 |     def set_context_data(self, data_key, data):
17 |         # print("set_context_data", data_key, data)
18 |         self.context_data[data_key] = data
19 | 
20 |     def set_context_data_dict(self, d: Dict[str, Any]):
21 |         for k, v in d.items():
22 |             self.set_context_data(k, v)
23 | 
24 |     def get_context_data(self, data_key,layerindex):
25 |         #print('self.context_data',self.context_data)
26 |         return self.context_data[layerindex][data_key]
27 | 
28 |     def enumerate_context_data(self):
29 |         for k, v in self.context_data.items():
30 |             yield k, v
31 | 
32 |     def has_module_context(self, key):
33 |         d = self.context_modules
34 |         for key_part in key:
35 |             if key_part not in d:
36 |                 return False
37 |             d = d[key_part]
38 |         return True
39 | 
40 |     def get_context_module(self, key) ->
PatcherContextModule: 41 | d = self.context_modules 42 | for key_part in key: 43 | d = d[key_part] 44 | return d 45 | 46 | def set_module_context(self, key, module_context: PatcherContextModule): 47 | d = self.context_modules 48 | for key_part in key[:-1]: 49 | if key_part not in d: 50 | d[key_part] = {} 51 | d = d[key_part] 52 | assert key[-1] not in d 53 | d[key[-1]] = module_context 54 | 55 | 56 | class ReplacementModule(nn.Module): 57 | def set_context(self, context): 58 | self._context = context 59 | 60 | def get_context_data(self, key,module_name): 61 | layerindex=int(module_name.split('.')[3]) 62 | #print(layerindex) 63 | 64 | return self._context.get_context_data(key,layerindex) 65 | 66 | 67 | class ModulePatcher: 68 | def __init__(self, context: PatcherContext): 69 | self.context = context 70 | 71 | def get_context_key(self, child_module_name, kind="default"): 72 | # Default implementation: each module has its own context 73 | return (kind, child_module_name) 74 | 75 | def create_context_module(self, child_module_name, child_module, key): 76 | raise NotImplementedError("Implement in subclass") 77 | 78 | def get_context_module(self, child_module_name, child_module, kind="default", **kwargs): 79 | key = self.get_context_key(child_module_name, kind=kind) 80 | if key == None: 81 | return None 82 | if self.context.has_module_context(key): 83 | return self.context.get_context_module(key) 84 | else: 85 | print('here context',child_module_name) 86 | module_context = self.create_context_module(child_module_name, child_module, key, **kwargs) 87 | self.context.set_module_context(key, module_context) 88 | return module_context 89 | 90 | def patch(self, child_module_name, child_module) -> ReplacementModule: 91 | raise NotImplementedError("Implement in subclass") 92 | 93 | def patch_and_connect(self, child_module_name, child_module) -> ReplacementModule: 94 | ret = self.patch(child_module_name, child_module) 95 | 96 | if ret is not None: 97 | ret.set_context(self.context) 98 | return ret 99 | 100 | 101 | class ModelDispatchingPatcher(ModelPatcher): 102 | def __init__(self): 103 | super().__init__() 104 | 105 | def add_patcher(self, pattern: str, patcher: ModulePatcher): 106 | patch_info = dict(patcher=patcher) 107 | super().add_pattern(pattern, patch_info) 108 | 109 | def new_child_module(self, child_module_name: str, child_module: nn.Module, patch_info: Dict): 110 | if patch_info.get("patcher") is not None: 111 | return patch_info["patcher"].patch_and_connect(child_module_name, child_module) 112 | 113 | def is_patchable(self, module_name, module, raiseError): 114 | return isinstance(module, nn.Linear) 115 | 116 | 117 | class LinearModelPatcher(ModelDispatchingPatcher): 118 | def __init__(self, patchers: Dict[str, ModulePatcher], model_structure: ModelStructure): 119 | super().__init__() 120 | self.model_structure = model_structure 121 | for layer_type, patcher in patchers.items(): 122 | layer = self.model_structure.LAYER_PATTERNS.get(layer_type) 123 | if layer is not None: 124 | layer_pattern = (self.model_structure.PATTERN_PREFIX + layer).replace(".", "\.") 125 | self.add_patcher(layer_pattern, patcher) 126 | -------------------------------------------------------------------------------- /deit_pruning/src/model.py: -------------------------------------------------------------------------------- 1 | from transformers import BertPreTrainedModel, BertModel 2 | from transformers.modeling_outputs import SequenceClassifierOutput 3 | import torch 4 | from torch import nn 5 | from torch.nn import 
BCEWithLogitsLoss 6 | 7 | 8 | class SwiftBERT(BertPreTrainedModel): 9 | def __init__(self, config): 10 | super().__init__(config) 11 | # self.num_labels = config.num_labels 12 | self.num_labels = 1 13 | self.config = config 14 | 15 | self.bert = BertModel(config) 16 | # self.dropout = nn.Dropout(config.hidden_dropout_prob) 17 | # self.relu = nn.ReLU() 18 | self.classifier = nn.Linear(config.hidden_size, self.num_labels) 19 | 20 | self.init_weights() 21 | 22 | def forward( 23 | self, 24 | input_ids=None, 25 | attention_mask=None, 26 | token_type_ids=None, 27 | position_ids=None, 28 | head_mask=None, 29 | inputs_embeds=None, 30 | labels=None, 31 | output_attentions=None, 32 | output_hidden_states=None, 33 | return_dict=None, 34 | ): 35 | r""" 36 | labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): 37 | Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., 38 | config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), 39 | If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 40 | """ 41 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 42 | 43 | outputs = self.bert( 44 | input_ids, 45 | attention_mask=attention_mask, 46 | token_type_ids=token_type_ids, 47 | position_ids=position_ids, 48 | head_mask=head_mask, 49 | inputs_embeds=inputs_embeds, 50 | output_attentions=output_attentions, 51 | output_hidden_states=output_hidden_states, 52 | return_dict=return_dict, 53 | ) 54 | 55 | last_hidden_state = outputs[0] 56 | logits = self.classifier(last_hidden_state[:, 0]) 57 | #print('logits',logits.shape,logits) 58 | # logits = self.relu(self.classifier(last_hidden_state[:, 0])) 59 | 60 | # pooled_output = outputs[1] 61 | 62 | # pooled_output = self.dropout(pooled_output) 63 | # logits = self.classifier(pooled_output) 64 | 65 | loss = None 66 | if labels is not None: 67 | # if self.config.problem_type is None: 68 | # if self.num_labels == 1: 69 | # self.config.problem_type = "regression" 70 | # elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): 71 | # self.config.problem_type = "single_label_classification" 72 | # else: 73 | # self.config.problem_type = "multi_label_classification" 74 | 75 | # if self.config.problem_type == "regression": 76 | # loss_fct = MSELoss() 77 | # if self.num_labels == 1: 78 | # loss = loss_fct(logits.squeeze(), labels.squeeze()) 79 | # else: 80 | # loss = loss_fct(logits, labels) 81 | # elif self.config.problem_type == "single_label_classification": 82 | # loss_fct = CrossEntropyLoss() 83 | # loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 84 | # elif self.config.problem_type == "multi_label_classification": 85 | # loss_fct = BCEWithLogitsLoss() 86 | # loss = loss_fct(logits, labels) 87 | loss_fct = BCEWithLogitsLoss() 88 | loss = loss_fct(logits[:, 0:1], labels[:, 0:1]) 89 | if not return_dict: 90 | output = (logits,) + outputs[2:] 91 | return ((loss,) + output) if loss is not None else output 92 | 93 | return SequenceClassifierOutput( 94 | loss=loss, 95 | logits=logits, 96 | hidden_states=outputs.hidden_states, 97 | attentions=outputs.attentions, 98 | ) 99 | 100 | class SwiftBERTOutput(SwiftBERT): 101 | def __init__(self, config): 102 | super().__init__(config) 103 | self.sigmoid = nn.Sigmoid() 104 | self.one = torch.nn.parameter.Parameter(torch.tensor(1), requires_grad=False) 105 | 106 | def forward(self, 107 | input_ids=None, 108 
| attention_mask=None, 109 | token_type_ids=None, 110 | position_ids=None, 111 | head_mask=None, 112 | inputs_embeds=None, 113 | labels=None, 114 | output_attentions=None, 115 | output_hidden_states=None, 116 | return_dict=None, 117 | ): 118 | outputs = super().forward( 119 | input_ids=input_ids, 120 | attention_mask=attention_mask, 121 | token_type_ids=torch.min(token_type_ids, self.one), 122 | position_ids=position_ids, 123 | head_mask=head_mask, 124 | inputs_embeds=inputs_embeds, 125 | labels=labels, 126 | output_attentions=output_attentions, 127 | output_hidden_states=output_hidden_states, 128 | return_dict=return_dict, 129 | ) 130 | 131 | return self.sigmoid(outputs.logits[0, 0]) 132 | -------------------------------------------------------------------------------- /deit_pruning/src/supernet.py: -------------------------------------------------------------------------------- 1 | from layers.super_bertlayers import VA_BertModel 2 | from transformers.modeling_outputs import SequenceClassifierOutput 3 | from transformers import BertPreTrainedModel 4 | import torch 5 | from torch import nn 6 | from torch.nn import BCEWithLogitsLoss 7 | 8 | 9 | class SwiftBERT(BertPreTrainedModel): 10 | def __init__(self, config): 11 | super().__init__(config) 12 | # self.num_labels = config.num_labels 13 | self.num_labels = 1 14 | self.config = config 15 | print(config) 16 | 17 | self.bert = VA_BertModel(config) 18 | # self.dropout = nn.Dropout(config.hidden_dropout_prob) 19 | # self.relu = nn.ReLU() 20 | self.classifier = nn.Linear(config.hidden_size, self.num_labels) 21 | 22 | #self.init_weights() 23 | 24 | def forward( 25 | self, 26 | input_ids=None, 27 | attention_mask=None, 28 | token_type_ids=None, 29 | position_ids=None, 30 | head_mask=None, 31 | inputs_embeds=None, 32 | labels=None, 33 | output_attentions=None, 34 | output_hidden_states=None, 35 | return_dict=None, 36 | ): 37 | r""" 38 | labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): 39 | Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., 40 | config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), 41 | If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
42 | """ 43 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 44 | 45 | outputs = self.bert( 46 | input_ids, 47 | attention_mask=attention_mask, 48 | token_type_ids=token_type_ids, 49 | position_ids=position_ids, 50 | head_mask=head_mask, 51 | inputs_embeds=inputs_embeds, 52 | output_attentions=output_attentions, 53 | output_hidden_states=output_hidden_states, 54 | return_dict=return_dict, 55 | ) 56 | 57 | last_hidden_state = outputs[0] 58 | logits = self.classifier(last_hidden_state[:, 0]) 59 | #print('logits',logits.shape,logits) 60 | # logits = self.relu(self.classifier(last_hidden_state[:, 0])) 61 | 62 | # pooled_output = outputs[1] 63 | 64 | # pooled_output = self.dropout(pooled_output) 65 | # logits = self.classifier(pooled_output) 66 | 67 | loss = None 68 | if labels is not None: 69 | # if self.config.problem_type is None: 70 | # if self.num_labels == 1: 71 | # self.config.problem_type = "regression" 72 | # elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): 73 | # self.config.problem_type = "single_label_classification" 74 | # else: 75 | # self.config.problem_type = "multi_label_classification" 76 | 77 | # if self.config.problem_type == "regression": 78 | # loss_fct = MSELoss() 79 | # if self.num_labels == 1: 80 | # loss = loss_fct(logits.squeeze(), labels.squeeze()) 81 | # else: 82 | # loss = loss_fct(logits, labels) 83 | # elif self.config.problem_type == "single_label_classification": 84 | # loss_fct = CrossEntropyLoss() 85 | # loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 86 | # elif self.config.problem_type == "multi_label_classification": 87 | # loss_fct = BCEWithLogitsLoss() 88 | # loss = loss_fct(logits, labels) 89 | loss_fct = BCEWithLogitsLoss() 90 | loss = loss_fct(logits[:, 0:1], labels[:, 0:1]) 91 | if not return_dict: 92 | output = (logits,) + outputs[2:] 93 | return ((loss,) + output) if loss is not None else output 94 | 95 | return SequenceClassifierOutput( 96 | loss=loss, 97 | logits=logits, 98 | hidden_states=outputs.hidden_states, 99 | attentions=outputs.attentions, 100 | ) 101 | 102 | class SwiftBERTOutput(SwiftBERT): 103 | def __init__(self, config): 104 | super().__init__(config) 105 | self.sigmoid = nn.Sigmoid() 106 | self.one = torch.nn.parameter.Parameter(torch.tensor(1), requires_grad=False) 107 | 108 | def forward(self, 109 | input_ids=None, 110 | attention_mask=None, 111 | token_type_ids=None, 112 | position_ids=None, 113 | head_mask=None, 114 | inputs_embeds=None, 115 | labels=None, 116 | output_attentions=None, 117 | output_hidden_states=None, 118 | return_dict=None, 119 | ): 120 | outputs = super().forward( 121 | input_ids=input_ids, 122 | attention_mask=attention_mask, 123 | token_type_ids=torch.min(token_type_ids, self.one), 124 | position_ids=position_ids, 125 | head_mask=head_mask, 126 | inputs_embeds=inputs_embeds, 127 | labels=labels, 128 | output_attentions=output_attentions, 129 | output_hidden_states=output_hidden_states, 130 | return_dict=return_dict, 131 | ) 132 | 133 | return self.sigmoid(outputs.logits[0, 0]) 134 | -------------------------------------------------------------------------------- /are_16_heads/pruning.py: -------------------------------------------------------------------------------- 1 | from math import sqrt 2 | from logger import logger 3 | 4 | 5 | def parse_head_pruning_descriptors( 6 | descriptors, 7 | reverse_descriptors=False, 8 | n_heads=None 9 | ): 10 | """Returns a dictionary mapping layers to the set of heads to prune 
in 11 | this layer""" 12 | to_prune = {} 13 | for descriptor in descriptors: 14 | layer, heads = descriptor.split(":") 15 | layer = int(layer) - 1 16 | heads = set(int(head) - 1 for head in heads.split(",")) 17 | if layer not in to_prune: 18 | to_prune[layer] = set() 19 | to_prune[layer].update(heads) 20 | # Reverse 21 | if reverse_descriptors: 22 | if n_heads is None: 23 | raise ValueError("You need to specify the total number of heads") 24 | for layer, heads in to_prune.items(): 25 | to_prune[layer] = set([head for head in range(n_heads) 26 | if head not in heads]) 27 | return to_prune 28 | 29 | 30 | def to_pruning_descriptor(to_prune): 31 | """Inverse of parse_head_pruning_descriptors""" 32 | descriptors = [f"{layer+1}:{','.join(str(head+1) for head in heads)}" 33 | for layer, heads in to_prune.items()] 34 | return " ".join(descriptors) 35 | 36 | 37 | def determine_pruning_sequence( 38 | prune_numbers, 39 | prune_percents, 40 | n_heads, 41 | n_layers, 42 | at_least_x_heads_per_layer=0, 43 | ): 44 | all_n_to_prune = prune_numbers 45 | if all_n_to_prune is None: 46 | # Compute the number of heads to prune on percentage if needed 47 | all_n_to_prune = [] 48 | for prune_percent in prune_percents: 49 | total_heads = n_heads * n_layers 50 | n_to_prune = int(total_heads * prune_percent / 100) 51 | # Make sure we keep at least one head per layer 52 | if at_least_x_heads_per_layer > 0: 53 | if n_to_prune > total_heads - at_least_x_heads_per_layer * n_layers: 54 | logger.warn( 55 | f"Can't prune {prune_percent}% ({n_to_prune})" 56 | f" heads AND keep at least {at_least_x_heads_per_layer}" 57 | " head(s) per layer. Will" 58 | f" prune only {(1-(at_least_x_heads_per_layer*n_layers)/total_heads)*100:.1f} " 59 | f"({total_heads-n_layers}) heads instead" 60 | ) 61 | n_to_prune = total_heads - at_least_x_heads_per_layer * n_layers 62 | all_n_to_prune.append(n_to_prune) 63 | break 64 | all_n_to_prune.append(n_to_prune) 65 | 66 | # We'll incrementally prune layers and evaluate 67 | all_n_to_prune = sorted(all_n_to_prune) 68 | n_to_prune_sequence = all_n_to_prune[:] 69 | for idx in range(1, len(all_n_to_prune)): 70 | n_to_prune_sequence[idx] = all_n_to_prune[idx] - all_n_to_prune[idx-1] 71 | # Verify that the total number of heads pruned stayed the same 72 | assert all_n_to_prune[-1] == sum(n_to_prune_sequence) 73 | return n_to_prune_sequence 74 | 75 | 76 | def what_to_prune( 77 | head_importance, 78 | n_to_prune, 79 | to_prune=None, 80 | at_least_x_heads_per_layer=0, 81 | rescale_by_number=False, 82 | ): 83 | head_importance = head_importance.clone() 84 | n_layers, n_heads = head_importance.size() 85 | to_prune = to_prune or {} 86 | if rescale_by_number: 87 | for layer in to_prune: 88 | #head_importance[layer] *= sqrt(n_layers / len(to_prune[layer])) 89 | head_importance[layer] *= sqrt(len(to_prune[layer]) / n_layers) 90 | # Sort heads by score 91 | heads_and_score = [ 92 | ((layer, head), head_importance[layer, head]) 93 | for layer in range(n_layers) 94 | for head in range(n_heads) 95 | ] 96 | heads_and_score = sorted(heads_and_score, key=lambda x: x[1]) 97 | sorted_heads = [head_and_score[0] 98 | for head_and_score in heads_and_score] 99 | # Ensure we don't delete all heads in a layer 100 | if at_least_x_heads_per_layer: 101 | # Remove the top scoring head in each layer 102 | to_protect = {l: 0 for l in range(n_layers)} 103 | filtered_sorted_heads = [] 104 | for layer, head in reversed(sorted_heads): 105 | if layer in to_protect: 106 | if to_protect[layer] < at_least_x_heads_per_layer: 107 | 
to_protect[layer] += 1 108 | continue 109 | else: 110 | to_protect.pop(layer) 111 | filtered_sorted_heads.insert(0, (layer, head)) 112 | sorted_heads = filtered_sorted_heads 113 | # layer/heads that were already pruned 114 | # Prune the lowest scoring heads 115 | sorted_heads = [ 116 | (layer, head) 117 | for (layer, head) in sorted_heads 118 | if layer not in to_prune or head not in to_prune[layer] 119 | ] 120 | # Update heads to prune 121 | for layer, head in sorted_heads[:n_to_prune]: 122 | if layer not in to_prune: 123 | to_prune[layer] = set() 124 | to_prune[layer].add(head) 125 | return to_prune 126 | -------------------------------------------------------------------------------- /deit_pruning/vendor/nn_pruning_v1/nn_pruning/hp_naming.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import copy 16 | import re 17 | 18 | 19 | class TrialShortNamer: 20 | PREFIX = "hp" 21 | DEFAULTS = {} 22 | NAMING_INFO = None 23 | MAX_LENGTH = 200 24 | 25 | @classmethod 26 | def set_defaults(cls, prefix, defaults): 27 | cls.PREFIX = prefix 28 | cls.DEFAULTS = defaults 29 | cls.build_naming_info() 30 | 31 | @staticmethod 32 | def length_tuple_enumerator(parts): 33 | for i in range(len(parts[0])): 34 | if len(parts) == 1: 35 | yield (i + 1,) 36 | else: 37 | for l in TrialShortNamer.length_tuple_enumerator(parts[1:]): 38 | yield (i + 1,) + l 39 | 40 | @staticmethod 41 | def names_enumerator(s, splitter="_", separator="_"): 42 | parts = s.split(splitter) 43 | for ls in TrialShortNamer.length_tuple_enumerator(parts): 44 | ps = [p[: ls[i]] for i, p in enumerate(parts)] 45 | yield separator.join(ps) 46 | 47 | @staticmethod 48 | def shortname_for_key(info, param_name): 49 | # We try to create a separatorless short name, but if there is a collision we have to fallback 50 | # to a separated short name 51 | separators = ["", "_"] 52 | 53 | for separator in separators: 54 | for shortname in TrialShortNamer.names_enumerator(param_name, separator=separator): 55 | if shortname not in info["reverse_short_param"]: 56 | info["short_param"][param_name] = shortname 57 | info["reverse_short_param"][shortname] = param_name 58 | return shortname 59 | 60 | return param_name 61 | 62 | @staticmethod 63 | def add_new_param_name(info, param_name): 64 | short_name = TrialShortNamer.shortname_for_key(info, param_name) 65 | info["short_param"][param_name] = short_name 66 | info["reverse_short_param"][short_name] = param_name 67 | 68 | @classmethod 69 | def build_naming_info(cls): 70 | if cls.NAMING_INFO is not None: 71 | return 72 | 73 | info = dict( 74 | short_word={}, 75 | reverse_short_word={}, 76 | short_param={}, 77 | reverse_short_param={}, 78 | ) 79 | 80 | field_keys = list(cls.DEFAULTS.keys()) 81 | 82 | for k in field_keys: 83 | cls.add_new_param_name(info, k) 84 | 85 | cls.NAMING_INFO = info 86 | 87 | @classmethod 88 | def 
shortname(cls, params): 89 | cls.build_naming_info() 90 | assert cls.PREFIX is not None 91 | name = [copy.copy(cls.PREFIX)] 92 | 93 | missing_defaults = {} 94 | for k, v in params.items(): 95 | if k not in cls.DEFAULTS: 96 | missing_defaults[k] = v 97 | 98 | if len(missing_defaults) != 0: 99 | print(missing_defaults) 100 | raise Exception(f"You should provide a default value for the params {missing_defaults}") 101 | 102 | for k, v in params.items(): 103 | if v == cls.DEFAULTS[k]: 104 | # The default value is not added to the name 105 | continue 106 | 107 | key = cls.NAMING_INFO["short_param"][k] 108 | 109 | if isinstance(v, bool): 110 | v = 1 if v else 0 111 | 112 | sep = "" if isinstance(v, (int, float)) else "-" 113 | e = f"{key}{sep}{v}" 114 | name.append(e) 115 | 116 | ret = "_".join(name).replace("/", "__") 117 | 118 | if len(ret) > cls.MAX_LENGTH: 119 | h = hex(abs(hash(ret)))[2:] 120 | ret = ret[:cls.MAX_LENGTH] 121 | ret = ret[:-len(h) - 2] 122 | ret = ret + "--" + h 123 | return ret 124 | 125 | @classmethod 126 | def parse_repr(cls, repr): 127 | repr = repr[len(cls.PREFIX) + 1 :] 128 | if repr == "": 129 | values = [] 130 | else: 131 | values = repr.split("_") 132 | 133 | parameters = {} 134 | 135 | for value in values: 136 | if "-" in value: 137 | p_k, p_v = value.split("-") 138 | else: 139 | p_k = re.sub("[0-9.]", "", value) 140 | p_v = float(re.sub("[^0-9.]", "", value)) 141 | 142 | key = cls.NAMING_INFO["reverse_short_param"][p_k] 143 | 144 | parameters[key] = p_v 145 | 146 | for k in cls.DEFAULTS: 147 | if k not in parameters: 148 | parameters[k] = cls.DEFAULTS[k] 149 | 150 | return parameters 151 | -------------------------------------------------------------------------------- /deit_pruning/src/get_latency.py: -------------------------------------------------------------------------------- 1 | from supernet import SwiftBERT 2 | import random 3 | import torch 4 | from glob import glob 5 | from pathlib import Path 6 | import json 7 | from transformers import BertConfig 8 | from supernet import SwiftBERTOutput 9 | import argparse 10 | baseconfig={ 11 | "_name_or_path": "google/bert_uncased_L-4_H-256_A-4", 12 | "architectures": [ 13 | "SwiftBERT" 14 | ], 15 | "attention_probs_dropout_prob": 0.1, 16 | "gradient_checkpointing": False, 17 | "hidden_act": "gelu", 18 | "hidden_dropout_prob": 0.1, 19 | "hidden_size": 256, 20 | "initializer_range": 0.02, 21 | "intermediate_size": 1024, 22 | "layer_norm_eps": 1e-12, 23 | "max_position_embeddings": 512, 24 | "model_type": "bert", 25 | "num_attention_heads": 4, 26 | "num_hidden_layers": 4, 27 | "pad_token_id": 0, 28 | "position_embedding_type": "absolute", 29 | "transformers_version": "4.7.0", 30 | "type_vocab_size": 2, 31 | "use_cache": True, 32 | "vocab_size": 30522 33 | } 34 | def gen_testconfigs(sample_num): 35 | heads_nums=[0.25,0.5,0.75,1] 36 | intermediate_sizes=[a/100.0 for a in list(range(1,101))] 37 | 38 | #sample_num=100 39 | 40 | for si in range(sample_num): 41 | curconfig=baseconfig 42 | 43 | ## generate a test file 44 | curconfig['layers']={} 45 | tag="" 46 | for layer in range(4): 47 | head_num=random.sample(heads_nums,1)[0] 48 | im_size=random.sample(intermediate_sizes,1)[0] 49 | print(layer,head_num,im_size) 50 | curconfig['layers'][layer]={} 51 | curconfig['layers'][layer]['heads']=int(head_num*4) 52 | curconfig['layers'][layer]['intermediate_size']=int(im_size*1024) 53 | tag+='h_'+str(head_num)+'_d_'+str(im_size)+'-' 54 | 55 | tag=tag[0:-1] 56 | fw=open('latency_data/'+tag+'.json','w') 57 | 
fw.write(json.dumps(curconfig,indent=4)) 58 | def gen_original(): 59 | curconfig=baseconfig 60 | 61 | ## generate a test file 62 | curconfig['layers']={} 63 | tag="" 64 | for layer in range(4): 65 | head_num=4 66 | im_size=1024 67 | print(layer,head_num,im_size) 68 | curconfig['layers'][layer]={} 69 | curconfig['layers'][layer]['heads']=head_num 70 | curconfig['layers'][layer]['intermediate_size']=im_size 71 | tag+='h_'+str(head_num)+'_d_'+str(im_size)+'-' 72 | 73 | tag='config' 74 | fw=open('latency_data/'+tag+'.json','w') 75 | fw.write(json.dumps(curconfig,indent=4)) 76 | 77 | def gen_uniform(): 78 | for h in range(1,5): 79 | for j in range(1,101): 80 | 81 | curconfig=baseconfig 82 | 83 | ## generate a test file 84 | curconfig['layers']={} 85 | tag="" 86 | for layer in range(4): 87 | head_num=h 88 | im_size=1024 89 | print(layer,head_num,im_size) 90 | curconfig['layers'][layer]={} 91 | curconfig['layers'][layer]['heads']=head_num 92 | curconfig['layers'][layer]['intermediate_size']=int(im_size*(j/100.0)) 93 | tag+='h_'+str(head_num/4.0)+'_d_'+str(j/100.0)+'-' 94 | 95 | print(tag) 96 | fw=open('latency_data/'+tag[0:-1]+'.json','w') 97 | fw.write(json.dumps(curconfig,indent=4)) 98 | 99 | 100 | 101 | #gen_testconfigs() 102 | 103 | 104 | parser = argparse.ArgumentParser() 105 | parser.add_argument("--model_dir", type=Path, default='latency_data') 106 | parser.add_argument("--nn_pruning", action='store_true') 107 | parser.add_argument("--no_opt", action='store_true') 108 | parser.add_argument("--force_opt", action='store_true') 109 | parser.add_argument("--max_ad_length", type=int, default=38) 110 | parser.add_argument("--output_name", type=str, default="output") 111 | parser.add_argument("--opset_version", type=int, default=13) 112 | 113 | args = parser.parse_args() 114 | assert not (args.no_opt and args.force_opt), "no_opt and force_opt cannot be set together." 
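# A minimal sketch (added; assumes onnxruntime is installed and that an export
# such as latency_data/config.onnx already exists) of how one exported file can
# be timed; kept in comments so it does not run on import:
#
#   import time
#   import numpy as np
#   import onnxruntime as ort
#   sess = ort.InferenceSession('latency_data/config.onnx')
#   feed = {i.name: np.ones((1, 38), dtype=np.int64) for i in sess.get_inputs()}
#   start = time.perf_counter()
#   for _ in range(100):
#       sess.run(None, feed)
#   print('avg ms:', (time.perf_counter() - start) / 100 * 1000)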
115 | # python src/onnx_export.py --model_dir ./results/dummy_mini/final/ 116 | def gen_onnx(config): 117 | myconfig=BertConfig.from_pretrained(config) 118 | print(myconfig) 119 | model = SwiftBERTOutput(myconfig) 120 | print(model) 121 | bert_config = model.config 122 | 123 | max_ad_length = args.max_ad_length 124 | 125 | print("==== export ====") 126 | output_name = config.replace(".json","") 127 | 128 | torch.onnx.export( 129 | model, 130 | (torch.tensor([1] * (max_ad_length)).view(-1, max_ad_length), 131 | torch.tensor([1] * (max_ad_length)).view(-1, max_ad_length), 132 | torch.tensor([1] * (max_ad_length)).view(-1, max_ad_length)), 133 | f'{output_name}.onnx', 134 | input_names=['input_ids', 'attention_mask', 'token_type_ids'], 135 | output_names=['score'], 136 | verbose=False, 137 | export_params=True, 138 | opset_version=args.opset_version, 139 | do_constant_folding=True 140 | ) 141 | 142 | ''' 143 | gen_original() 144 | gen_onnx('latency_data/config.json') 145 | ''' 146 | 147 | #gen_testconfigs(2000) 148 | #gen_original() 149 | #gen_uniform() 150 | filenames=glob('latency_data/**.json') 151 | for filename in filenames: 152 | gen_onnx(filename) 153 | -------------------------------------------------------------------------------- /modeling/models/vit.py: -------------------------------------------------------------------------------- 1 | from numpy.core import numeric 2 | import tensorflow as tf 3 | 4 | from einops.layers.tensorflow import Rearrange 5 | from modeling.layers.transformer_encoder import TransformerEncoderBlock, TransformerEncoderBlock_Pruned 6 | from modeling.layers.activation import gelu 7 | 8 | 9 | class ViT(tf.keras.Model): 10 | 11 | def __init__(self, *, image_size=224, patch_size=16, num_classes=1000, dim=768, depth=12, heads=12, mlp_dim=3072): 12 | super().__init__() 13 | assert image_size % patch_size == 0, 'image dimensions must be divisible by the patch size' 14 | num_patches = (image_size // patch_size) ** 2 15 | 16 | self.patch_size = patch_size 17 | self.dim = dim 18 | self.pos_embedding = self.add_weight("position_embeddings", 19 | shape=[num_patches + 1, 20 | dim], 21 | initializer=tf.keras.initializers.RandomNormal(), 22 | dtype=tf.float32) 23 | self.patch_to_embedding = tf.keras.layers.Dense(dim) 24 | self.cls_token = self.add_weight("cls_token", 25 | shape=[1, 26 | 1, 27 | dim], 28 | initializer=tf.keras.initializers.RandomNormal(), 29 | dtype=tf.float32) 30 | 31 | self.rearrange = Rearrange( 32 | 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=self.patch_size, p2=self.patch_size) 33 | 34 | self.transformer = TransformerEncoderBlock(dim, depth, heads, mlp_dim) 35 | 36 | self.to_cls_token = tf.identity 37 | 38 | self.mlp_head = tf.keras.Sequential([tf.keras.layers.Dense(mlp_dim, activation=gelu), 39 | tf.keras.layers.Dense(num_classes)]) 40 | 41 | @tf.function 42 | def call(self, img): 43 | shapes = tf.shape(img) 44 | 45 | x = self.rearrange(img) # [b, h * w, p * p * c] 46 | x = self.patch_to_embedding(x) # [b, h * w = n, dim] 47 | 48 | cls_tokens = tf.broadcast_to( 49 | self.cls_token, (shapes[0], 1, self.dim)) # [b, 1, dim] 50 | x = tf.concat((cls_tokens, x), axis=1) # [b, n + 1, dim] 51 | x += self.pos_embedding 52 | x = self.transformer(x) 53 | 54 | x = self.to_cls_token(x[:, 0]) # [b, dim] 55 | return self.mlp_head(x) # [b, num_classes] 56 | 57 | 58 | class ViT_Pruned(ViT): 59 | 60 | def __init__(self, *, image_size=224, patch_size=16, num_classes=1000, dim=768, depth=12, heads=12, mlp_dim=3072, head_size=64, prune_encoding='all_head12_ffn1.0'): 61 | 
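        '''
        prune_encoding grammar (mirrors decode_prune_encoding below):
          'all_head12_ffn1.0'                 -> every layer keeps 12 heads and 100% of mlp_dim
          'layerwise_h2-d1.0_h3-d0.5_h1-d0.5' -> one 'h<heads>-d<ffn ratio>' token per layer
        '''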
prune_setting, num_remain_heads, ffn_thresholds = self.decode_prune_encoding(prune_encoding) 62 | if prune_setting == 'all': 63 | num_remain_heads_list = [num_remain_heads for _ in range(depth)] 64 | intermediate_size_list = [int(ffn_thresholds * mlp_dim) for _ in range(depth)] 65 | else: # prune_setting == 'layerwise' 66 | assert(len(num_remain_heads) == depth and len(ffn_thresholds) == depth) 67 | num_remain_heads_list = num_remain_heads 68 | intermediate_size_list = [int(ffn_thresholds[i] * mlp_dim) for i in range(depth)] 69 | 70 | super().__init__(image_size=image_size, patch_size=patch_size, 71 | num_classes=num_classes, dim=dim, depth=depth, heads=heads, mlp_dim=mlp_dim) 72 | 73 | # override TransformerEncoderBlock 74 | self.transformer = TransformerEncoderBlock_Pruned(hidden_size=dim, num_layers=depth, num_remain_heads_list=num_remain_heads_list, 75 | intermediate_size_list=intermediate_size_list, head_size=head_size, norm_first=True) 76 | 77 | def decode_prune_encoding(self, prune_encoding: str): 78 | tokens = prune_encoding.split('_') 79 | print(tokens) 80 | prune_setting = tokens[0] 81 | assert prune_setting in ['layerwise', 'all'] 82 | if prune_setting == 'all': 83 | # e.g. prune_encoding = 'all_head12_ffn1.0': all layers remain 12 heads and 100% ffn 84 | head_setting = tokens[1] 85 | ffn_setting = tokens[2] 86 | num_heads = int(head_setting.replace('head', '')) 87 | ffn_threshold = float(ffn_setting.replace('ffn', '')) 88 | return prune_setting, num_heads, ffn_threshold 89 | else: # prune_setting == 'layerwise' 90 | # e.g. prune_encoding = 'layerwise_h2-d1.0_h3-d0.5_h1-d0.5' 91 | num_heads_list = [] 92 | ffn_threshold_list = [] 93 | for token in tokens[1: ]: 94 | hx, dx = token.split('-') 95 | num_heads_list.append(int(hx.replace('h', ''))) 96 | ffn_threshold_list.append(float(dx.replace('d', ''))) 97 | return prune_setting, num_heads_list, ffn_threshold_list 98 | 99 | 100 | def get_deit_base(): 101 | return ViT(dim=768, depth=12) 102 | 103 | 104 | def get_deit_small(): 105 | return ViT(dim=384, heads=6, mlp_dim=384 * 4) 106 | 107 | 108 | def get_deit_tiny(): 109 | return ViT(dim=192, heads=3, mlp_dim=192 * 4) 110 | -------------------------------------------------------------------------------- /deit_pruning/src/pytorch_prune/pruner.py: -------------------------------------------------------------------------------- 1 | # Still WIP 2 | from torch.nn.utils import prune 3 | import torch.nn 4 | import argparse 5 | from pathlib import Path 6 | from .block import BlockPruningMethod, block_pruning 7 | from .ln_smart import LnSmartStructured, ln_smart_structured 8 | 9 | def is_encoder(name, module): 10 | return isinstance(module, torch.nn.Linear) and 'bert.encoder' in name 11 | 12 | prune_mapping = { 13 | "random_unstructured": (prune.random_unstructured, prune.RandomUnstructured), 14 | "l1_unstructured": (prune.l1_unstructured, prune.L1Unstructured), 15 | "random_structured": (prune.random_structured, prune.RandomStructured), 16 | "ln_structured": (prune.ln_structured, prune.LnStructured), 17 | "block": (block_pruning, BlockPruningMethod), 18 | "ln_smart_structured": (ln_smart_structured, LnSmartStructured), 19 | } 20 | 21 | def argbuilder(args): 22 | if "unstructured" in args.func or args.func == 'block': 23 | block_args = {} 24 | if args.func == "block": 25 | assert args.block_row is not None and args.block_col is not None 26 | block_args['block_row'] = args.block_row 27 | block_args['block_col'] = args.block_col 28 | if args.ln is not None: 29 | # use fro by default 30 | 
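                # (added note) BlockPruningMethod scores each block by this norm;
                # when --ln is omitted it falls back to the method's own default,
                # the Frobenius norm, matching the hybrid path below.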
block_args['n'] = args.ln 31 | return {**block_args, **{ 32 | "amount": args.amount 33 | }} 34 | else: 35 | ret = { 36 | "amount": args.amount, 37 | } 38 | if args.func != "ln_smart_structured": 39 | ret["dim"] = args.dim 40 | if "ln" in args.func: 41 | ret = {**ret, **{ 42 | "n": args.ln 43 | }} 44 | return ret 45 | 46 | def isInt(s): 47 | try: 48 | int(s) 49 | return True 50 | except ValueError: 51 | return False 52 | 53 | def norm_converter(ln: str): 54 | if isInt(ln): 55 | return int(ln) 56 | elif "inf" in ln: 57 | return float(ln) 58 | else: 59 | return ln 60 | 61 | if __name__ == "__main__": 62 | from ..model import SwiftBERT 63 | from src.inspector.get_sparsity import show 64 | from ..utils import set_random 65 | 66 | parser = argparse.ArgumentParser() 67 | parser.add_argument("--func", type=str) 68 | parser.add_argument("--global", dest='glob', action='store_true') 69 | parser.add_argument("--amount", type=float, default=0.5) 70 | parser.add_argument("--deit_model_name", type=Path, required=True) 71 | parser.add_argument("--output_dir", type=Path, default='./results/playground/torch_pruned/') 72 | parser.add_argument("--ln", type=str, default=None) 73 | parser.add_argument("--dim", type=int, default=None) 74 | parser.add_argument("--block_row", type=int, default=None) 75 | parser.add_argument("--block_col", type=int, default=None) 76 | parser.add_argument("--seed", type=int, default=12345) 77 | parser.add_argument("--hybrid", action='store_true', help='It overwrites func, global & ln options') 78 | 79 | args = parser.parse_args() 80 | # python -m src.pytorch_prune.pruner --deit_model_name ./results/playground/final 81 | # python -m src.pytorch_prune.pruner --deit_model_name ./results/AdsSwiftBERT/final/ --func random_unstructured --amount 0.5 --output_dir ./results/AdsSwiftBERT/random_unstructured_0.5 82 | 83 | set_random(args.seed) 84 | if args.ln is not None: 85 | args.ln = norm_converter(args.ln) 86 | 87 | model = SwiftBERT.from_pretrained(args.deit_model_name) 88 | 89 | if args.hybrid: 90 | for name, module in model.named_modules(): 91 | if is_encoder(name, module): 92 | if "attention" in name: 93 | block_pruning(module, 'weight', amount=args.amount, block_row=args.block_row, block_col=args.block_col, n='fro') 94 | elif "dense" in name: 95 | if args.dim is None: 96 | ln_smart_structured(module, 'weight', amount=args.amount, n=1) 97 | else: 98 | prune.ln_structured(module, 'weight', amount=args.amount, n=1, dim=args.dim) 99 | else: 100 | assert 0 101 | prune.remove(module, 'weight') 102 | else: 103 | assert args.func in [ 104 | "random_unstructured", 105 | "l1_unstructured", 106 | "random_structured", 107 | "ln_structured", 108 | "block", 109 | "ln_smart_structured" 110 | ] 111 | 112 | if args.glob: 113 | assert "_structured" not in args.func 114 | 115 | if "_structured" in args.func: 116 | if args.func != "ln_smart_structured": 117 | assert args.dim is not None 118 | assert not ("ln" in args.func and args.ln is None) 119 | 120 | # start! 
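    # Flow (added note): each matched encoder nn.Linear is first pruned, which
    # reparametrizes it into (weight_orig, weight_mask); prune.remove() then
    # folds the mask into the weight so the saved checkpoint holds real zeros.
    # Hypothetical hybrid invocation (paths and amount are illustrative):
    #   python -m src.pytorch_prune.pruner --deit_model_name ./results/AdsSwiftBERT/final \
    #       --hybrid --amount 0.5 --block_row 16 --block_col 16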
121 | if not args.glob:
122 | for name, module in model.named_modules():
123 | if is_encoder(name, module):
124 | prune_mapping[args.func][0](module, 'weight', **argbuilder(args))
125 | prune.remove(module, 'weight')
126 | else:
127 | parameters_to_prune = []
128 | for name, module in model.named_modules():
129 | if is_encoder(name, module):
130 | parameters_to_prune.append((module, 'weight'))
131 | prune.global_unstructured(
132 | parameters_to_prune,
133 | pruning_method=prune_mapping[args.func][1],
134 | **argbuilder(args)
135 | )
136 | for name, module in model.named_modules():
137 | if is_encoder(name, module):
138 | prune.remove(module, 'weight')
139 |
140 | # check sparsity
141 | show(model, skip_embedding=True, skip_layernorm=True, skip_bias=True)
142 |
143 | # export
144 | model.save_pretrained(args.output_dir)
145 | -------------------------------------------------------------------------------- /deit_pruning/src/data.py: --------------------------------------------------------------------------------
1 | import torch
2 | import random
3 |
4 | # The training set is ~300 GB, so loading it into memory all at once is impractical.
5 | # That's why we use an IterableDataset here.
6 | class AdIterableDataset(torch.utils.data.IterableDataset):
7 | def __init__(self, input_file, rids=False):
8 | super().__init__()
9 | self.file = input_file
10 |
11 | # Transformers already implements IterableDatasetShard,
12 | # so we don't need to handle sharding ourselves;
13 | # otherwise the actual steps taken could be smaller than max_steps.
14 |
15 | # if distributed:
16 | # self.rank = torch.distributed.get_rank()
17 | # self.world_size = torch.distributed.get_world_size()
18 | # else:
19 | # self.rank = 0
20 | # self.world_size = 1
21 |
22 | self.rids = rids
23 |
24 | def __iter__(self):
25 | # worker_info = torch.utils.data.get_worker_info()
26 | # num_workers = 1 if worker_info is None else worker_info.num_workers
27 | # local_worker_id = 0 if worker_info is None else worker_info.id
28 |
29 | # skip = self.world_size * num_workers
30 | # idx = self.rank * num_workers + local_worker_id
31 | # worker_id = self.rank * num_workers + local_worker_id
32 |
33 | with open(self.file, "r", encoding='utf-8') as reader:
34 | for entry in reader:
35 | # if idx % skip == worker_id:
36 |
37 | line = entry.rstrip("\n").split("\t")
38 | labels = torch.tensor([int(line[0])], dtype=torch.long) if self.rids else torch.tensor([float(line[0])], dtype=torch.float)
39 | input_ids = torch.tensor(list(map(int, line[1].split(" "))), dtype=torch.long)
40 | train_data = {'labels': labels, 'input_ids': input_ids}
41 | # idx = idx + 1
42 | yield train_data
43 |
44 | # else:
45 | # idx = idx + 1
46 | # continue
47 |
48 | # Works like transformers' tokenizer: builds attention_mask and token_type_ids from raw input_ids.
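# A worked example for get_token_att_ids below (ids illustrative, assuming a
# BERT-style vocabulary where 102 is [SEP] and 0 is padding):
#   token_ids      = [[101,  5, 102,  7, 102,  0]]
#   attention_mask = [[  1,  1,   1,  1,   1,  0]]   (torch.min(token_ids, one))
#   token_type_ids = [[  0,  0,   0,  1,   1,  0]]   (flips to 1 after the first [SEP]; padding stays 0)
# With type_count != 2, the final min() is taken against zero, so token_type_ids collapses to all zeros.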
49 | def get_token_att_ids(zero, one, token_ids, type_count=2):
50 | attention_mask = torch.min(token_ids, one)
51 | token_type_ids = torch.nn.functional.pad(torch.cumsum( torch.where(token_ids[:,0:-1] == 102, one, zero), dim=1), pad=(1,0), mode='constant', value=0)
52 | if type_count == 2:
53 | token_type_ids = torch.min(torch.mul(attention_mask, token_type_ids), one)
54 | else:
55 | token_type_ids = torch.min(torch.mul(attention_mask, token_type_ids), zero)
56 | return attention_mask, token_type_ids
57 |
58 | # For small datasets (e.g. the validation set)
59 | class AdDataset(torch.utils.data.Dataset):
60 | def __init__(self, input_file, model_structure="EarlyCrossModel", distributed=False, rids=False):
61 | assert model_structure == "EarlyCrossModel"
62 | assert distributed is False # haven't tested distributed training yet
63 | # self.zero = torch.nn.parameter.Parameter(torch.tensor(0), requires_grad=False)
64 | # self.one = torch.nn.parameter.Parameter(torch.tensor(1), requires_grad=False)
65 | self.data = []
66 | with open(input_file, "r", encoding='utf-8') as reader:
67 | for entry in reader:
68 | line = entry.rstrip("\n").split("\t")
69 | labels = torch.tensor([int(line[0])], dtype=torch.long) if rids else torch.tensor([float(line[0])], dtype=torch.float)
70 | # labels = torch.tensor([int(line[0]), 1-int(line[0])], dtype=torch.long) if rids else torch.tensor([float(line[0]), 1-float(line[0])], dtype=torch.float)
71 | input_ids = torch.tensor(list(map(int, line[1].split(" "))), dtype=torch.long)
72 | # attention_mask, token_type_ids = get_token_att_ids(self.zero, self.one, input_ids.unsqueeze(0)) # TODO: optimize
73 | # train_data = [labels, input_ids, attention_mask[0], token_type_ids[0]]
74 | train_data = [labels, input_ids]
75 | self.data.append(train_data)
76 |
77 | def __getitem__(self, idx):
78 | # labels, input_ids, attention_mask, token_type_ids = self.data[idx]
79 | labels, input_ids = self.data[idx]
80 |
81 | return {
82 | 'input_ids': input_ids,
83 | # 'token_type_ids': token_type_ids,
84 | # 'attention_mask': attention_mask,
85 | 'labels': labels
86 | }
87 |
88 | def __len__(self):
89 | return len(self.data)
90 |
91 | # Shuffling IterableDataset
92 | class ShuffleDataset(torch.utils.data.IterableDataset):
93 | def __init__(self, dataset, buffer_size):
94 | super().__init__()
95 | self.dataset = dataset
96 | self.buffer_size = buffer_size
97 |
98 | def set_epoch(self, seed):
99 | random.seed(seed)
100 |
101 | def __iter__(self):
102 | shufbuf = []
103 | try:
104 | dataset_iter = iter(self.dataset)
105 | for i in range(self.buffer_size):
106 | shufbuf.append(next(dataset_iter))
107 | except StopIteration: # dataset has fewer items than buffer_size
108 | self.buffer_size = len(shufbuf)
109 |
110 | try:
111 | while True:
112 | try:
113 | item = next(dataset_iter)
114 | evict_idx = random.randint(0, self.buffer_size - 1)
115 | yield shufbuf[evict_idx]
116 | shufbuf[evict_idx] = item
117 | except StopIteration:
118 | if len(shufbuf) > 0:
119 | yield shufbuf.pop()
120 | else:
121 | break
122 | except GeneratorExit:
123 | pass
124 |
125 | # Count the lines of a large file by reading it in chunks,
126 | # which saves memory.
127 | def _make_gen(reader):
128 | size = 1024 * 1024
129 | b = reader(size)
130 | while b:
131 | yield b
132 | b = reader(size)
133 |
134 | def rawgencount(filename):
135 | with open(filename, 'rb') as f:
136 | f_gen = _make_gen(f.raw.read)
137 | return sum(buf.count(b'\n') for buf in f_gen)
138 | -------------------------------------------------------------------------------- /experiments/D1207_tflite_quant_cnn_test.py: 
--------------------------------------------------------------------------------
1 | import argparse
2 | import re
3 | import os
4 | import sys
5 | import subprocess
6 |
7 |
8 | sys.path.insert(0, f'{os.path.dirname(sys.argv[0])}/..')
9 | from modeling.models.cnn_zoo import cnn_zoo_dict
10 | from utils import tf2tflite_dir
11 |
12 | class ADB:
13 | def __init__(self, serino):
14 | self.serino = serino
15 |
16 | def push(self, src, dst):
17 | subprocess.run(f'adb -s {self.serino} push {src} {dst}', shell=True)
18 |
19 | def pull(self, src, dst):
20 | subprocess.run(f'adb -s {self.serino} pull {src} {dst}', shell=True)
21 |
22 | def remove(self, dst):
23 | subprocess.run(f'adb -s {self.serino} shell rm {dst}', shell=True)
24 |
25 | def run_cmd(self, cmd):
26 | result = subprocess.check_output(f'adb -s {self.serino} shell {cmd}', shell=True).decode('utf-8')
27 | print(result)
28 | return result
29 |
30 | class TfliteCnnTester:
31 | def __init__(self, adb: ADB, cnn_zoo_dict: dict, model_zoo_dir: str):
32 | self.adb = adb
33 | self.cnn_zoo_dict = cnn_zoo_dict
34 | self.tf_model_dir = os.path.join(model_zoo_dir, 'tf_model', 'quant_cnn_test')
35 | self.tflite_model_dir = os.path.join(model_zoo_dir, 'tflite_model', 'quant_cnn_test')
36 |
37 | def _get_tf_path(self, model_name) -> str:
38 | return os.path.join(self.tf_model_dir, model_name+'.tf')
39 |
40 | def _get_fp32_tflite_path(self, model_name) -> str:
41 | return os.path.join(self.tflite_model_dir, 'fp32', model_name+'.tflite')
42 |
43 | def _get_int8_tflite_path(self, model_name) -> str:
44 | return os.path.join(self.tflite_model_dir, 'int8', model_name+'_quant_int8.tflite')
45 |
46 | def _export_tf(self, ):
47 | print('===== Exporting TF Saved Model =====')
48 | for model_name, generator_func in self.cnn_zoo_dict.items():
49 | model = generator_func()
50 | model.save(self._get_tf_path(model_name))
51 |
52 | def _convert(self, ):
53 | print('===== Converting TFLite =====')
54 | tf2tflite_dir(self.tf_model_dir, os.path.join(self.tflite_model_dir, 'fp32'), 'None')
55 |
56 | def _quantize(self, ):
57 | print('===== Quantizing =====')
58 | tf2tflite_dir(self.tf_model_dir, os.path.join(self.tflite_model_dir, 'int8'), 'int8')
59 |
60 | def _fetch_latency(self, text: str, target='cpu_fp32'):
61 | if target in ['cpu_fp32', 'cpu_int8']:
62 | match = re.findall(r'avg=\d+\.\d+|avg=\d+', text)[-1]
63 | return float(match[len('avg='): ]) / 1000
64 | else:
65 | match = re.findall(r'Total time - \d+\.\d+ms|Total time - \d+ms', text)[-1]
66 | return float(match[len('Total time - '): -len('ms')])
67 |
68 | def _benchmark_single(self, model_path, target='cpu_fp32'):
69 | assert target in ['cpu_fp32', 'cpu_int8', 'gpu_fp32', 'gpu_fp16']
70 | file_name = os.path.basename(model_path)
71 | dst_path = f'/sdcard/{file_name}'
72 | avg_ms, output_text = 0.0, ''
73 | self.adb.push(model_path, dst_path)
74 |
75 | try:
76 | if target in ['cpu_fp32', 'cpu_int8']:
77 | output_text = self.adb.run_cmd(
78 | f'taskset 70 /data/local/tmp/benchmark_model_plus_flex_r27 --graph={dst_path} --num_runs=30 --warmup_runs=10 --use_xnnpack=false --num_threads=1')
79 | else:
80 | output_text = self.adb.run_cmd(f'/data/local/tmp/performance_profiling_plus_f32 {dst_path} {"F32" if target == "gpu_fp32" else "F16"}')
81 | except subprocess.CalledProcessError:
82 | pass
83 |
84 | self.adb.remove(dst_path)
85 | avg_ms = self._fetch_latency(output_text, target) if output_text else 0.0
86 | return avg_ms
87 |
88 | def _benchmark(self, ):
89 | print('===== Benchmarking =====')
90 | name_list = list(self.cnn_zoo_dict.keys())
91 | result_dict
= {} 92 | for model_name in name_list: 93 | result_dict[model_name] = {} 94 | 95 | for target in ['cpu_fp32', 'gpu_fp32', 'gpu_fp16']: 96 | for model_name in name_list: 97 | tflite_path = self._get_fp32_tflite_path(model_name) 98 | avg_ms = self._benchmark_single(tflite_path, target) 99 | result_dict[model_name][target] = round(avg_ms, 2 if target == 'cpu_fp32' else 5) 100 | for model_name in name_list: 101 | tflite_path = self._get_int8_tflite_path(model_name) 102 | avg_ms = self._benchmark_single(tflite_path, 'cpu_int8') 103 | result_dict[model_name]['cpu_int8'] = round(avg_ms, 2) 104 | 105 | print('===============================') 106 | print(' SUMMARY') 107 | print('===============================') 108 | print(*name_list) 109 | for target in ['cpu_fp32', 'cpu_int8', 'gpu_fp32', 'gpu_fp16']: 110 | print(target, *[result_dict[k][target] for k in name_list]) 111 | 112 | def run(self, ): 113 | self._export_tf() 114 | self._convert() 115 | self._quantize() 116 | self._benchmark() 117 | 118 | 119 | def main(): 120 | parser = argparse.ArgumentParser() 121 | parser.add_argument('--model_zoo_dir', default='models', help='root dir to save tf and tflite models') 122 | parser.add_argument('--serial_number', default='98281FFAZ009SV', help='phone serial number') 123 | args = parser.parse_args() 124 | 125 | adb = ADB(args.serial_number) 126 | tflite_cnn_tester = TfliteCnnTester(adb, cnn_zoo_dict, args.model_zoo_dir) 127 | tflite_cnn_tester.run() 128 | 129 | if __name__ == '__main__': 130 | main() -------------------------------------------------------------------------------- /deit_pruning/src/trainer.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from transformers import Trainer 3 | import torch.nn as nn 4 | import torch 5 | from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union 6 | from nn_pruning.sparse_trainer import SparseTrainer 7 | from data import get_token_att_ids 8 | from utils import get_distil_loss 9 | 10 | 11 | @dataclass 12 | class DistilTrainingArguments: 13 | teacher_model: torch.nn.Module 14 | distil_temperature: float 15 | alpha_distil: float 16 | 17 | 18 | class TrainerWithTokenizer(Trainer): 19 | def __init__(self, *args, **kwargs): 20 | Trainer.__init__(self, *args, **kwargs) 21 | self.zero = torch.nn.parameter.Parameter( 22 | torch.tensor(0), requires_grad=False) 23 | self.one = torch.nn.parameter.Parameter( 24 | torch.tensor(1), requires_grad=False) 25 | 26 | def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor: 27 | attention_mask, token_type_ids = get_token_att_ids( 28 | self.zero, self.one, inputs['input_ids']) 29 | # print('train with tokenizer',inputs['input_ids']) 30 | inputs['attention_mask'] = attention_mask 31 | inputs['token_type_ids'] = token_type_ids 32 | 33 | return super().training_step(model, inputs) 34 | 35 | def prediction_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]], 36 | prediction_loss_only: bool, ignore_keys: Optional[List[str]] = None,) \ 37 | -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: 38 | attention_mask, token_type_ids = get_token_att_ids( 39 | self.zero, self.one, inputs['input_ids']) 40 | 41 | inputs['attention_mask'] = attention_mask 42 | inputs['token_type_ids'] = token_type_ids 43 | # print('predict with tokenizer',inputs['input_ids']) 44 | 45 | return super().prediction_step(model, inputs, 
prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys) 46 | 47 | 48 | class SparseWithoutTeacherTrainer(SparseTrainer, Trainer): 49 | def __init__(self, sparse_args, *args, **kwargs): 50 | Trainer.__init__(self, *args, **kwargs) 51 | SparseTrainer.__init__(self, sparse_args) 52 | 53 | def compute_loss(self, model, inputs, return_outputs=False): 54 | """ 55 | We override the default loss in SparseTrainer because it throws an 56 | error when run without distillation 57 | """ 58 | outputs = model(**inputs) 59 | 60 | # Save past state if it exists 61 | # TODO: this needs to be fixed and made cleaner later. 62 | if self.args.past_index >= 0: 63 | self._past = outputs[self.args.past_index] 64 | 65 | # We don't use .loss here since the model may return tuples instead of ModelOutput. 66 | loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] 67 | self.metrics["ce_loss"] += float(loss.mean()) 68 | self.loss_counter += 1 69 | return (loss, outputs) if return_outputs else loss 70 | 71 | 72 | class SparserWithTeacherTrainer(SparseTrainer, Trainer): 73 | def __init__(self, sparse_args, distil_args: DistilTrainingArguments, *args, **kwargs): 74 | Trainer.__init__(self, *args, **kwargs) 75 | SparseTrainer.__init__(self, sparse_args) 76 | self.teacher_model = distil_args.teacher_model 77 | self.alpha_distil = distil_args.alpha_distil 78 | self.distil_temperature = distil_args.distil_temperature 79 | 80 | def compute_loss(self, model, inputs, return_outputs=False): 81 | with torch.no_grad(): 82 | teacher_logits = self.teacher_model(**inputs).logits 83 | outputs = model(**inputs) 84 | # Save past state if it exists 85 | # TODO: this needs to be fixed and made cleaner later. 86 | if self.args.past_index >= 0: 87 | self._past = outputs[self.args.past_index] 88 | 89 | # We don't use .loss here since the model may return tuples instead of ModelOutput. 90 | loss = outputs['loss'] if isinstance(outputs, dict) else outputs[0] 91 | self.metrics['ce_loss'] += float(loss.mean()) 92 | distil_loss = get_distil_loss(outputs.logits, teacher_logits, self.distil_temperature, 'kldiv') 93 | self.metrics['distil_loss'] += float(distil_loss) 94 | loss = (1 - self.alpha_distil) * loss + self.alpha_distil * distil_loss 95 | self.loss_counter += 1 96 | 97 | return (loss, outputs) if return_outputs else loss 98 | 99 | 100 | class TrainerWithTeacher(Trainer): 101 | def __init__(self, distil_args: DistilTrainingArguments, *args, **kwargs): 102 | Trainer.__init__(self, *args, **kwargs) 103 | self.teacher_model = distil_args.teacher_model 104 | self.alpha_distil = distil_args.alpha_distil 105 | self.distil_temperature = distil_args.distil_temperature 106 | 107 | def compute_loss(self, model, inputs, return_outputs=False): 108 | with torch.no_grad(): 109 | teacher_logits = self.teacher_model(**inputs).logits 110 | outputs = model(**inputs) 111 | # Save past state if it exists 112 | # TODO: this needs to be fixed and made cleaner later. 113 | if self.args.past_index >= 0: 114 | self._past = outputs[self.args.past_index] 115 | 116 | # We don't use .loss here since the model may return tuples instead of ModelOutput. 
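# Illustrative arithmetic for the distillation blend computed below
# (numbers hypothetical): with alpha_distil = 0.9,
#   loss = (1 - 0.9) * ce_loss + 0.9 * distil_loss,
# so ce_loss = 0.50 and distil_loss = 0.20 give 0.1 * 0.50 + 0.9 * 0.20 = 0.23.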
117 | loss = outputs['loss'] if isinstance(outputs, dict) else outputs[0] 118 | distil_loss = get_distil_loss(outputs.logits, teacher_logits, self.distil_temperature, 'kldiv') 119 | loss = (1 - self.alpha_distil) * loss + self.alpha_distil * distil_loss 120 | 121 | return (loss, outputs) if return_outputs else loss -------------------------------------------------------------------------------- /draw.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | def draw_macs_accuracy_curve(): 6 | @dataclass 7 | class ModelInfo: 8 | b_macs : float = 0.0 9 | acc : float = 0.0 10 | m_params : float = 0.0 11 | 12 | 13 | modelinfo_dict = dict( 14 | deit_base = ModelInfo(17.7, 81.8), 15 | deit_small = ModelInfo(4.64, 79.9), 16 | deit_tiny = ModelInfo(1.28, 72.2), 17 | t2t_vit_14 = ModelInfo(4.8, 81.5), 18 | t2t_vit_12 = ModelInfo(1.8, 76.5), 19 | t2t_vit_10 = ModelInfo(1.5, 75.2), 20 | t2t_vit_7 = ModelInfo(1.1, 71.7), 21 | swin_base = ModelInfo(15.4, 83.5), 22 | swin_small = ModelInfo(8.7, 83), 23 | swin_tiny = ModelInfo(4.5, 81.3), 24 | autoformer_base = ModelInfo(11, 82.4), 25 | autoformer_small = ModelInfo(5.1, 81.7), 26 | autoformer_tiny = ModelInfo(1.3, 74.7), 27 | efficientnet_b7 = ModelInfo(37, 84.3), 28 | efficientnet_b6 = ModelInfo(19, 84), 29 | efficientnet_b5 = ModelInfo(9.9, 83.6), 30 | efficientnet_b4 = ModelInfo(4.2, 82.9), 31 | efficientnet_b3 = ModelInfo(1.8, 81.6), 32 | efficientnet_b2 = ModelInfo(1.0, 80.1), 33 | efficientnet_b1 = ModelInfo(0.7, 79.1), 34 | efficientnet_b0 = ModelInfo(0.39, 77.1), 35 | resnet_152 = ModelInfo(11, 77.8), 36 | resnet_101 = ModelInfo(7.9, 77.4), 37 | resnet_50 = ModelInfo(4.1, 76), 38 | mobilenet_v2 = ModelInfo(0.3, 72), 39 | mobilenet_v3_large = ModelInfo(0.22, 75.6), 40 | proxyless_mobile = ModelInfo(0.32, 74.6) 41 | ) 42 | 43 | 44 | deit_list = ['deit_tiny', 'deit_small', 'deit_base'] 45 | t2t_vit_list = ['t2t_vit_7', 't2t_vit_10', 't2t_vit_12', 't2t_vit_14'] 46 | swin_list = ['swin_tiny', 'swin_small', 'swin_base'] 47 | autoformer_list = ['autoformer_base', 'autoformer_small', 'autoformer_tiny'] 48 | efficientnet_list = [f'efficientnet_b{v}' for v in range(0, 8)] 49 | resnet_list = ['resnet_50', 'resnet_101', 'resnet_152'] 50 | mobilenet_list = ['mobilenet_v2', 'mobilenet_v3_large'] 51 | proxyless_mobile_list = ['proxyless_mobile'] 52 | 53 | plt.plot([modelinfo_dict[x].b_macs for x in deit_list], 54 | [modelinfo_dict[x].acc for x in deit_list], 55 | label='deit', c='#0099ff', marker='^') 56 | plt.plot([modelinfo_dict[x].b_macs for x in t2t_vit_list], 57 | [modelinfo_dict[x].acc for x in t2t_vit_list], 58 | label='t2t_vit', c='#4d4dff', marker='^') 59 | plt.plot([modelinfo_dict[x].b_macs for x in swin_list], 60 | [modelinfo_dict[x].acc for x in swin_list], 61 | label='swin transformer', c='#944dff', marker='^') 62 | plt.plot([modelinfo_dict[x].b_macs for x in autoformer_list], 63 | [modelinfo_dict[x].acc for x in autoformer_list], 64 | label='autoformer', c='#0099cc', marker='^') 65 | 66 | plt.plot([modelinfo_dict[x].b_macs for x in efficientnet_list], 67 | [modelinfo_dict[x].acc for x in efficientnet_list], 68 | label='efficientnet', c='#cc3300', marker='o') 69 | plt.plot([modelinfo_dict[x].b_macs for x in resnet_list], 70 | [modelinfo_dict[x].acc for x in resnet_list], 71 | label='resnet', c='#e67300', marker='o') 72 | plt.plot([modelinfo_dict[x].b_macs for x in mobilenet_list], 73 | [modelinfo_dict[x].acc for x in 
mobilenet_list], 74 | label='mobilenet', c='#ffaa00', marker='o') 75 | plt.plot([modelinfo_dict[x].b_macs for x in proxyless_mobile_list], 76 | [modelinfo_dict[x].acc for x in proxyless_mobile_list], 77 | label='proxyless_mobile', c='#ff4d4d', marker='o') 78 | 79 | plt.title('Model MACs and Accuracy') 80 | plt.xlabel('Billion MACs') 81 | plt.ylabel('Accuracy (%)') 82 | plt.legend() 83 | plt.savefig('tmp.png') 84 | 85 | 86 | def draw_are16heads_pruned_heads(): 87 | def pruned_head_str_to_dict(str: str): 88 | layers = str.split(' ') 89 | rv = {} 90 | for item in layers: 91 | key, value = item.split(':') 92 | key = int(key) 93 | value = [int(x) for x in value.split(',')] 94 | rv[key] = value 95 | return rv 96 | 97 | deit_base_heads72 = pruned_head_str_to_dict( 98 | '1:1,2,3,4,5,6,8,9,10,12 2:2,3,4,5,6,7,9,10,11,12 12:2,3,4,6,7,8 3:1,2,4,5,6,9,10,11,12 4:5,6,8,10,11,12 5:11,4,6 7:1,2,11 11:1,2,4,5,6,8,10,12 6:9,11,5,7 8:4,1,12,5 10:1,4,6,9,11 9:12,11,4,7' 99 | ) 100 | 101 | deit_small_head36 = pruned_head_str_to_dict( 102 | '12:1,2,3,4 1:1,2,4,5 2:2,3,4 11:1,3,4,5 4:4 10:1,2,6 3:1,2,5,6 9:1,2,6 5:4 6:2,5,6 7:1,2,4 8:2,4,6' 103 | ) 104 | deit_tiny_head18 = pruned_head_str_to_dict( 105 | '1:2,3 12:1 2:1,3 6:1,3 3:1,2 11:1,2 7:3 10:2 9:2 4:2,3 5:3 8:1' 106 | ) 107 | 108 | # head_mask = np.ones(shape=[12,3]) 109 | # for k, values in deit_tiny_head18.items(): 110 | # for v in values: 111 | # head_mask[k - 1, v - 1] = 0 112 | # plt.imshow(head_mask) 113 | # plt.title('DeiT-Tiny prune 18 (50%) heads') 114 | # plt.ylabel('Layer number') 115 | # plt.xlabel('Head number') 116 | # plt.savefig('deit_tiny_prune_heads18.png') 117 | 118 | # head_mask = np.ones(shape=[12,6]) 119 | # for k, values in deit_small_head36.items(): 120 | # for v in values: 121 | # head_mask[k - 1, v - 1] = 0 122 | # plt.imshow(head_mask) 123 | # plt.title('DeiT-Small prune 36 (50%) heads') 124 | # plt.ylabel('Layer number') 125 | # plt.xlabel('Head number') 126 | # plt.savefig('deit_small_prune_heads36.png') 127 | 128 | head_mask = np.ones(shape=[12,12]) 129 | for k, values in deit_base_heads72.items(): 130 | for v in values: 131 | head_mask[k - 1, v - 1] = 0 132 | plt.imshow(head_mask) 133 | plt.title('DeiT-Base prune 72 (50%) heads') 134 | plt.ylabel('Layer number') 135 | plt.xlabel('Head number') 136 | plt.savefig('deit_base_prune_heads72.png') 137 | 138 | draw_are16heads_pruned_heads() -------------------------------------------------------------------------------- /deit_pruning/vendor/nn_pruning_v1/nn_pruning/tests/test_patch.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest import TestCase 3 | 4 | from transformers import BertConfig, BertForQuestionAnswering 5 | 6 | from nn_pruning.model_structure import BertStructure 7 | from nn_pruning.modules.masked_nn import ( 8 | ChannelPruningModulePatcher, 9 | JointPruningModulePatcher, 10 | LinearPruningArgs, 11 | LinearPruningModulePatcher, 12 | LinearPruningArgs, 13 | ) 14 | from nn_pruning.training_patcher import LinearModelPatcher, PatcherContext 15 | 16 | 17 | class TestFun(TestCase): 18 | MODEL_STRUCTURE = BertStructure 19 | def test_base(self): 20 | config = BertConfig.from_pretrained("bert-base-uncased") 21 | model = BertForQuestionAnswering(config) 22 | 23 | patcher = LinearModelPatcher({}, self.MODEL_STRUCTURE) 24 | layers = patcher.get_patchable_layers(model) 25 | # for regexp, layers in layers.items(): 26 | # print(regexp) 27 | 28 | def test_patch_module_independent_parameters(self): 29 | config = 
BertConfig.from_pretrained("bert-base-uncased") 30 | model = BertForQuestionAnswering(config) 31 | 32 | parameters = LinearPruningArgs( 33 | method="topK", 34 | submethod="default", 35 | ampere_method="disabled", 36 | block_rows=32, 37 | block_cols=32, 38 | min_elements=0.005, 39 | ) 40 | 41 | context = PatcherContext() 42 | 43 | p = LinearPruningModulePatcher(context, parameters, self.MODEL_STRUCTURE) 44 | 45 | module_patchers = dict(query=p, key=p, value=p, att_dense=p, interm_dense=p, output_dense=p) 46 | 47 | patcher = LinearModelPatcher(module_patchers, self.MODEL_STRUCTURE) 48 | patcher.patch(model) 49 | 50 | self.assertEqual(patcher.stats["patched"], 72) 51 | key_sizes = {k: len(v) for k, v in context.context_modules.items()} 52 | 53 | self.assertEqual(key_sizes, {"mask": 72}) 54 | 55 | def test_patch_module_ampere(self): 56 | config = BertConfig.from_pretrained("bert-base-uncased") 57 | model = BertForQuestionAnswering(config) 58 | 59 | parameters = LinearPruningArgs( 60 | method="topK", 61 | submethod="default", 62 | ampere_method="annealing", 63 | block_rows=32, 64 | block_cols=32, 65 | min_elements=0.005, 66 | ) 67 | 68 | context = PatcherContext() 69 | 70 | p = LinearPruningModulePatcher(context, parameters, self.MODEL_STRUCTURE) 71 | 72 | module_patchers = dict(query=p, key=p, value=p, att_dense=p, interm_dense=p, output_dense=p) 73 | 74 | patcher = LinearModelPatcher(module_patchers, self.MODEL_STRUCTURE) 75 | patcher.patch(model) 76 | 77 | self.assertEqual(patcher.stats["patched"], 72) 78 | key_sizes = {k: len(v) for k, v in context.context_modules.items()} 79 | 80 | self.assertEqual(key_sizes, {"ampere_mask": 72, "mask": 72}) 81 | 82 | def test_patch_module_tied_attention(self): 83 | config = BertConfig.from_pretrained("bert-base-uncased") 84 | model = BertForQuestionAnswering(config) 85 | 86 | parameters = LinearPruningArgs( 87 | method="topK", 88 | submethod="default", 89 | ampere_method="annealing", 90 | block_rows=32, 91 | block_cols=32, 92 | min_elements=0.005, 93 | ) 94 | 95 | context = PatcherContext() 96 | 97 | p_attention = JointPruningModulePatcher(context, parameters, self.MODEL_STRUCTURE, "attention") 98 | p_dense = LinearPruningModulePatcher(context, parameters, self.MODEL_STRUCTURE) 99 | 100 | module_patchers = dict( 101 | query=p_attention, 102 | key=p_attention, 103 | value=p_attention, 104 | att_dense=p_dense, 105 | interm_dense=p_dense, 106 | output_dense=p_dense, 107 | ) 108 | 109 | patcher = LinearModelPatcher(module_patchers, self.MODEL_STRUCTURE) 110 | patcher.patch(model) 111 | 112 | self.assertEqual(patcher.stats["patched"], 72) 113 | key_sizes = {k: len(v) for k, v in context.context_modules.items()} 114 | 115 | self.assertEqual(key_sizes, {"ampere_mask": 72, "mask": 48}) 116 | 117 | def test_patch_tiedattention_line_pruning(self): 118 | config = BertConfig.from_pretrained("bert-base-uncased") 119 | model = BertForQuestionAnswering(config) 120 | 121 | parameters_attention = LinearPruningArgs( 122 | method="topK", 123 | submethod="default", 124 | ampere_method="annealing", 125 | block_rows=32, 126 | block_cols=32, 127 | min_elements=0.005, 128 | ) 129 | 130 | parameters_dense = LinearPruningArgs( 131 | method="topK", submethod="1d", ampere_method="annealing", block_rows=32, block_cols=32, min_elements=0.005 132 | ) 133 | 134 | context = PatcherContext() 135 | 136 | p_attention = JointPruningModulePatcher(context, parameters_attention, self.MODEL_STRUCTURE, suffix=".attention") 137 | p_dense = ChannelPruningModulePatcher(context, parameters_dense, 
self.MODEL_STRUCTURE, suffix="dense") 138 | 139 | module_patchers = dict( 140 | query=p_attention, 141 | key=p_attention, 142 | value=p_attention, 143 | att_dense=p_dense, 144 | interm_dense=p_dense, 145 | output_dense=p_dense, 146 | ) 147 | 148 | patcher = LinearModelPatcher(module_patchers, self.MODEL_STRUCTURE) 149 | patcher.patch(model) 150 | 151 | self.assertEqual(patcher.stats["patched"], 72) 152 | key_sizes = {k: len(v) for k, v in context.context_modules.items()} 153 | 154 | for k, v in key_sizes.items(): 155 | print(k, v) 156 | 157 | for k, v in context.context_modules.items(): 158 | print(k, v) 159 | self.assertEqual(key_sizes, {"ampere_mask": 72, "mask": 12, "mask_1d": 48}) 160 | 161 | 162 | if __name__ == "__main__": 163 | unittest.main() 164 | -------------------------------------------------------------------------------- /deit_pruning/vendor/nn_pruning_v1/nn_pruning/modules/nonorm.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | from nn_pruning.model_patcher import ModelPatcher 4 | 5 | 6 | class Layer2NoNorm(nn.Module): 7 | # There are two ways to specify how the module will move progressively from a LayerNorm to a NoNorm 8 | # If you give a non-None schedule_callback, steps and start_delta won't be used. 9 | # It must be a function that returns a dictionary containing at least two keys: 10 | # - mix : moving from 1.0 to 0.0 , it is the lerp factor between LayerNorm and NoNorm: 1.0 -> LayerNorm, 0.0 -> NoNorm 11 | # - delta : moving from 0.99 to 1.0 for example, it is the accumulator exponential decay, 12 | # the higher the longer the period it smooth the mean/variance accumulator 13 | # If you don't specify a schedule_callback, each call to forward will count as a step, and in 'steps' steps 14 | # it will move to a LayerNorm to a NoNorm 15 | 16 | def __init__(self, layerNorm, 17 | steps = 5000, 18 | start_delta = 0.99, 19 | schedule_callback = None): 20 | super().__init__() 21 | self.normalized_shape = layerNorm.normalized_shape 22 | self.eps = layerNorm.eps 23 | self.elementwise_affine = layerNorm.elementwise_affine 24 | assert(self.elementwise_affine) 25 | self.weight = nn.Parameter(layerNorm.weight.detach().clone()) 26 | self.bias = nn.Parameter(layerNorm.bias.detach().clone()) 27 | # Accumulators are for mean and std, and accumulator normalization factor 28 | self.schedule_callback = schedule_callback 29 | 30 | if self.schedule_callback is None: 31 | self.steps = steps 32 | self.delta = start_delta 33 | self.final_delta = 1.0 34 | self.delta_step = (self.final_delta - self.delta) / self.steps 35 | self.mix_step = 1 / self.steps 36 | self.mix = 1.0 37 | else: 38 | self.steps = None 39 | self.delta_step = None 40 | self.mix_step = None 41 | self.delta = None 42 | self.final_delta = None 43 | self.mix = None 44 | 45 | self.register_buffer("accumulator", torch.zeros(3, device=layerNorm.weight.device)) 46 | 47 | def forward(self, batch): 48 | accumulator = self.accumulator.clone() 49 | 50 | if self.schedule_callback is not None: 51 | d = self.schedule_callback() 52 | mix = d["mix"] 53 | delta = d["delta"] 54 | else: 55 | if self.training: 56 | mix = self.mix 57 | delta = self.delta 58 | else: 59 | mix = 0 60 | delta = 1.0 61 | 62 | if mix == 0 and delta == 1.0: 63 | batch_mean = accumulator[0] / accumulator[2] 64 | batch_var = accumulator[1] / accumulator[2] 65 | else: 66 | batch_mean = batch.mean(-1, keepdim=True) 67 | batch_var = batch.var(-1, unbiased=False, keepdim=True) 68 | 69 | if 
self.training: 70 | one = torch.tensor(1.0, device=batch_var.device) 71 | new_acc = torch.stack([batch_mean.mean(), batch_var.mean(), one]) 72 | accumulator = torch.lerp(new_acc, accumulator, delta) 73 | 74 | batch_mean = torch.lerp(accumulator[0] / accumulator[2], batch_mean, mix) 75 | batch_var = torch.lerp(accumulator[1] / accumulator[2], batch_var, mix) 76 | 77 | ret = (batch - batch_mean) / (batch_var + self.eps).sqrt() 78 | ret = ret * self.weight + self.bias 79 | 80 | if self.training: 81 | self.accumulator = accumulator.detach() 82 | if self.schedule_callback is None: 83 | self.mix = max(0.0, self.mix - self.mix_step) 84 | self.delta = min(self.final_delta, self.delta + self.delta_step) 85 | 86 | return ret 87 | 88 | def compile(self): 89 | accumulator = self.accumulator 90 | mean = accumulator[0] / accumulator[2] 91 | var = accumulator[1] / accumulator[2] 92 | 93 | inv_var = 1.0 / (var + self.eps).sqrt() 94 | 95 | weight = self.weight * inv_var 96 | bias = - mean * inv_var * self.weight + self.bias 97 | 98 | return NoNorm(weight.detach().clone(), bias.detach().clone()) 99 | 100 | class Layer2NoNormPatcher(ModelPatcher): 101 | def __init__(self, 102 | steps = 5000, 103 | start_delta = 0.99, 104 | schedule_callback = None): 105 | super().__init__(all_match=True) 106 | self.steps = steps 107 | self.start_delta = start_delta 108 | self.schedule_callback = schedule_callback 109 | 110 | def is_patchable(self, module_name, module, raiseError): 111 | return isinstance(module, nn.LayerNorm) 112 | 113 | def new_child_module(self, child_module_name, child_module, patch_info): 114 | return Layer2NoNorm(child_module, 115 | steps = self.steps, 116 | start_delta = self.start_delta, 117 | schedule_callback = self.schedule_callback) 118 | 119 | class NoNorm(nn.Module): 120 | def __init__(self, weight, bias): 121 | super().__init__() 122 | self.weight = nn.Parameter(weight) 123 | self.bias = nn.Parameter(bias) 124 | 125 | def forward(self, batch): 126 | return batch * self.weight + self.bias 127 | 128 | class NoNormCompiler(ModelPatcher): 129 | def __init__(self): 130 | super().__init__(all_match=True) 131 | 132 | def is_patchable(self, module_name, module, raiseError): 133 | return isinstance(module, Layer2NoNorm) 134 | 135 | def new_child_module(self, child_module_name, child_module, patch_info): 136 | return child_module.compile() 137 | 138 | class NoNormPatcher(ModelPatcher): 139 | def __init__(self): 140 | super().__init__(all_match=True) 141 | 142 | def is_patchable(self, module_name, module, raiseError): 143 | return isinstance(module, nn.LayerNorm) 144 | 145 | def new_child_module(self, child_module_name, child_module, patch_info): 146 | return NoNorm(child_module.weight.detach(), child_module.bias.detach()) 147 | -------------------------------------------------------------------------------- /experiments/D1207_vino_quant_cnn_test.py: -------------------------------------------------------------------------------- 1 | from tensorflow import keras 2 | import os 3 | import sys 4 | import subprocess 5 | import argparse 6 | import re 7 | import json 8 | 9 | sys.path.insert(0, f'{os.path.dirname(sys.argv[0])}/..') 10 | from utils import freeze_graph 11 | from modeling.models.cnn_zoo import cnn_zoo_dict 12 | from benchmark.openvino.vino_cli import openvino_benchmark 13 | 14 | class PotConfigJson: 15 | def __init__(self, model_xml_path: str, dataset_path) -> None: 16 | model_name = os.path.splitext(os.path.basename(model_xml_path))[0] 17 | self.pot_config = { 18 | "model": { 19 | "model_name": 
f"{model_name}", 20 | "model": f"{model_xml_path}", 21 | "weights": f"{model_xml_path.replace('.xml', '.bin')}" 22 | }, 23 | "engine": { 24 | "type": "simplified", 25 | "data_source": f"{dataset_path}" 26 | }, 27 | "compression": { 28 | "target_device": "CPU", 29 | "algorithms": [ 30 | { 31 | "name": "DefaultQuantization", 32 | "params": { 33 | "preset": "performance", 34 | "stat_subset_size": 3 35 | } 36 | } 37 | ] 38 | } 39 | } 40 | 41 | def dump(self, output_path): 42 | with open(output_path, 'w') as f: 43 | json.dump(self.pot_config, f, indent=4) 44 | f.write('\n') 45 | 46 | class VinoCnnTester: 47 | def __init__(self, cnn_zoo_dict: dict, vino_model_dir: str, dataset_dir: str): 48 | self.src_model_dir = os.path.join(vino_model_dir, 'src_model', 'quant_cnn_test') 49 | self.ir_model_dir = os.path.join(vino_model_dir, 'ir', 'quant_cnn_test') 50 | self.dataset_dir = dataset_dir 51 | self.cnn_zoo_dict = cnn_zoo_dict 52 | 53 | def _get_src_pb_path(self, model_name) -> str: 54 | return os.path.join(self.src_model_dir, model_name+'.pb') 55 | 56 | def _get_fp32_ir_dir(self, model_name) -> str: 57 | return os.path.join(self.ir_model_dir, model_name, 'FP32') 58 | 59 | def _export_src(self, ): 60 | print('===== Exporting SRC =====') 61 | for model_name, generator_func in self.cnn_zoo_dict.items(): 62 | model = generator_func() 63 | freeze_graph(keras_model=model, output_path=self._get_src_pb_path(model_name)) 64 | 65 | def _convert(self, ): 66 | print('===== Converting SRC to OpenVINO IR =====') 67 | for model_name in self.cnn_zoo_dict.keys(): 68 | input_path = self._get_src_pb_path(model_name) 69 | output_dir = self._get_fp32_ir_dir(model_name) 70 | subprocess.run(f'python $VINO_MO --input_model={input_path} --model_name={model_name} --output_dir={output_dir} --batch=1 --data_type=FP32', shell=True) 71 | 72 | def _quantize(self, ): 73 | print('==== Quantizing FP32IR to INT8 ====') 74 | for model_name in self.cnn_zoo_dict.keys(): 75 | fp32_ir_dir = self._get_fp32_ir_dir(model_name) 76 | output_dir = fp32_ir_dir.replace('/FP32', '/FP32-INT8') 77 | if os.path.exists(output_dir): 78 | subprocess.run(f'rm -r {output_dir}', shell=True) 79 | os.mkdir(output_dir) 80 | # save pot-config.json 81 | pot_config_json_path = os.path.join(output_dir, 'pot-config.json') 82 | pot_config = PotConfigJson( 83 | model_xml_path=os.path.join(fp32_ir_dir, model_name+'.xml'), 84 | dataset_path=self.dataset_dir 85 | ) 86 | pot_config.dump(pot_config_json_path) 87 | # quant 88 | subprocess.run(f'python -m pot -c {pot_config_json_path} --direct-dump --output-dir={output_dir}', shell=True) 89 | # move file 90 | subprocess.run(f'mv {output_dir}/optimized/{model_name}.xml {output_dir}', shell=True) 91 | subprocess.run(f'mv {output_dir}/optimized/{model_name}.bin {output_dir}', shell=True) 92 | 93 | def _benchmark(self): 94 | print('====== Benchmarking model performance on CPU with 1 thread ======') 95 | latency_list_fp32 = [] 96 | latency_list_int8 = [] 97 | for model_name in self.cnn_zoo_dict.keys(): 98 | xml_path_fp32 = os.path.join(self._get_fp32_ir_dir(model_name), model_name+'.xml') 99 | xml_path_int8 = xml_path_fp32.replace('/FP32/', '/FP32-INT8/') 100 | latency_fp32 = 0.0 101 | latency_int8 = 0.0 102 | try: 103 | latency_fp32 = openvino_benchmark('$VINO_BENCHMARK_APP', xml_path_fp32, niter=30, num_threads=1, batch_size=1, csv_output_dir=os.path.dirname(xml_path_fp32), show_detail=False) 104 | latency_int8 = openvino_benchmark('$VINO_BENCHMARK_APP', xml_path_int8, niter=30, num_threads=1, batch_size=1, 
csv_output_dir=os.path.dirname(xml_path_int8), show_detail=False)
105 | except Exception:
106 | pass
107 | latency_list_fp32.append(round(latency_fp32, 2))
108 | latency_list_int8.append(round(latency_int8, 2))
109 |
110 | print('==================================================================================')
111 | print(' SUMMARY')
112 | print('==================================================================================')
113 |
114 | print(list(self.cnn_zoo_dict.keys()))
115 | print('FP32 ms')
116 | print(latency_list_fp32)
117 | print('INT8 ms')
118 | print(latency_list_int8)
119 |
120 | def run(self, ):
121 | self._export_src()
122 | self._convert()
123 | self._quantize()
124 | self._benchmark()
125 |
126 | def main():
127 | parser = argparse.ArgumentParser()
128 | parser.add_argument('--vino_model_dir', required=True, type=str, help='OpenVINO root dir for saving SRC (tf, onnx) and IR models')
129 | parser.add_argument('--dataset_dir', required=True, type=str, help='dataset dir used for quantization calibration')
130 | args = parser.parse_args()
131 |
132 | vino_cnn_tester = VinoCnnTester(cnn_zoo_dict, args.vino_model_dir, args.dataset_dir)
133 | vino_cnn_tester.run()
134 |
135 |
136 | if __name__ == '__main__':
137 | main()
--------------------------------------------------------------------------------
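As a usage note, here is a minimal sketch of driving the PotConfigJson helper above by hand; the XML path, calibration directory, and output name are hypothetical placeholders, and the import assumes the experiments folder is on PYTHONPATH:

    from D1207_vino_quant_cnn_test import PotConfigJson

    # Point at an FP32 IR produced by the Model Optimizer step (_convert above).
    pot_config = PotConfigJson(
        model_xml_path='models/ir/quant_cnn_test/resnet_50/FP32/resnet_50.xml',
        dataset_path='datasets/imagenet_calib_subset',
    )
    # Writes the JSON consumed by `python -m pot -c <config> --direct-dump ...`.
    pot_config.dump('pot-config.json')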