├── profile ├── power_results │ ├── bert-l-320.csv │ └── power_usage.csv ├── power_monitor.sh ├── customized_layer.py ├── vit_infer.py ├── bert_infer.py ├── README.md └── benckmark_logs │ ├── deit-s_power.csv │ ├── bert-b-128_power.csv │ ├── bert-b-384_power.csv │ ├── deit-t_power.csv │ ├── bert-l-320_power.csv │ └── deit-b-power.csv ├── software_model ├── models │ └── __init__.py ├── ops │ ├── __init__.py │ ├── simulator.py │ └── _quant_base.py ├── hubconf.py ├── deit_t_sweep_wavelength.csv ├── deit_t_sweep_input_noise_std.csv ├── deit_t_sweep_phase_noise_std.csv ├── scripts │ ├── process_output_logs.sh │ ├── train_quant_transformer_with_noise.sh │ ├── evaluate_quant_transformer.sh │ └── evaluate_quant_transformer_scan_noise.sh ├── process_logs.py ├── samplers.py ├── losses.py ├── augment.py ├── engine.py ├── datasets.py ├── resmlp_models.py ├── models.py ├── utils.py └── readme.md ├── HPCA24_LT_poster_v1_02.pdf ├── hardware_simulator ├── params │ ├── models │ │ ├── bert_base.yaml │ │ ├── bert_large.yaml │ │ ├── deit_base.yaml │ │ ├── deit_small.yaml │ │ └── deit_tiny.yaml │ └── device_params │ │ ├── Dota_B_4bit.yaml │ │ ├── Dota_B_8bit.yaml │ │ ├── Dota_L_4bit.yaml │ │ ├── Dota_L_8bit.yaml │ │ ├── Bs_mrr_bank_4bit.yaml │ │ ├── Bs_mrr_bank_8bit.yaml │ │ ├── Bs_mzi_4bit.yaml │ │ ├── Bs_mzi_8bit.yaml │ │ └── default.yaml ├── hardware │ ├── __init__.py │ ├── SRAM.py │ ├── ADC.py │ ├── DAC.py │ └── photonic_core_base.py ├── scripts │ ├── energy_latency_onns_deit_t.sh │ ├── energy_latency_single.sh │ ├── energy_latency_onns_deit.sh │ ├── area_power_all.sh │ └── energy_latency_all.sh ├── utils │ ├── __init__.py │ ├── config.py │ ├── model.py │ └── cal_flops_for_transformer.py ├── entry_energy_latency_workload.py └── readme.md ├── readme.md └── .gitignore /profile/power_results/bert-l-320.csv: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /profile/power_monitor.sh: -------------------------------------------------------------------------------- 1 | nvidia-smi dmon -s puc -d 1 -i 0 -------------------------------------------------------------------------------- /software_model/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .quant_vit import * -------------------------------------------------------------------------------- /HPCA24_LT_poster_v1_02.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuhanqing/Lightening-Transformer-AE/HEAD/HPCA24_LT_poster_v1_02.pdf -------------------------------------------------------------------------------- /hardware_simulator/params/models/bert_base.yaml: -------------------------------------------------------------------------------- 1 | model 2 | name: "bert_base" 3 | depth: 12 4 | num_heads: 12 5 | embed_dim: 768 6 | mlp_ratio: 4 7 | tokens: 128 -------------------------------------------------------------------------------- /hardware_simulator/params/models/bert_large.yaml: -------------------------------------------------------------------------------- 1 | model 2 | name: "bert_large" 3 | depth: 24 4 | num_heads: 12 5 | embed_dim: 768 6 | mlp_ratio: 4 7 | tokens: 384 -------------------------------------------------------------------------------- /hardware_simulator/params/models/deit_base.yaml: -------------------------------------------------------------------------------- 1 | model 2 | name: "deit-s" 3 | 
patch: 16 4 | depth: 12 5 | embed_dim: 768 6 | num_heads: 12 7 | mlp_ratio: 4 8 | tokens: 197 -------------------------------------------------------------------------------- /hardware_simulator/params/models/deit_small.yaml: -------------------------------------------------------------------------------- 1 | model 2 | name: "deit-s" 3 | patch: 16 4 | depth: 12 5 | embed_dim: 368 6 | num_heads: 6 7 | mlp_ratio: 4 8 | tokens: 197 -------------------------------------------------------------------------------- /hardware_simulator/params/models/deit_tiny.yaml: -------------------------------------------------------------------------------- 1 | model 2 | name: "deit-t" 3 | patch: 16 4 | depth: 12 5 | embed_dim: 192 6 | num_heads: 3 7 | mlp_ratio: 4 8 | tokens: 197 -------------------------------------------------------------------------------- /software_model/ops/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu 3 | # @Date: 2023-01-02 21:13:44 4 | # @Last Modified by: Hanqing Zhu 5 | # @Last Modified time: 2023-03-28 16:41:34 6 | from .quantize import * 7 | from .simulator import * -------------------------------------------------------------------------------- /software_model/hubconf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved. 3 | from models import * 4 | from cait_models import * 5 | from resmlp_models import * 6 | #from patchconvnet_models import * 7 | 8 | dependencies = ["torch", "torchvision", "timm"] 9 | -------------------------------------------------------------------------------- /software_model/deit_t_sweep_wavelength.csv: -------------------------------------------------------------------------------- 1 | test1,test2,test3,mean,std 2 | 71.174,71.014,70.99,71.05933333333333,0.10002666311206546 3 | 71.052,71.1,70.972,71.04133333333333,0.06466323014923916 4 | 71.034,70.924,70.924,70.96066666666667,0.06350852961085851 5 | 70.99,70.952,71.144,71.02866666666667,0.10167267741794891 6 | 71.206,70.82,71.184,71.07,0.21678560837842034 7 | -------------------------------------------------------------------------------- /hardware_simulator/hardware/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu 3 | # @Date: 2023-02-25 11:30:16 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-08 00:46:22 6 | from .photonic_crossbar import * 7 | from .photonic_mrr_bank import * 8 | from .photonic_MZI import * 9 | from .SRAM import * 10 | from .ADC import * 11 | from .DAC import * -------------------------------------------------------------------------------- /software_model/deit_t_sweep_input_noise_std.csv: -------------------------------------------------------------------------------- 1 | test1,test2,test3,mean,std 2 | 71.052,71.1,70.972,71.04133333333333,0.06466323014923916 3 | 71.044,70.86,71.048,70.984,0.10740577265678043 4 | 71.002,70.762,70.938,70.90066666666667,0.12427925544246267 5 | 70.844,70.674,70.672,70.73,0.09873196037757545 6 | 70.724,70.532,70.518,70.59133333333334,0.11510574848083822 7 | 70.61,70.334,70.434,70.45933333333333,0.13973307888017394 8 | -------------------------------------------------------------------------------- /software_model/deit_t_sweep_phase_noise_std.csv: 
-------------------------------------------------------------------------------- 1 | test1,test2,test3,mean,std 2 | 71.052,71.1,70.972,71.04133333333333,0.06466323014923916 3 | 70.964,70.942,70.982,70.96266666666666,0.020033305601758828 4 | 71.12,70.934,71.062,71.03866666666667,0.09517002329165371 5 | 71.116,70.938,71.206,71.08666666666667,0.13638670511942627 6 | 71.086,70.884,71.13,71.03333333333333,0.13118434865994025 7 | 71.168,71.022,71.038,71.07600000000001,0.08007496487667297 8 | -------------------------------------------------------------------------------- /hardware_simulator/scripts/energy_latency_onns_deit_t.sh: -------------------------------------------------------------------------------- 1 | exp='energy_latency_compare_onns_deit_t' 2 | model_name='deit-t' 3 | tokens=197 4 | declare -A config_dict 5 | config_dict=( 6 | ['dota_b_4bit']='./params/device_params/Dota_B_4bit.yaml' 7 | # ['mrr_4bit']='./params/device_params/Bs_mrr_bank_4bit.yaml' 8 | # ['mzi_4bit']='./params/device_params/Bs_mzi_4bit.yaml' 9 | ) 10 | 11 | for key in "${!config_dict[@]}" 12 | do 13 | # Get the value associated with the key 14 | onn_params="${config_dict[$key]}" 15 | 16 | python entry_energy_latency_workload.py \ 17 | -e ${exp} \ 18 | --tokens ${tokens} \ 19 | --model_name ${model_name} \ 20 | --config ${onn_params} \ 21 | -o 'broadcast' 22 | done -------------------------------------------------------------------------------- /profile/power_results/power_usage.csv: -------------------------------------------------------------------------------- 1 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 2 | # Idx W C C % % % % % % MHz MHz 3 | 0 61 38 54 0 0 0 0 0 0 1593 210 4 | 0 61 38 54 0 0 0 0 0 0 1593 210 5 | 0 61 38 53 0 0 0 0 0 0 1593 210 6 | 0 61 38 53 0 0 0 0 0 0 1593 210 7 | 0 61 38 53 0 0 0 0 0 0 1593 210 8 | 0 61 38 54 0 0 0 0 0 0 1593 210 9 | 0 61 38 54 0 0 0 0 0 0 1593 210 10 | -------------------------------------------------------------------------------- /hardware_simulator/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu 3 | # @Date: 2023-02-23 22:51:07 4 | # @Last Modified by: Hanqing Zhu 5 | # @Last Modified time: 2023-02-23 22:51:23 6 | import importlib 7 | import os 8 | 9 | # automatically import any Python files in this directory 10 | for file in sorted(os.listdir(os.path.dirname(__file__))): 11 | if file.endswith(".py") and not file.startswith("_"): 12 | source = file[: file.find(".py")] 13 | module = importlib.import_module("utils." 
+ source) 14 | if "__all__" in module.__dict__: 15 | names = module.__dict__["__all__"] 16 | else: 17 | # import all names that do not begin with _ 18 | names = [x for x in module.__dict__ if not x.startswith("_")] 19 | globals().update({k: getattr(module, k) for k in names}) -------------------------------------------------------------------------------- /hardware_simulator/scripts/energy_latency_single.sh: -------------------------------------------------------------------------------- 1 | exp='energy_latency_single_workload' 2 | model_name='deit-t' 3 | tokens=197 4 | onn_params='./params/device_params/Dota_B_4bit.yaml' 5 | # choose onn accelerator params from 6 | # config_dict=( 7 | # ['dota_b_4bit']='./params/device_params/Dota_B_4bit.yaml' 8 | # ['dota_b_8bit']='./params/device_params/Dota_B_8bit.yaml' 9 | # ['dota_l_4bit']='./params/device_params/Dota_L_4bit.yaml' 10 | # ['dota_l_8bit']='./params/device_params/Dota_L_8bit.yaml' 11 | # ['mrr_4bit']='./params/device_params/Bs_mrr_bank_4bit.yaml' 12 | # ['mrr_8bit']='./params/device_params/Bs_mrr_bank_8bit.yaml' 13 | # ['mzi_4bit']='./params/device_params/Bs_mzi_4bit.yaml' 14 | # ['mzi_8bit']='./params/device_params/Bs_mzi_8bit.yaml' 15 | # ) 16 | 17 | 18 | python entry_energy_latency_workload.py \ 19 | -e ${exp} \ 20 | --tokens ${tokens} \ 21 | --model_name ${model_name} \ 22 | --config ${onn_params} 23 | -------------------------------------------------------------------------------- /hardware_simulator/params/device_params/Dota_B_4bit.yaml: -------------------------------------------------------------------------------- 1 | # @Author: Hanqing Zhu 2 | # @Date: 2023-02-23 22:53:17 3 | # @Last Modified by: Hanqing Zhu 4 | # @Last Modified time: 2023-03-24 23:07:35 5 | # device level 6 | 7 | core: 8 | type: "dota" 9 | width: 12 10 | height: 12 11 | num_wavelength: 12 12 | work_freq: 5 13 | interface: 14 | ADC: 15 | choice: 1 16 | sharing_factor: 1 17 | DAC: 18 | choice: 1 19 | TIA: 20 | power: 3 21 | area: 50 22 | precision: 23 | in_bit: 4 24 | w_bit: 4 25 | act_bit: 4 26 | 27 | arch: 28 | num_tiles: 4 29 | num_pe_per_tile: 2 30 | full_range_support_factor: 1 # pe support full range or not -> 2: one operand is positive only -> 4 all operands are positive only 31 | ### unique arch params for our DOTA 32 | time_accum_factor: 3 33 | input_mod_sharing_flag: 1 # whether input is globally shared cross tiles 34 | adc_share_flag: 1 # multiple PEs share one adc array -------------------------------------------------------------------------------- /hardware_simulator/params/device_params/Dota_B_8bit.yaml: -------------------------------------------------------------------------------- 1 | # @Author: Hanqing Zhu 2 | # @Date: 2023-02-23 22:53:17 3 | # @Last Modified by: Hanqing Zhu 4 | # @Last Modified time: 2023-03-24 23:07:35 5 | # device level 6 | 7 | core: 8 | type: "dota" 9 | width: 12 10 | height: 12 11 | num_wavelength: 12 12 | work_freq: 5 13 | interface: 14 | ADC: 15 | choice: 1 16 | sharing_factor: 1 17 | DAC: 18 | choice: 1 19 | TIA: 20 | power: 3 21 | area: 50 22 | precision: 23 | in_bit: 8 24 | w_bit: 8 25 | act_bit: 8 26 | 27 | arch: 28 | num_tiles: 4 29 | num_pe_per_tile: 2 30 | full_range_support_factor: 1 # pe support full range or not -> 2: one operand is positive only -> 4 all operands are positive only 31 | ### unique arch params for our DOTA 32 | time_accum_factor: 3 33 | input_mod_sharing_flag: 1 # whether input is globally shared cross tiles 34 | adc_share_flag: 1 # multiple PEs share one adc array 
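The `Dota_B_*.yaml` files above describe one DOTA accelerator configuration: a 12x12 photonic core driven by 12 wavelengths, 4 tiles with 2 PEs each, and a `work_freq` of 5 (GHz, matching the ADC/DAC models in `hardware/`). As a rough illustration of how such a config can be read outside the simulator, the sketch below loads the YAML and prints a few aggregate quantities. This is illustrative only: it assumes PyYAML is installed and that the script runs from `hardware_simulator/`; it does not reproduce the simulator's own cost model, which is driven through `entry_energy_latency_workload.py`.

```python
# Illustrative only: load a DOTA device config and report aggregate sizes.
# Assumes PyYAML is available and the working directory is hardware_simulator/.
import yaml

with open("params/device_params/Dota_B_8bit.yaml") as f:
    cfg = yaml.safe_load(f)

core, arch = cfg["core"], cfg["arch"]
prec = core["precision"]

num_pes = arch["num_tiles"] * arch["num_pe_per_tile"]   # 4 tiles x 2 PEs = 8 PEs
cycle_ns = 1.0 / core["work_freq"]                      # work_freq assumed to be in GHz -> 0.2 ns

print(f"{num_pes} PEs of size {core['height']}x{core['width']}, "
      f"{core['num_wavelength']} wavelengths, "
      f"{prec['in_bit']}b inputs / {prec['w_bit']}b weights, cycle {cycle_ns:.2f} ns")
```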
-------------------------------------------------------------------------------- /hardware_simulator/params/device_params/Dota_L_4bit.yaml: -------------------------------------------------------------------------------- 1 | # @Author: Hanqing Zhu 2 | # @Date: 2023-02-23 22:53:17 3 | # @Last Modified by: Hanqing Zhu 4 | # @Last Modified time: 2023-03-24 23:07:35 5 | # device level 6 | 7 | core: 8 | type: "dota" 9 | width: 12 10 | height: 12 11 | num_wavelength: 12 12 | work_freq: 5 13 | interface: 14 | ADC: 15 | choice: 1 16 | sharing_factor: 1 17 | DAC: 18 | choice: 1 19 | TIA: 20 | power: 3 21 | area: 50 22 | precision: 23 | in_bit: 4 24 | w_bit: 4 25 | act_bit: 4 26 | 27 | arch: 28 | num_tiles: 8 29 | num_pe_per_tile: 2 30 | full_range_support_factor: 1 # pe support full range or not -> 2: one operand is positive only -> 4 all operands are positive only 31 | ### unique arch params for our DOTA 32 | time_accum_factor: 3 33 | input_mod_sharing_flag: 1 # whether input is globally shared cross tiles 34 | adc_share_flag: 1 # multiple PEs share one adc array -------------------------------------------------------------------------------- /hardware_simulator/params/device_params/Dota_L_8bit.yaml: -------------------------------------------------------------------------------- 1 | # @Author: Hanqing Zhu 2 | # @Date: 2023-02-23 22:53:17 3 | # @Last Modified by: Hanqing Zhu 4 | # @Last Modified time: 2023-03-24 23:07:35 5 | # device level 6 | 7 | core: 8 | type: "dota" 9 | width: 12 10 | height: 12 11 | num_wavelength: 12 12 | work_freq: 5 13 | interface: 14 | ADC: 15 | choice: 1 16 | sharing_factor: 1 17 | DAC: 18 | choice: 1 19 | TIA: 20 | power: 3 21 | area: 50 22 | precision: 23 | in_bit: 8 24 | w_bit: 8 25 | act_bit: 8 26 | 27 | arch: 28 | num_tiles: 8 29 | num_pe_per_tile: 2 30 | full_range_support_factor: 1 # pe support full range or not -> 2: one operand is positive only -> 4 all operands are positive only 31 | ### unique arch params for our DOTA 32 | time_accum_factor: 3 33 | input_mod_sharing_flag: 1 # whether input is globally shared cross tiles 34 | adc_share_flag: 1 # multiple PEs share one adc array -------------------------------------------------------------------------------- /hardware_simulator/scripts/energy_latency_onns_deit.sh: -------------------------------------------------------------------------------- 1 | exp='energy_latency_onns_deit' 2 | # model_name='deit-t' 3 | tokens=197 4 | declare -A config_dict 5 | config_dict=( 6 | ['dota_b_4bit']='./params/device_params/Dota_B_4bit.yaml' 7 | ['mrr_4bit']='./params/device_params/Bs_mrr_bank_4bit.yaml' 8 | ['mzi_4bit']='./params/device_params/Bs_mzi_4bit.yaml' 9 | ['dota_b_8bit']='./params/device_params/Dota_B_8bit.yaml' 10 | ['mrr_8bit']='./params/device_params/Bs_mrr_bank_8bit.yaml' 11 | ['mzi_8bit']='./params/device_params/Bs_mzi_8bit.yaml' 12 | ) 13 | 14 | for model_name in 'deit-t' 'deit-b' 15 | do 16 | for key in "${!config_dict[@]}" 17 | do 18 | # Get the value associated with the key 19 | onn_params="${config_dict[$key]}" 20 | 21 | python entry_energy_latency_workload.py \ 22 | -e ${exp} \ 23 | --tokens ${tokens} \ 24 | --model_name ${model_name} \ 25 | --config ${onn_params} 26 | done 27 | done -------------------------------------------------------------------------------- /hardware_simulator/params/device_params/Bs_mrr_bank_4bit.yaml: -------------------------------------------------------------------------------- 1 | # @Author: Hanqing Zhu 2 | # @Date: 2023-02-23 22:53:17 3 | # @Last Modified by: Hanqing Zhu 4 | # 
@Last Modified time: 2023-03-20 15:23:26 5 | # device level 6 | device: 7 | mrr_modulator: 8 | type: 'ring' 9 | energy_per_bit: 42 10 | static_power: 1.2 #mW 11 | length: 9.66 12 | width: 9.66 13 | insertion_loss: 0.95 # db 14 | insertion_loss_uc: 0.1 # db uncoupled ring loss 15 | 16 | core: 17 | type: "mrrbank" 18 | width: 12 19 | height: 12 20 | num_wavelength: 12 21 | work_freq: 5 22 | interface: 23 | ADC: 24 | choice: 1 25 | sharing_factor: 1 26 | DAC: 27 | choice: 1 28 | TIA: 29 | power: 3 30 | area: 50 31 | precision: 32 | in_bit: 4 33 | w_bit: 4 34 | act_bit: 4 35 | 36 | arch: 37 | num_tiles: 7 38 | num_pe_per_tile: 2 39 | full_range_support_factor: 2 # add-drop ring, only support full-range weights 40 | weight_reuse_factor: -1 # set to -1 means fully weight-stationary dataflow -------------------------------------------------------------------------------- /hardware_simulator/params/device_params/Bs_mrr_bank_8bit.yaml: -------------------------------------------------------------------------------- 1 | # @Author: Hanqing Zhu 2 | # @Date: 2023-02-23 22:53:17 3 | # @Last Modified by: Hanqing Zhu 4 | # @Last Modified time: 2023-03-20 15:23:26 5 | # device level 6 | device: 7 | mrr_modulator: 8 | type: 'ring' 9 | energy_per_bit: 42 10 | static_power: 1.2 #mW 11 | length: 9.66 12 | width: 9.66 13 | insertion_loss: 0.95 # db 14 | insertion_loss_uc: 0.1 # db uncoupled ring loss 15 | 16 | core: 17 | type: "mrrbank" 18 | width: 12 19 | height: 12 20 | num_wavelength: 12 21 | work_freq: 5 22 | interface: 23 | ADC: 24 | choice: 1 25 | sharing_factor: 1 26 | DAC: 27 | choice: 1 28 | TIA: 29 | power: 3 30 | area: 50 31 | precision: 32 | in_bit: 8 33 | w_bit: 8 34 | act_bit: 8 35 | 36 | arch: 37 | num_tiles: 7 38 | num_pe_per_tile: 2 39 | full_range_support_factor: 2 # add-drop ring, only support full-range weights 40 | weight_reuse_factor: -1 # set to -1 means fully weight-stationary dataflow -------------------------------------------------------------------------------- /software_model/scripts/process_output_logs.sh: -------------------------------------------------------------------------------- 1 | # This is the scripts to process the saved log file from evaluate_quant_transformer_scan_noise.sh 2 | # It will generate a csv file to give you the accurcay mean and std of multiple runs 3 | 4 | # set the log file directory 5 | 6 | ## params when you parse logs for sweep_wavelength 7 | log_file='./logs/deit_t_sweep_input_noise_std.log' 8 | num_iters=3 # number of runs you launch for accurcay test 9 | num_vars=6 # how many variations you sweep 10 | 11 | # ## params when you parse logs for sweep input noise std 12 | # log_file='./logs/deit_t_sweep_input_noise_std.log' 13 | # num_iters=3 # number of runs you launch for accurcay test 14 | # num_vars=6 # how many variations you sweep 15 | 16 | # ## params when you parse logs for sweep input noise std 17 | # log_file='./logs/deit_t_sweep_phase_noise_std.log' 18 | # num_iters=3 # number of runs you launch for accurcay test 19 | # num_vars=6 # how many variations you sweep 20 | 21 | python process_logs.py \ 22 | --file ${log_file} \ 23 | --iters ${num_iters} \ 24 | --num_vars ${num_vars} -------------------------------------------------------------------------------- /hardware_simulator/scripts/area_power_all.sh: -------------------------------------------------------------------------------- 1 | ## scripts to generate area and power estimation of our optical accelerator system. 
2 | ## It will save the results to ./results/{exp_name}/ 3 | ## dota is our circuit 4 | ## we will also generate area report for the optical baselines 5 | 6 | declare -A config_dict 7 | config_dict=( 8 | ['dota_b_4bit']='./params/device_params/Dota_B_4bit.yaml' 9 | ['dota_b_8bit']='./params/device_params/Dota_B_8bit.yaml' 10 | ['dota_l_4bit']='./params/device_params/Dota_L_4bit.yaml' 11 | ['dota_l_8bit']='./params/device_params/Dota_L_8bit.yaml' 12 | ['mrr_4bit']='./params/device_params/Bs_mrr_bank_4bit.yaml' 13 | ['mzi_4bit']='./params/device_params/Bs_mzi_4bit.yaml' 14 | ) 15 | 16 | 17 | exp='area_power_all' 18 | 19 | # Iterate through the keys in the config_dict 20 | for key in "${!config_dict[@]}" 21 | do 22 | # Get the value associated with the key 23 | value="${config_dict[$key]}" 24 | 25 | # launch the are and power estimation .py 26 | python entry_area_power_profile.py \ 27 | -e ${exp} \ 28 | --config "$value" 29 | done -------------------------------------------------------------------------------- /hardware_simulator/params/device_params/Bs_mzi_4bit.yaml: -------------------------------------------------------------------------------- 1 | # @Author: Hanqing Zhu 2 | # @Date: 2023-02-23 22:53:17 3 | # @Last Modified by: Hanqing Zhu 4 | # @Last Modified time: 2023-03-20 15:23:26 5 | # device level 6 | device: 7 | mzi_modulator: # input modulation 8 | type: 'mzi' 9 | energy_per_bit: 450 # fJ/bit # 150 10 | static_power: 0 # 0 mW 11 | length: 260 12 | width: 20 13 | insertion_loss: 1.2 14 | mzi: # mzi for mzi mesh 15 | type: 'mzi' 16 | energy_per_bit: 450 # fJ/bit 17 | static_power: 0 # 0 mW 18 | length: 180 # phase shifter plus directional coupler plus spacing 19 | width: 100 20 | insertion_loss: 0.99 # two directional coupler and one phase shifter 21 | response_time: 2.0e-3 # 2us scale to ms 22 | 23 | core: 24 | type: "mzi" 25 | width: 12 26 | height: 12 27 | work_freq: 5 28 | interface: 29 | ADC: 30 | choice: 1 31 | sharing_factor: 1 32 | DAC: 33 | choice: 1 34 | TIA: 35 | power: 3 36 | area: 50 37 | precision: 38 | in_bit: 4 39 | w_bit: 4 40 | act_bit: 4 41 | 42 | arch: 43 | num_tiles: 4 44 | num_pe_per_tile: 2 45 | full_range_support_factor: 1 46 | weight_reuse_factor: -1 -------------------------------------------------------------------------------- /hardware_simulator/params/device_params/Bs_mzi_8bit.yaml: -------------------------------------------------------------------------------- 1 | # @Author: Hanqing Zhu 2 | # @Date: 2023-02-23 22:53:17 3 | # @Last Modified by: Hanqing Zhu 4 | # @Last Modified time: 2023-03-20 15:23:26 5 | # device level 6 | device: 7 | mzi_modulator: # input modulation 8 | type: 'mzi' 9 | energy_per_bit: 450 # fJ/bit # 150 10 | static_power: 0 # 0 mW 11 | length: 260 12 | width: 20 13 | insertion_loss: 1.2 14 | mzi: # mzi for mzi mesh 15 | type: 'mzi' 16 | energy_per_bit: 450 # fJ/bit 17 | static_power: 0 # 0 mW 18 | length: 180 # phase shifter plus directional coupler plus spacing 19 | width: 100 20 | insertion_loss: 0.99 # two directional coupler and one phase shifter 21 | response_time: 2.0e-3 # 2us scale to ms 22 | 23 | core: 24 | type: "mzi" 25 | width: 12 26 | height: 12 27 | work_freq: 5 28 | interface: 29 | ADC: 30 | choice: 1 31 | sharing_factor: 1 32 | DAC: 33 | choice: 1 34 | TIA: 35 | power: 3 36 | area: 50 37 | precision: 38 | in_bit: 8 39 | w_bit: 8 40 | act_bit: 8 41 | 42 | arch: 43 | num_tiles: 4 44 | num_pe_per_tile: 2 45 | full_range_support_factor: 1 46 | weight_reuse_factor: -1 
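One detail worth noting in the MZI baseline configs above: the mesh is reprogrammed through thermal phase shifters with a roughly 2 us response time (`response_time: 2.0e-3`, stored in ms per the inline comment), which is consistent with modeling the baseline as fully weight-stationary (`weight_reuse_factor: -1`). The sketch below is back-of-the-envelope arithmetic only, assuming `work_freq` is in GHz; it is not the simulator's latency model.

```python
# Illustrative arithmetic: how many compute cycles one MZI-mesh weight update
# spans, using the values from Bs_mzi_*.yaml.
response_time_ms = 2.0e-3          # response_time field (ms) -> 2 us
work_freq_ghz = 5                  # work_freq field, assumed GHz

response_time_s = response_time_ms * 1e-3
cycles_per_update = response_time_s * work_freq_ghz * 1e9
print(f"{cycles_per_update:.0f} compute cycles per mesh reprogramming")  # 10000
```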
-------------------------------------------------------------------------------- /software_model/scripts/train_quant_transformer_with_noise.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # @Author: Hanqing Zhu(hqzhu@utexas.edu) 3 | # @Date: 1969-12-31 18:00:00 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-09 03:03:06 6 | wbits=4 7 | abits=4 8 | id=4bit 9 | lr=5e-4 10 | weight_decay=1e-8 11 | batch_size=512 12 | epochs=300 13 | port=47771 14 | headwise=1 15 | input_noise_std=0.03 16 | output_noise_std=0.05 17 | 18 | torchrun \ 19 | --master_port ${port} \ 20 | --nproc_per_node=4 main.py \ 21 | --model deit_tiny_patch16_224_quant \ 22 | --drop-path 0 \ 23 | --batch-size ${batch_size} \ 24 | --lr ${lr} \ 25 | --min-lr 0 \ 26 | --epochs ${epochs} \ 27 | --warmup-epochs 0 \ 28 | --weight-decay ${weight_decay} \ 29 | --wbits ${wbits} \ 30 | --abits ${abits} \ 31 | --dist-eval \ 32 | --output_dir test/deit_tiny_${id}/${wbits}w${abits}a_bs${batch_size}_baselr${lr}_weightdecay${weight_decay}_ft${epochs}_headwise${headwise}_noise_i_${input_noise_std}_o_${output_noise_std}_linear_noise \ 33 | --finetune pretrained/deit_tiny_patch16_224-a1311bcf.pth \ 34 | --data-path /home/usr1/zixuan/ImageNet/data \ 35 | --headwise \ 36 | --input_noise_std ${input_noise_std} \ 37 | --output_noise_std ${output_noise_std} \ 38 | --enable_linear_noise -------------------------------------------------------------------------------- /hardware_simulator/scripts/energy_latency_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Define the experiment type 4 | exp='energy_latency_all' 5 | 6 | # Define the config_dict with possible values 7 | declare -A config_dict 8 | config_dict=( 9 | ['dota_b_4bit']='./params/device_params/Dota_B_4bit.yaml' 10 | ['dota_b_8bit']='./params/device_params/Dota_B_8bit.yaml' 11 | ['dota_l_4bit']='./params/device_params/Dota_L_4bit.yaml' 12 | ['dota_l_8bit']='./params/device_params/Dota_L_8bit.yaml' 13 | ) 14 | 15 | # Define the workload_dict with possible values 16 | declare -A workload_dict 17 | workload_dict=( 18 | ['deit-t']='197' 19 | ['deit-s']='197' 20 | ['deit-b']='197' 21 | ['bert-b']='128' 22 | ['bert-l']='320' 23 | ) 24 | 25 | # Loop through the workload_dict 26 | for model_name in "${!workload_dict[@]}" 27 | do 28 | # Get the value associated with the key 29 | tokens="${workload_dict[$model_name]}" 30 | 31 | # Loop through the config_dict 32 | for onn in "${!config_dict[@]}" 33 | do 34 | onn_params="${config_dict[$onn]}" 35 | 36 | # Call your Python script with the arguments 37 | python entry_energy_latency_workload.py \ 38 | -e "${exp}" \ 39 | --tokens "${tokens}" \ 40 | --model_name "${model_name}" \ 41 | --config "${onn_params}" 42 | done 43 | done -------------------------------------------------------------------------------- /software_model/scripts/evaluate_quant_transformer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # @Author: Hanqing Zhu 3 | # @Date: 2023-01-04 22:18:40 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-10 16:51:54 6 | exp='eval_accuracy' 7 | wbits=4 8 | abits=4 9 | id=4bit 10 | headwise=1 11 | 12 | # noise settings 13 | input_noise_std=0.03 14 | output_noise_std=0.05 15 | # following setting is added for inference only 16 | phase_noise_std=2 17 | num_wavelength=12 18 | channel_spacing=0.4 19 | 
seed=0 20 | 21 | resumed_ckpt_path='./resumed_ckpt/best_checkpoint.pth' 22 | 23 | for i in {1..1} 24 | do 25 | for input_noise_std in 0.03 26 | do 27 | CUDA_VISIBLE_DEVICES=0 python main.py --eval \ 28 | --resume ${resumed_ckpt_path} \ 29 | --model deit_tiny_patch16_224_quant \ 30 | --drop-path 0 \ 31 | --wbits ${wbits} \ 32 | --abits ${abits} \ 33 | --data-path /home/usr1/zixuan/ImageNet/data \ 34 | --headwise \ 35 | --input_noise_std ${input_noise_std} \ 36 | --output_noise_std ${output_noise_std} \ 37 | --phase_noise_std ${phase_noise_std} \ 38 | --num_wavelength ${num_wavelength} \ 39 | --channel_spacing ${channel_spacing} \ 40 | --seed ${seed+$i} \ 41 | --enable_wdm_noise \ 42 | --enable_linear_noise 43 | done 44 | done 45 | 46 | -------------------------------------------------------------------------------- /software_model/ops/simulator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu 3 | # @Date: 2023-03-28 15:37:29 4 | # @Last Modified by: Hanqing Zhu 5 | # @Last Modified time: 2023-03-28 18:58:41 6 | import math 7 | 8 | __all__ = ["cal_coupler_wdm_error_list"] 9 | 10 | def cal_coupler_wdm_error_list(num_wavelength, channel_spacing): 11 | channel_spacing = channel_spacing *1e-3 12 | error_list = [] # 2 * kappa - 1 13 | 14 | def coupling_length(w, g=100): 15 | a = -5.44 16 | b = 3.53 17 | c = 0.185 18 | d = 0.15 19 | 20 | L_c = (a * (w - 1.55) + b) * math.exp(g / 1000 / (c * (w - 1.55) + d)) 21 | 22 | return L_c 23 | odd_num_wavelength = True if num_wavelength % 2 == 1 else False 24 | 25 | for wave_length in range(num_wavelength): 26 | if odd_num_wavelength: 27 | wave_length = 1.55 + channel_spacing * (wave_length - (num_wavelength // 2)) 28 | else: 29 | if wave_length < num_wavelength // 2: 30 | wave_length = 1.55 + channel_spacing * (wave_length - (num_wavelength // 2)) 31 | else: 32 | wave_length = 1.55 + channel_spacing * (wave_length - (num_wavelength // 2) + 1) 33 | kappa = math.sin(math.pi / 4 * coupling_length(1.55) / coupling_length(wave_length)) ** 2 34 | error_list.append(2 * kappa - 1) 35 | 36 | return error_list 37 | -------------------------------------------------------------------------------- /software_model/scripts/evaluate_quant_transformer_scan_noise.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # @Author: Hanqing Zhu 3 | # @Date: 2023-01-04 22:18:40 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-10 16:52:06 6 | exp='eval_accuracy_scan_noise' 7 | wbits=4 8 | abits=4 9 | id=4bit 10 | headwise=1 11 | 12 | # noise settings 13 | input_noise_std=0.03 14 | output_noise_std=0.05 15 | # following setting is added for inference only 16 | phase_noise_std=2 17 | num_wavelength=12 18 | channel_spacing=0.4 19 | seed=0 20 | 21 | resumed_ckpt_path='./resumed_ckpt/best_checkpoint.pth' 22 | 23 | 24 | for i in {1..3} 25 | do 26 | # for input_noise_std in 0.03 0.04 0.05 0.06 0.07 0.08 ## uncomment this line when scanning input noise 27 | # for phase_noise_std in 2 3 4 5 6 7 ## uncomment this line when scanning phase noise 28 | for num_wavelength in 8 12 16 20 24 ## uncomment this line when scanning # wavelength 29 | do 30 | CUDA_VISIBLE_DEVICES=2 python main.py --eval \ 31 | --resume ${resumed_ckpt_path} \ 32 | --model deit_tiny_patch16_224_quant \ 33 | --drop-path 0 \ 34 | --wbits ${wbits} \ 35 | --abits ${abits} \ 36 | --data-path /home/usr1/zixuan/ImageNet/data \ 37 | --headwise \ 38 | 
--input_noise_std ${input_noise_std} \ 39 | --output_noise_std ${output_noise_std} \ 40 | --phase_noise_std ${phase_noise_std} \ 41 | --num_wavelength ${num_wavelength} \ 42 | --channel_spacing ${channel_spacing} \ 43 | --seed ${seed+$i} \ 44 | --enable_wdm_noise \ 45 | --enable_linear_noise 46 | done 47 | done 48 | 49 | -------------------------------------------------------------------------------- /profile/customized_layer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | 4 | """ 5 | three normalization variants without elementwise_affine transformation 6 | normlize along the last dimension 7 | """ 8 | 9 | decorator = torch.compile 10 | # decorator = torch.jit.script 11 | 12 | 13 | @decorator 14 | def layer_norm(x: Tensor, eps: float): 15 | x_mean = x.mean(dim=-1, keepdim=True) 16 | x_var = x.var(dim=-1, keepdim=True, correction=0) 17 | return (x - x_mean) * torch.rsqrt(x_var + eps) 18 | 19 | 20 | @decorator 21 | def rms_norm(x, eps: float): 22 | return x * torch.rsqrt(x.square().mean(dim=-1, keepdim=True) + eps) 23 | 24 | 25 | @decorator 26 | def crms_norm(x, eps: float): 27 | discarded_element = x.sum(dim=-1, keepdim=True) 28 | return x * torch.rsqrt((x.square().sum(dim=-1, keepdim=True) + discarded_element.square()) / (x.shape[-1] + 1) + eps) 29 | 30 | 31 | class CustomizedLayerNorm(torch.nn.LayerNorm): 32 | def forward(self, x: Tensor) -> Tensor: 33 | return layer_norm(x.float(), self.eps).type_as(x) 34 | 35 | 36 | class RMSNorm(torch.nn.LayerNorm): 37 | def forward(self, x: Tensor) -> Tensor: 38 | return rms_norm(x.float(), self.eps).type_as(x) 39 | 40 | 41 | class CRMSNorm(torch.nn.LayerNorm): 42 | def forward(self, x: Tensor) -> Tensor: 43 | return crms_norm(x.float(), self.eps).type_as(x) 44 | 45 | 46 | class LinearZeroMeanOutput(torch.nn.Linear): 47 | def forward(self, x): 48 | zero_mean_weight = self.weight - self.weight.mean(dim=0, keepdim=True) 49 | zero_mean_bias = self.bias - self.bias.mean() 50 | return torch.nn.functional.linear(x, zero_mean_weight, zero_mean_bias) 51 | -------------------------------------------------------------------------------- /hardware_simulator/hardware/SRAM.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu 3 | # @Date: 2023-03-05 19:39:10 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-08 17:16:44 6 | import math 7 | 8 | class SRAM: 9 | def __init__(self, size=2048) -> None: 10 | 11 | # the largest SRAM -> 2MB 12 | self.max_data = size * 1024 * 8 13 | 14 | # HBM to SRAM 15 | self.bandwidth_dram_to_sram = 1024 * 1024 * 1024 * 1024 * 8 # 1TB/s 16 | self.bandwidth_sram = 1 / 0.604347* 64 * 64 * 1024 * 1024 * 1024 * 8 # based on cacti simulation 17 | self.bandwidth_sram_to_rf = 1024 * 1024 * 1024 * 1024 * 8 * 100 # set to inifnity 18 | self.clock_frequency = 500 * 1e6 # 500MHz 19 | 20 | def preload_DRAM_SRAM(self, nums=0, bits=32, bandwidth_ratio=1): 21 | cycle = 0 22 | latency = nums * bits / (self.bandwidth_dram_to_sram * bandwidth_ratio) 23 | cycle = math.ceil(latency * self.clock_frequency) 24 | if nums * bits > self.max_data: 25 | print('Error: loading DRAM to SRAM exceeds SRAM size') 26 | else: 27 | latency = nums * bits / (self.bandwidth_dram_to_sram * bandwidth_ratio) 28 | cycle = math.ceil(latency * self.clock_frequency) 29 | 30 | return cycle 31 | 32 | def load_SRAM_RF(self, nums=0, bits=32, bandwidth_ratio=1): 33 | cycle = 0 
34 | latency = nums * bits / (self.bandwidth_sram_to_rf * bandwidth_ratio) 35 | cycle = math.ceil(latency * self.clock_frequency) 36 | return cycle 37 | 38 | def load_GB_SRAM(self, nums=0, bits=32, bandwidth_ratio=1): 39 | cycle = 0 40 | latency = nums * bits / (self.bandwidth_sram * bandwidth_ratio) 41 | cycle = math.ceil(latency * self.clock_frequency) 42 | return cycle -------------------------------------------------------------------------------- /software_model/process_logs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu 3 | # @Date: 2023-03-21 15:40:36 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-10 10:43:07 6 | import re 7 | import csv 8 | import statistics 9 | import argparse 10 | 11 | parser = argparse.ArgumentParser() 12 | 13 | parser.add_argument("-f", "--file", default="./robustness/sweep_phase_noise_deit_tiny_4bit.log", 14 | help="file") 15 | parser.add_argument("-i", "--iters", default=3, 16 | help="iterations") 17 | parser.add_argument("-n", "--num_vars", default=6, 18 | help="number of variations you sweep") 19 | 20 | args, opts = parser.parse_known_args() 21 | 22 | log_file = args.file 23 | num_iters = int(args.iters) 24 | num_variations = int(args.num_vars) 25 | 26 | 27 | with open(log_file, "r") as file: 28 | log_data = file.read() 29 | 30 | accuracy_pattern = r"\* Acc@1 (\d+\.\d+)" 31 | 32 | accuracy_matches = re.findall(accuracy_pattern, log_data) 33 | 34 | if accuracy_matches: 35 | accuracies = [float(match) for match in accuracy_matches] 36 | print(f"Accuracy: {accuracies}") 37 | else: 38 | print("Accuracy not found in log file.") 39 | 40 | indices = [x*num_variations for x in range(num_iters)] 41 | result = [] 42 | 43 | for i in range(num_variations): 44 | print("**", indices) 45 | tmp = [float(accuracy_matches[i]) for i in indices] 46 | mean = statistics.mean(tmp) 47 | std = statistics.stdev(tmp) 48 | tmp.extend([mean, std]) 49 | result.append(tmp) 50 | indices = [x + 1 for x in indices] 51 | 52 | filename = log_file.split("/")[-1].split(".")[0] + '.csv' 53 | 54 | def save_arrays_to_file(file_name, arrays): 55 | with open(file_name, mode='w', newline='') as file: 56 | writer = csv.writer(file) 57 | writer.writerow(['test1', 'test2', 'test3', 'mean', 'std']) 58 | for array in arrays: 59 | writer.writerow(array) 60 | 61 | save_arrays_to_file(filename, result) -------------------------------------------------------------------------------- /profile/vit_infer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu(hqzhu@utexas.edu) 3 | # @Date: 1969-12-31 18:00:00 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-09 23:05:25 6 | import argparse 7 | import torch 8 | import torch.utils.benchmark as benchmark 9 | from model import PreDefinedViT 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("-m", "--model_name", default="deit-s", 13 | help="model") 14 | 15 | args, opts = parser.parse_known_args() 16 | 17 | image_size = 224 18 | num_classes = 1000 19 | using_torch_compile = False 20 | device = torch.device('cuda') 21 | # device = torch.device('cpu') 22 | 23 | batch_size_list = [1] 24 | num_threads_list = [1] 25 | min_run_time = 100 26 | 27 | model_dict = { 28 | 'deit-t': ['Tiny', 16], 29 | 'deit-s': ['Small', 16], 30 | 'deit-b': ['Base', 16] 31 | } 32 | 33 | results = [] 34 | 35 | 
model_variant = model_dict[args.model_name] 36 | model_name, patch_size = model_variant 37 | for method in ['pre-ln']: 38 | raw_model = PreDefinedViT(image_size=image_size, patch_size=patch_size, num_classes=num_classes, variant=model_name, method=method).to(device) 39 | model = torch.compile(raw_model) if using_torch_compile else raw_model 40 | model.eval() 41 | 42 | with torch.no_grad(): 43 | with torch.cuda.amp.autocast(): 44 | for batch_size in batch_size_list: 45 | for num_threads in num_threads_list: 46 | x = torch.randn(batch_size, 3, image_size, image_size).to(device) 47 | result = benchmark.Timer(stmt='y = model(x)', 48 | setup='from __main__ import model', 49 | globals={'x': x}, 50 | num_threads=num_threads, 51 | sub_label=f'batch_size {batch_size} method {method}', 52 | description=model_name, 53 | ).blocked_autorange(min_run_time=min_run_time) 54 | results.append(result) 55 | print(result) 56 | 57 | compare = benchmark.Compare(results) 58 | compare.print() 59 | -------------------------------------------------------------------------------- /profile/bert_infer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu(hqzhu@utexas.edu) 3 | # @Date: 1969-12-31 18:00:00 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-09 23:42:51 6 | import argparse 7 | import torch 8 | import torch.utils.benchmark as benchmark 9 | from model import PreDefinedBERT 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("-m", "--model_name", default="deit-s", 13 | help="model") 14 | parser.add_argument("-s", "--seq_length", default=128, 15 | help="seq length") 16 | 17 | args, opts = parser.parse_known_args() 18 | 19 | vocab_size = 30528 20 | max_seq_length = 2048 21 | num_classes = 2 22 | using_torch_compile = False 23 | device = torch.device('cuda') 24 | # device = torch.device('cpu') 25 | 26 | batch_size_list = [1] 27 | num_threads_list = [1] 28 | min_run_time = 100 29 | 30 | model_dict = { 31 | 'bert-b': ['Base', 768], 32 | 'bert-l': ['Large', 1024] 33 | } 34 | 35 | results = [] 36 | 37 | model_name, emebedding_size = model_dict[args.model_name] 38 | seq_len = int(args.seq_length) 39 | for method in ['pre-ln']: 40 | raw_model = PreDefinedBERT(vocab_size=vocab_size, max_seq_length=max_seq_length, variant=model_name, method=method, num_classes=num_classes).to(device) 41 | model = torch.compile(raw_model) if using_torch_compile else raw_model 42 | model.eval() 43 | 44 | with torch.no_grad(): 45 | with torch.cuda.amp.autocast(): 46 | for batch_size in batch_size_list: 47 | for num_threads in num_threads_list: 48 | x = torch.randn(batch_size, seq_len, emebedding_size).to(device) 49 | result = benchmark.Timer(stmt='y = model(x)', 50 | setup='from __main__ import model', 51 | globals={'x': x}, 52 | num_threads=num_threads, 53 | sub_label=f'batch_size {batch_size} seq_len {seq_len}', 54 | description=model_name + ' ' + method, 55 | ).blocked_autorange(min_run_time=min_run_time) 56 | results.append(result) 57 | print(result) 58 | 59 | compare = benchmark.Compare(results) 60 | compare.print() 61 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Lightening-Transformer HPCA 2024 AE 2 | This contains the codebases for the main codebases of the paper "Lightening-Transformer: A Dynamically-operated Optically-interconnected Photonic Transformer 
Accelerator".

---

## Usage of the Provided Codebase

We provide three codebases:

* (1) Algorithm code for training/running models on our photonic accelerator, with the analytic transformation of our unique photonic tensor core embedded in the computation process. See `./software_model` for detailed implementation and usage, including the [DeiT](https://arxiv.org/abs/2012.12877) case.

* (2) A hardware simulator for estimating the energy and latency of running Transformers on our photonic accelerator. See `./hardware_simulator` for detailed implementation and usage.

* (3) Profiling code for measuring the latency and power usage of running Transformers on GPUs. See `./profile` for detailed implementation and usage. The implementation follows [NeurIPS'23, Pre-RMSNorm and Pre-CRMSNorm Transformers: Equivalent and Efficient Pre-LN Transformers](https://github.com/zixuanjiang/pre-rmsnorm-transformer).

---

## Required Dependencies

The DeiT code requires PyTorch, torchvision 0.8.1+, and [pytorch-image-models 0.3.2](https://github.com/rwightman/pytorch-image-models).

```
conda create -n test # create a virtual env
conda activate test # activate the test env
conda install pytorch torchvision torchaudio pytorch-cuda=your_cuda_version -c pytorch -c nvidia # install pytorch
pip install timm==0.3.2 torchpack packaging einops gdown
```

With torch 2.0+, you will encounter `ModuleNotFoundError: No module named 'torch._six'` raised from `/path_to_your_conda_envs/your_env_name/lib/python_version/site-packages/timm/models/layers/helpers.py`, because torch 2.0 no longer provides `torch._six`. Please replace the contents of helpers.py with the following:

```
from itertools import repeat
# from torch._six import container_abcs


# From PyTorch internals
def _ntuple(n):
    def parse(x):
        if isinstance(x, str):
            return x
        return tuple(repeat(x, n))
    return parse


to_1tuple = _ntuple(1)
to_2tuple = _ntuple(2)
to_3tuple = _ntuple(3)
to_4tuple = _ntuple(4)
to_ntuple = _ntuple
```

## Reference

[1] Hanqing Zhu, Jiaqi Gu, Hanrui Wang, Zixuan Jiang, Rongxing Tang, Zhekai Zhang, Chenghao Feng, Song Han, Ray T. Chen, and David Z. Pan. "Lightening-Transformer: A Dynamically-operated Optically-interconnected Photonic Transformer Accelerator", IEEE International Symposium on High-Performance Computer Architecture (HPCA'24).
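A small convenience related to the `torch._six` workaround above: the snippet below prints the path of the `helpers.py` file that needs to be replaced, without importing `timm` (a plain `import timm` would fail on torch 2.x before the patch is applied). It is an optional helper using only the standard library, not part of this repository.

```python
# Optional: locate timm's helpers.py without importing the package.
import importlib.util
import os

spec = importlib.util.find_spec("timm")
print(os.path.join(os.path.dirname(spec.origin), "models", "layers", "helpers.py"))
```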
56 | -------------------------------------------------------------------------------- /hardware_simulator/params/device_params/default.yaml: -------------------------------------------------------------------------------- 1 | # @Author: Hanqing Zhu 2 | # @Date: 1969-12-31 18:00:00 3 | # @Last Modified by: Hanqing Zhu 4 | # @Last Modified time: 2023-05-10 22:57:20 5 | # power in mW 6 | device: 7 | mzi_modulator: 8 | type: 'mzi' 9 | energy_per_bit: 450 # fJ/bit # 150 10 | static_power: 0 # 0 mW 11 | length: 260 12 | width: 20 13 | insertion_loss: 1.2 14 | mrr_modulator: 15 | type: 'ring' 16 | energy_per_bit: 42 # fJ/bit -> 42fJ/bit @ 40Gbit 17 | static_power: 1.2 #mW 18 | length: 9.66 19 | width: 9.66 20 | insertion_loss: 0.95 # insertion loss 21 | insertion_loss_uc: 0.1 # uncoupled insertion loss 22 | mrr_router: 23 | static_power: 0.275 24 | length: 4.8 25 | width: 4.8 26 | insertion_loss: 0.93 27 | phase_shifter: 28 | dynamic_power: 0 29 | static_power: 0 30 | insertion_loss: 0.33 31 | length: 100 32 | width: 45 33 | direction_coupler: 34 | insertion_loss: 0.33 35 | length: 5.25 36 | width: 2.4 37 | photo_detector: 38 | power: 1.1 39 | sensitivity: -25 #dbm 40 | length: 4 41 | width: 10 42 | mzi: 43 | type: 'mzi' 44 | energy_per_bit: 450 # fJ/bit 45 | static_power: 0 # 0 mW 46 | length: 180 47 | width: 100 48 | insertion_loss: 0.99 #two directional coupler 0.04 + 2 * 0.33 49 | response_time: 2.0e-3 # 2mus 50 | laser: 51 | power: 23.5 52 | length: 400 53 | width: 300 54 | wall_plug_eff: 0.2 55 | y_branch: 56 | insertion_loss: 0.1 57 | length: 1.8 58 | width: 1.3 59 | micro_comb: 60 | length: 1184 61 | width: 1184 62 | 63 | core: 64 | type: "dota" 65 | width: 12 66 | height: 12 67 | num_wavelength: 12 68 | work_freq: 5 69 | interface: 70 | ADC: 71 | choice: 1 72 | sharing_factor: 1 73 | DAC: 74 | choice: 1 75 | TIA: 76 | power: 3 77 | area: 50 78 | precision: 79 | in_bit: 4 80 | w_bit: 4 81 | act_bit: 4 82 | 83 | arch: 84 | num_tiles: 4 85 | num_pe_per_tile: 2 86 | full_range_support_factor: 1 87 | weight_reuse_factor: -1 88 | ### unique arch params for our DOTA 89 | time_accum_factor: 1 90 | input_mod_sharing_flag: 1 # whether input is globally shared cross tiles 91 | adc_share_flag: 1 # multiple PEs share one adc array 92 | datamovement: # datamovement cost from CACTI: leakage power * access time + dynamic energy per acess * cache access rate 93 | DRAM: 62.4e-9 94 | DRAM_GB: 62.4e-9 95 | GB2: 1.655e-9 # mJ/2byte: we divide the large global SRAM into 32KB banks 96 | GB1: 0.92e-9 # mJ/2byte 97 | NoC: 2.0e-9 # from eyerisis 98 | RF: 0.073e-9 # mJ/2byte 99 | memory_size: 100 | M2_buffer_size: 4096 -------------------------------------------------------------------------------- /software_model/samplers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved. 3 | import torch 4 | import torch.distributed as dist 5 | import math 6 | 7 | 8 | class RASampler(torch.utils.data.Sampler): 9 | """Sampler that restricts data loading to a subset of the dataset for distributed, 10 | with repeated augmentation. 
11 | It ensures that different each augmented version of a sample will be visible to a 12 | different process (GPU) 13 | Heavily based on torch.utils.data.DistributedSampler 14 | """ 15 | 16 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True, num_repeats: int = 3): 17 | if num_replicas is None: 18 | if not dist.is_available(): 19 | raise RuntimeError("Requires distributed package to be available") 20 | num_replicas = dist.get_world_size() 21 | if rank is None: 22 | if not dist.is_available(): 23 | raise RuntimeError("Requires distributed package to be available") 24 | rank = dist.get_rank() 25 | if num_repeats < 1: 26 | raise ValueError("num_repeats should be greater than 0") 27 | self.dataset = dataset 28 | self.num_replicas = num_replicas 29 | self.rank = rank 30 | self.num_repeats = num_repeats 31 | self.epoch = 0 32 | self.num_samples = int(math.ceil(len(self.dataset) * self.num_repeats / self.num_replicas)) 33 | self.total_size = self.num_samples * self.num_replicas 34 | # self.num_selected_samples = int(math.ceil(len(self.dataset) / self.num_replicas)) 35 | self.num_selected_samples = int(math.floor(len(self.dataset) // 256 * 256 / self.num_replicas)) 36 | self.shuffle = shuffle 37 | 38 | def __iter__(self): 39 | if self.shuffle: 40 | # deterministically shuffle based on epoch 41 | g = torch.Generator() 42 | g.manual_seed(self.epoch) 43 | indices = torch.randperm(len(self.dataset), generator=g) 44 | else: 45 | indices = torch.arange(start=0, end=len(self.dataset)) 46 | 47 | # add extra samples to make it evenly divisible 48 | indices = torch.repeat_interleave(indices, repeats=self.num_repeats, dim=0).tolist() 49 | padding_size: int = self.total_size - len(indices) 50 | if padding_size > 0: 51 | indices += indices[:padding_size] 52 | assert len(indices) == self.total_size 53 | 54 | # subsample 55 | indices = indices[self.rank:self.total_size:self.num_replicas] 56 | assert len(indices) == self.num_samples 57 | 58 | return iter(indices[:self.num_selected_samples]) 59 | 60 | def __len__(self): 61 | return self.num_selected_samples 62 | 63 | def set_epoch(self, epoch): 64 | self.epoch = epoch 65 | -------------------------------------------------------------------------------- /hardware_simulator/hardware/ADC.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu 3 | # @Date: 1969-12-31 18:00:00 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-10 23:43:45 6 | import logging 7 | 8 | 9 | ADC_list = { 10 | 1: {'area': 2850, 'prec': 8, 'power': 14.8, 'sample_rate': 10, 'type': 'sar'}, 11 | } 12 | 13 | class ADC(): 14 | def __init__(self, choice=1) -> None: 15 | self.ADC_choice = choice 16 | 17 | assert choice == 1 18 | 19 | # loaded ADC params 20 | # make it private 21 | self.__ADC_area = 0 22 | self.__ADC_prec = 0 23 | self.__ADC_power = 0 24 | self.__ADC_sample_rate = 0 25 | self.__ADC_type = None 26 | 27 | # obtain ADC param 28 | self._obatin_ADC_param() 29 | self.ADC_freq = self.__ADC_sample_rate # set to sample rate by default 30 | self.ADC_prec = self.__ADC_prec # set to sample rate by default 31 | 32 | def _obatin_ADC_param(self): 33 | if self.ADC_choice is not None: 34 | self.__chosen_ADC_list = ADC_list[self.ADC_choice] 35 | self.__ADC_area = self.__chosen_ADC_list['area'] 36 | self.__ADC_prec = self.__chosen_ADC_list['prec'] 37 | self.__ADC_power = self.__chosen_ADC_list['power'] 38 | self.__ADC_sample_rate = 
self.__chosen_ADC_list['sample_rate'] 39 | self.__ADC_type = self.__chosen_ADC_list['type'] 40 | else: 41 | raise NotImplementedError 42 | 43 | def set_ADC_work_freq(self, work_freq): 44 | if work_freq > self.__ADC_sample_rate: 45 | raise ValueError(f"Got required ADC work frequency {work_freq} exceeds the ADC frequency limit") 46 | self.ADC_freq = work_freq 47 | 48 | def set_ADC_work_prec(self, work_prec): 49 | if work_prec > self.__ADC_prec: 50 | raise ValueError(f"Got required ADC work precision {work_prec} exceeds the ADC precision limit") 51 | self.ADC_prec = work_prec 52 | 53 | def cal_ADC_param(self, print_msg=False): 54 | # convert power to desired freq and bit width 55 | if self.__ADC_type == "sar": 56 | # P \propto N 57 | self.ADC_power = self.__ADC_power * self.ADC_freq / \ 58 | self.__ADC_sample_rate * (self.ADC_prec / self.__ADC_prec) 59 | elif self.__ADC_type == "flash": 60 | # P \propto (2**N - 1) 61 | self.ADC_power = self.__ADC_power * self.ADC_freq / \ 62 | self.__ADC_sample_rate * \ 63 | ((2**self.ADC_prec - 1) / (2**self.__ADC_prec - 1)) 64 | 65 | self.ADC_area = self.__ADC_area 66 | 67 | if print_msg: 68 | logging.info('The %s-bit ADC power @%sGHz is %.2f mW', self.ADC_prec, self.ADC_freq, self.ADC_power) 69 | logging.info('The %s-bit ADC area is %.4f um^2', self.ADC_prec, self.ADC_area) 70 | 71 | 72 | if __name__ == "__main__": 73 | logging.basicConfig() 74 | logging.getLogger().setLevel(logging.INFO) 75 | test = ADC(choice=1) 76 | test.set_ADC_work_freq(4) 77 | test.set_ADC_work_prec(6) 78 | test.cal_ADC_param(print_msg=True) -------------------------------------------------------------------------------- /hardware_simulator/hardware/DAC.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu 3 | # @Date: 1969-12-31 18:00:00 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-10 23:43:37 6 | import logging 7 | 8 | 9 | # area: um^2, prec: bit, power: mw, sample_rate: GSample/s 10 | DAC_list = { 11 | 1: {'area': 11000, 'prec': 8, 'power': 50, 'sample_rate': 14, 'FoM': None, 'type': 'cap'} 12 | } 13 | 14 | class DAC(): 15 | def __init__(self, choice=1) -> None: 16 | self.DAC_choice = choice 17 | assert choice == 1 18 | # loaded DAC params 19 | # make it private 20 | self.__DAC_area = 0 21 | self.__DAC_prec = 0 22 | self.__DAC_power = 0 23 | self.__DAC_sample_rate = 0 24 | self.__DAC_type = None 25 | self.__DAC_FoM = 0 26 | 27 | # obtain DAC param 28 | self._obatin_DAC_param() 29 | self.DAC_freq = self.__DAC_sample_rate # set to sample rate by default 30 | self.DAC_prec = self.__DAC_prec # set to sample rate by default 31 | 32 | def _obatin_DAC_param(self): 33 | if self.DAC_choice is not None: 34 | self.__chosen_DAC_list = DAC_list[self.DAC_choice] 35 | self.__DAC_area = self.__chosen_DAC_list['area'] 36 | self.__DAC_prec = self.__chosen_DAC_list['prec'] 37 | self.__DAC_power = self.__chosen_DAC_list['power'] 38 | self.__DAC_sample_rate = self.__chosen_DAC_list['sample_rate'] 39 | self.__DAC_type = self.__chosen_DAC_list['type'] 40 | self.__DAC_FoM = self.__chosen_DAC_list['FoM'] 41 | else: 42 | raise NotImplementedError 43 | 44 | def set_DAC_work_freq(self, work_freq): 45 | if work_freq > self.__DAC_sample_rate: 46 | raise ValueError(f"Got required DAC work frequency {work_freq} exceeds the DAC frequency limit") 47 | self.DAC_freq = work_freq 48 | 49 | def set_DAC_work_prec(self, work_prec): 50 | if work_prec > self.__DAC_prec: 51 | raise 
ValueError(f"Got required DAC work precision {work_prec} exceeds the DAC precision limit") 52 | self.DAC_prec = work_prec 53 | 54 | def cal_DAC_param(self, print_msg=False): 55 | # convert power to desired freq and bit width 56 | if self.__DAC_FoM is not None: 57 | # following 2 * FoM * nb * Fs / Br (assuming Fs=Br) 58 | self.DAC_power = 2 * self.__DAC_FoM * \ 59 | self.DAC_prec * self.DAC_freq * 1e-3 60 | else: 61 | # P \propto 2**N/(N+1) * f_clk 62 | self.DAC_power = self.__DAC_power * (2**self.DAC_prec / (self.DAC_prec)) / ( 63 | 2**self.__DAC_prec / (self.__DAC_prec)) * self.DAC_freq / self.__DAC_sample_rate 64 | 65 | self.DAC_area = self.__DAC_area 66 | 67 | if print_msg: 68 | logging.info('The %s-bit DAC power @%sGHz is %.2f mW', self.DAC_prec, self.DAC_freq, self.DAC_power) 69 | logging.info('The %s-bit DAC area is %.4f um^2', self.DAC_prec, self.DAC_area) 70 | 71 | 72 | if __name__ == "__main__": 73 | logging.basicConfig() 74 | logging.getLogger().setLevel(logging.INFO) 75 | test = DAC(choice=2) 76 | test.set_DAC_work_freq(4) 77 | test.set_DAC_work_prec(5) 78 | test.cal_DAC_param(print_msg=True) -------------------------------------------------------------------------------- /software_model/losses.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved. 3 | """ 4 | Implements the knowledge distillation loss 5 | """ 6 | import torch 7 | from torch.nn import functional as F 8 | 9 | 10 | class DistillationLoss(torch.nn.Module): 11 | """ 12 | This module wraps a standard criterion and adds an extra knowledge distillation loss by 13 | taking a teacher model prediction and using it as additional supervision. 14 | """ 15 | def __init__(self, base_criterion: torch.nn.Module, teacher_model: torch.nn.Module, 16 | distillation_type: str, alpha: float, tau: float): 17 | super().__init__() 18 | self.base_criterion = base_criterion 19 | self.teacher_model = teacher_model 20 | assert distillation_type in ['none', 'soft', 'hard'] 21 | self.distillation_type = distillation_type 22 | self.alpha = alpha 23 | self.tau = tau 24 | 25 | def forward(self, inputs, outputs, labels): 26 | """ 27 | Args: 28 | inputs: The original inputs that are feed to the teacher model 29 | outputs: the outputs of the model to be trained. 
It is expected to be 30 | either a Tensor, or a Tuple[Tensor, Tensor], with the original output 31 | in the first position and the distillation predictions as the second output 32 | labels: the labels for the base criterion 33 | """ 34 | outputs_kd = None 35 | if not isinstance(outputs, torch.Tensor): 36 | # assume that the model outputs a tuple of [outputs, outputs_kd] 37 | outputs, outputs_kd = outputs 38 | base_loss = self.base_criterion(outputs, labels) 39 | if self.distillation_type == 'none': 40 | return base_loss 41 | 42 | if outputs_kd is None: 43 | raise ValueError("When knowledge distillation is enabled, the model is " 44 | "expected to return a Tuple[Tensor, Tensor] with the output of the " 45 | "class_token and the dist_token") 46 | # don't backprop throught the teacher 47 | with torch.no_grad(): 48 | teacher_outputs = self.teacher_model(inputs) 49 | 50 | if self.distillation_type == 'soft': 51 | T = self.tau 52 | # taken from https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100 53 | # with slight modifications 54 | distillation_loss = F.kl_div( 55 | F.log_softmax(outputs_kd / T, dim=1), 56 | #We provide the teacher's targets in log probability because we use log_target=True 57 | #(as recommended in pytorch https://github.com/pytorch/pytorch/blob/9324181d0ac7b4f7949a574dbc3e8be30abe7041/torch/nn/functional.py#L2719) 58 | #but it is possible to give just the probabilities and set log_target=False. In our experiments we tried both. 59 | F.log_softmax(teacher_outputs / T, dim=1), 60 | reduction='sum', 61 | log_target=True 62 | ) * (T * T) / outputs_kd.numel() 63 | #We divide by outputs_kd.numel() to have the legacy PyTorch behavior. 64 | #But we also experiments output_kd.size(0) 65 | #see issue 61(https://github.com/facebookresearch/deit/issues/61) for more details 66 | elif self.distillation_type == 'hard': 67 | distillation_loss = F.cross_entropy(outputs_kd, teacher_outputs.argmax(dim=1)) 68 | 69 | loss = base_loss * (1 - self.alpha) + distillation_loss * self.alpha 70 | return loss 71 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # ignor pretrained model 2 | software_model/pretrained/ 3 | software_model/results/ 4 | software_model/resumed_ckpt/ 5 | 6 | hardware_simulator/results/ 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | *.py,cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | cover/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | .pybuilder/ 83 | target/ 84 | 85 | # Jupyter Notebook 86 | .ipynb_checkpoints 87 | 88 | # IPython 89 | profile_default/ 90 | ipython_config.py 91 | 92 | # pyenv 93 | # For a library or package, you might want to ignore these files since the code is 94 | # intended to run in multiple environments; otherwise, check them in: 95 | # .python-version 96 | 97 | # pipenv 98 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 99 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 100 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 101 | # install all needed dependencies. 102 | #Pipfile.lock 103 | 104 | # poetry 105 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 106 | # This is especially recommended for binary packages to ensure reproducibility, and is more 107 | # commonly ignored for libraries. 108 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 109 | #poetry.lock 110 | 111 | # pdm 112 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 113 | #pdm.lock 114 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 115 | # in version control. 116 | # https://pdm.fming.dev/#use-with-ide 117 | .pdm.toml 118 | 119 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 120 | __pypackages__/ 121 | 122 | # Celery stuff 123 | celerybeat-schedule 124 | celerybeat.pid 125 | 126 | # SageMath parsed files 127 | *.sage.py 128 | 129 | # Environments 130 | .env 131 | .venv 132 | env/ 133 | venv/ 134 | ENV/ 135 | env.bak/ 136 | venv.bak/ 137 | 138 | # Spyder project settings 139 | .spyderproject 140 | .spyproject 141 | 142 | # Rope project settings 143 | .ropeproject 144 | 145 | # mkdocs documentation 146 | /site 147 | 148 | # mypy 149 | .mypy_cache/ 150 | .dmypy.json 151 | dmypy.json 152 | 153 | # Pyre type checker 154 | .pyre/ 155 | 156 | # pytype static type analyzer 157 | .pytype/ 158 | 159 | # Cython debug symbols 160 | cython_debug/ 161 | 162 | # PyCharm 163 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 164 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 165 | # and can be added to the global gitignore or merged into this file. For a more nuclear 166 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
167 | #.idea/ -------------------------------------------------------------------------------- /software_model/augment.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu(hqzhu@utexas.edu) 3 | # @Date: 1969-12-31 18:00:00 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-07 00:27:04 6 | # Copyright (c) Meta Platforms, Inc. and affiliates. 7 | # All rights reserved. 8 | 9 | """ 10 | 3Augment implementation 11 | Data-augmentation (DA) based on dino DA (https://github.com/facebookresearch/dino) 12 | and timm DA (https://github.com/rwightman/pytorch-image-models) 13 | """ 14 | import torch 15 | from torchvision import transforms 16 | 17 | from timm.data.transforms import RandomResizedCropAndInterpolation, ToNumpy, ToTensor 18 | 19 | import numpy as np 20 | from torchvision import datasets, transforms 21 | import random 22 | 23 | 24 | 25 | from PIL import ImageFilter, ImageOps 26 | import torchvision.transforms.functional as TF 27 | 28 | 29 | class GaussianBlur(object): 30 | """ 31 | Apply Gaussian Blur to the PIL image. 32 | """ 33 | def __init__(self, p=0.1, radius_min=0.1, radius_max=2.): 34 | self.prob = p 35 | self.radius_min = radius_min 36 | self.radius_max = radius_max 37 | 38 | def __call__(self, img): 39 | do_it = random.random() <= self.prob 40 | if not do_it: 41 | return img 42 | 43 | img = img.filter( 44 | ImageFilter.GaussianBlur( 45 | radius=random.uniform(self.radius_min, self.radius_max) 46 | ) 47 | ) 48 | return img 49 | 50 | class Solarization(object): 51 | """ 52 | Apply Solarization to the PIL image. 53 | """ 54 | def __init__(self, p=0.2): 55 | self.p = p 56 | 57 | def __call__(self, img): 58 | if random.random() < self.p: 59 | return ImageOps.solarize(img) 60 | else: 61 | return img 62 | 63 | class gray_scale(object): 64 | """ 65 | Apply grayscale conversion (3-channel) to the PIL image. 66 | """ 67 | def __init__(self, p=0.2): 68 | self.p = p 69 | self.transf = transforms.Grayscale(3) 70 | 71 | def __call__(self, img): 72 | if random.random() < self.p: 73 | return self.transf(img) 74 | else: 75 | return img 76 | 77 | 78 | 79 | class horizontal_flip(object): 80 | """ 81 | Apply a horizontal flip to the PIL image.
82 | """ 83 | def __init__(self, p=0.2,activate_pred=False): 84 | self.p = p 85 | self.transf = transforms.RandomHorizontalFlip(p=1.0) 86 | 87 | def __call__(self, img): 88 | if random.random() < self.p: 89 | return self.transf(img) 90 | else: 91 | return img 92 | 93 | 94 | 95 | def new_data_aug_generator(args = None): 96 | img_size = args.input_size 97 | remove_random_resized_crop = args.src 98 | mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225] 99 | primary_tfl = [] 100 | scale=(0.08, 1.0) 101 | interpolation='bicubic' 102 | if remove_random_resized_crop: 103 | primary_tfl = [ 104 | transforms.Resize(img_size, interpolation=3), 105 | transforms.RandomCrop(img_size, padding=4,padding_mode='reflect'), 106 | transforms.RandomHorizontalFlip() 107 | ] 108 | else: 109 | primary_tfl = [ 110 | RandomResizedCropAndInterpolation( 111 | img_size, scale=scale, interpolation=interpolation), 112 | transforms.RandomHorizontalFlip() 113 | ] 114 | 115 | 116 | secondary_tfl = [transforms.RandomChoice([gray_scale(p=1.0), 117 | Solarization(p=1.0), 118 | GaussianBlur(p=1.0)])] 119 | 120 | if args.color_jitter is not None and not args.color_jitter==0: 121 | secondary_tfl.append(transforms.ColorJitter(args.color_jitter, args.color_jitter, args.color_jitter)) 122 | final_tfl = [ 123 | transforms.ToTensor(), 124 | transforms.Normalize( 125 | mean=torch.tensor(mean), 126 | std=torch.tensor(std)) 127 | ] 128 | return transforms.Compose(primary_tfl+secondary_tfl+final_tfl) 129 | -------------------------------------------------------------------------------- /software_model/engine.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu 3 | # @Date: 2022-11-30 21:32:37 4 | # @Last Modified by: Hanqing Zhu 5 | # @Last Modified time: 2023-03-29 01:56:00 6 | # Copyright (c) 2015-present, Facebook, Inc. 7 | # All rights reserved. 
8 | """ 9 | Train and eval functions used in main.py 10 | """ 11 | import math 12 | import sys 13 | from typing import Iterable, Optional 14 | 15 | import torch 16 | 17 | from timm.data import Mixup 18 | from timm.utils import accuracy, ModelEma 19 | 20 | from losses import DistillationLoss 21 | import utils 22 | 23 | 24 | def train_one_epoch(model: torch.nn.Module, criterion: DistillationLoss, 25 | data_loader: Iterable, optimizer: torch.optim.Optimizer, 26 | device: torch.device, epoch: int, loss_scaler, max_norm: float = 0, 27 | model_ema: Optional[ModelEma] = None, mixup_fn: Optional[Mixup] = None, 28 | set_training_mode=True, args = None): 29 | model.train(set_training_mode) 30 | metric_logger = utils.MetricLogger(delimiter=" ") 31 | metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}')) 32 | header = 'Epoch: [{}]'.format(epoch) 33 | print_freq = 10 34 | 35 | for samples, targets in metric_logger.log_every(data_loader, print_freq, header): 36 | samples = samples.to(device, non_blocking=True) 37 | targets = targets.to(device, non_blocking=True) 38 | 39 | if mixup_fn is not None: 40 | samples, targets = mixup_fn(samples, targets) 41 | 42 | if args.bce_loss: 43 | targets = targets.gt(0.0).type(targets.dtype) 44 | 45 | with torch.cuda.amp.autocast(): 46 | outputs = model(samples) 47 | loss = criterion(samples, outputs, targets) 48 | 49 | loss_value = loss.item() 50 | 51 | if not math.isfinite(loss_value): 52 | print("Loss is {}, stopping training".format(loss_value)) 53 | sys.exit(1) 54 | 55 | optimizer.zero_grad() 56 | 57 | # this attribute is added by timm on one optimizer (adahessian) 58 | is_second_order = hasattr(optimizer, 'is_second_order') and optimizer.is_second_order 59 | loss_scaler(loss, optimizer, clip_grad=max_norm, 60 | parameters=model.parameters(), create_graph=is_second_order) 61 | 62 | torch.cuda.synchronize() 63 | if model_ema is not None: 64 | model_ema.update(model) 65 | 66 | metric_logger.update(loss=loss_value) 67 | metric_logger.update(lr=optimizer.param_groups[0]["lr"]) 68 | # gather the stats from all processes 69 | metric_logger.synchronize_between_processes() 70 | print("Averaged stats:", metric_logger) 71 | return {k: meter.global_avg for k, meter in metric_logger.meters.items()} 72 | 73 | 74 | @torch.no_grad() 75 | def evaluate(data_loader, model, device): 76 | criterion = torch.nn.CrossEntropyLoss() 77 | 78 | metric_logger = utils.MetricLogger(delimiter=" ") 79 | header = 'Test:' 80 | 81 | # switch to evaluation mode 82 | model.eval() 83 | 84 | num_images = 0 85 | for images, target in metric_logger.log_every(data_loader, 10, header): 86 | images = images.to(device, non_blocking=True) 87 | target = target.to(device, non_blocking=True) 88 | num_images += images.shape[0] 89 | 90 | # compute output 91 | with torch.cuda.amp.autocast(): 92 | output = model(images) 93 | loss = criterion(output, target) 94 | 95 | acc1, acc5 = accuracy(output, target, topk=(1, 5)) 96 | 97 | batch_size = images.shape[0] 98 | metric_logger.update(loss=loss.item()) 99 | metric_logger.meters['acc1'].update(acc1.item(), n=batch_size) 100 | metric_logger.meters['acc5'].update(acc5.item(), n=batch_size) 101 | 102 | # gather the stats from all processes 103 | metric_logger.synchronize_between_processes() 104 | print('* Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f} loss {losses.global_avg:.3f}' 105 | .format(top1=metric_logger.acc1, top5=metric_logger.acc5, losses=metric_logger.loss)) 106 | 107 | return {k: meter.global_avg for k, meter in 
metric_logger.meters.items()} 108 | -------------------------------------------------------------------------------- /software_model/datasets.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved. 3 | import os 4 | import json 5 | 6 | from torchvision import datasets, transforms 7 | from torchvision.datasets.folder import ImageFolder, default_loader 8 | 9 | from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD 10 | from timm.data import create_transform 11 | 12 | 13 | class INatDataset(ImageFolder): 14 | def __init__(self, root, train=True, year=2018, transform=None, target_transform=None, 15 | category='name', loader=default_loader): 16 | self.transform = transform 17 | self.loader = loader 18 | self.target_transform = target_transform 19 | self.year = year 20 | # assert category in ['kingdom','phylum','class','order','supercategory','family','genus','name'] 21 | path_json = os.path.join(root, f'{"train" if train else "val"}{year}.json') 22 | with open(path_json) as json_file: 23 | data = json.load(json_file) 24 | 25 | with open(os.path.join(root, 'categories.json')) as json_file: 26 | data_catg = json.load(json_file) 27 | 28 | path_json_for_targeter = os.path.join(root, f"train{year}.json") 29 | 30 | with open(path_json_for_targeter) as json_file: 31 | data_for_targeter = json.load(json_file) 32 | 33 | targeter = {} 34 | indexer = 0 35 | for elem in data_for_targeter['annotations']: 36 | king = [] 37 | king.append(data_catg[int(elem['category_id'])][category]) 38 | if king[0] not in targeter.keys(): 39 | targeter[king[0]] = indexer 40 | indexer += 1 41 | self.nb_classes = len(targeter) 42 | 43 | self.samples = [] 44 | for elem in data['images']: 45 | cut = elem['file_name'].split('/') 46 | target_current = int(cut[2]) 47 | path_current = os.path.join(root, cut[0], cut[2], cut[3]) 48 | 49 | categors = data_catg[target_current] 50 | target_current_true = targeter[categors[category]] 51 | self.samples.append((path_current, target_current_true)) 52 | 53 | # __getitem__ and __len__ inherited from ImageFolder 54 | 55 | 56 | def build_dataset(is_train, args): 57 | transform = build_transform(is_train, args) 58 | 59 | if args.data_set == 'CIFAR': 60 | dataset = datasets.CIFAR100(args.data_path, train=is_train, transform=transform) 61 | nb_classes = 100 62 | elif args.data_set == 'IMNET': 63 | root = os.path.join(args.data_path, 'train' if is_train else 'val') 64 | dataset = datasets.ImageFolder(root, transform=transform) 65 | nb_classes = 1000 66 | elif args.data_set == 'INAT': 67 | dataset = INatDataset(args.data_path, train=is_train, year=2018, 68 | category=args.inat_category, transform=transform) 69 | nb_classes = dataset.nb_classes 70 | elif args.data_set == 'INAT19': 71 | dataset = INatDataset(args.data_path, train=is_train, year=2019, 72 | category=args.inat_category, transform=transform) 73 | nb_classes = dataset.nb_classes 74 | 75 | return dataset, nb_classes 76 | 77 | 78 | def build_transform(is_train, args): 79 | resize_im = args.input_size > 32 80 | if is_train: 81 | # this should always dispatch to transforms_imagenet_train 82 | transform = create_transform( 83 | input_size=args.input_size, 84 | is_training=True, 85 | color_jitter=args.color_jitter, 86 | auto_augment=args.aa, 87 | interpolation=args.train_interpolation, 88 | re_prob=args.reprob, 89 | re_mode=args.remode, 90 | re_count=args.recount, 91 | ) 92 | if not resize_im: 93 | # replace 
RandomResizedCropAndInterpolation with 94 | # RandomCrop 95 | transform.transforms[0] = transforms.RandomCrop( 96 | args.input_size, padding=4) 97 | return transform 98 | 99 | t = [] 100 | if resize_im: 101 | size = int(args.input_size / args.eval_crop_ratio) 102 | t.append( 103 | transforms.Resize(size, interpolation=3), # to maintain same ratio w.r.t. 224 images 104 | ) 105 | t.append(transforms.CenterCrop(args.input_size)) 106 | 107 | t.append(transforms.ToTensor()) 108 | t.append(transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD)) 109 | return transforms.Compose(t) 110 | -------------------------------------------------------------------------------- /profile/README.md: -------------------------------------------------------------------------------- 1 | # Profiling Workloads on GPU 2 | 3 | We provide scripts for profiling workloads on a GPU. The reported results are measured on a single A100 GPU with automatic mixed precision. 4 | 5 | 1. `vit_infer.py`, `bert_infer.py` for launching inference on a single device. 6 | 2. `model.py` provides a simplified DeiT and BERT implementation based on [a](https://github.com/zixuanjiang/pre-rmsnorm-transformer) and [b](https://github.com/lucidrains/vit-pytorch). 7 | 8 | 9 | We provide the benchmark logs of our tested results in `benchmark_logs/`. 10 | 11 | --- 12 | ## Latency Measurement 13 | 14 | We use `torch.utils.benchmark` to measure the latency of DeiT models and of BERT models on SST tasks (BERT for sequence classification). 15 | 16 | We set the minimum number of runs to 100 for each measurement. 17 | 18 | ### How to use 19 | 20 | #### DeiT 21 | Launch `python vit_infer.py -m model_name` to obtain the latency for DeiT. 22 | * `-m`: The flag for different models. Set it to `deit-t`, `deit-s`, or `deit-b` to test the latency of DeiT-Tiny, DeiT-Small, or DeiT-Base. 23 | 24 | #### BERT 25 | Launch `python bert_infer.py -m model_name -s seq_length` to obtain the latency for BERT for sequence classification. 26 | * `-m`: The flag for different models. Set it to `bert-b` or `bert-l` to test the latency of BERT-Base or BERT-Large. 27 | * `-s`: The flag for the sequence length. You can try 128, 256, 320, or 384 for BERT. 28 | 29 | ### Expected results 30 | If we set the model to DeiT-B and run `python vit_infer.py -m deit-b`, the reported results should look like 31 | 32 | ``` 33 | y = model(x): batch_size 1 method pre-ln 34 | Base 35 | setup: from __main__ import model 36 | Median: 4.37 ms 37 | IQR: 0.06 ms (4.36 to 4.42) 38 | 226 measurements, 100 runs per measurement, 1 thread 39 | [------------------- ------------------] 40 | | Base 41 | 1 threads: ------------------------------ 42 | batch_size 1 method pre-ln | 4.4 43 | 44 | Times are in milliseconds (ms). 45 | ``` 46 | 47 | --- 48 | 49 | ## Power Tracing 50 | 51 | We use nvidia-smi to monitor the power usage when running the workloads on the GPU. 52 | ``` 53 | nvidia-smi dmon -s puc -d 1 -i 0 > ./power_results/power_usage.csv 54 | ``` 55 | * `-s puc`: The `-s` flag specifies which metric groups to monitor. Here it is set to `puc`, which selects the power/temperature, utilization, and clock readings of the GPU. 56 | * `-d 1`: The `-d` flag specifies the update interval in seconds. Here, it is set to 1 second, which means that the power usage is sampled and recorded every second. 57 | * `-i 0`: The `-i` flag specifies the GPU index to monitor. In this case, it is set to 0, indicating that the monitoring should be done on GPU index 0. You can change this number to monitor a different GPU if you run jobs on different GPUs in your system. By default, we use the GPU with index 0. 58 | * `> ./power_results/power_usage.csv`: Save the monitored power usage into `power_usage.csv`. 59 | 60 | ### How to use 61 | 62 | Launch `power_monitor.sh` to monitor the power usage. You can save the power usage to a CSV file for further processing (a small parsing sketch is given below). 63 | 64 | Please run `power_monitor.sh` before launching the inference scripts `vit_infer.py` and `bert_infer.py`. 65 | 66 | ### Expected results 67 | 68 | Take DeiT-T as an example (see `./benchmark_logs/deit-s-power.csv`). 69 | The monitored power usage shows you the idle power (61 W in our case) and the work power (72 W). 70 | The power during inference is then 72-61=11 W. 71 |
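For convenience, the recorded trace can be post-processed with a few lines of Python. The sketch below is only an illustration and is not part of this repo: the script name is made up, the assumption that the `pwr` column is the second field follows the usual `nvidia-smi dmon` layout, and using the min/max samples as the idle/work power is a simplification of the workflow described below. It reports the idle power, the work power, and the energy of a single inference as `(P_work - P_idle) * latency`.

```python
# parse_power.py -- illustrative sketch only (not shipped with this repo)
# Assumes `nvidia-smi dmon -s puc` output: header lines start with '#',
# data lines are "<gpu idx> <pwr (W)> ..." with power in the second column.
import sys

def load_power_samples(csv_path):
    samples = []
    with open(csv_path) as f:
        for line in f:
            fields = line.split()
            if not fields or fields[0].startswith("#"):
                continue  # skip dmon header lines
            try:
                samples.append(float(fields[1]))  # pwr column, in watts
            except ValueError:
                continue  # skip lines without a numeric power reading
    return samples

if __name__ == "__main__":
    csv_path = sys.argv[1] if len(sys.argv) > 1 else "./power_results/power_usage.csv"
    latency_ms = float(sys.argv[2]) if len(sys.argv) > 2 else 4.37  # median latency reported by vit_infer.py / bert_infer.py
    samples = load_power_samples(csv_path)
    idle_w, work_w = min(samples), max(samples)  # idle vs. work power (simplification)
    print(f"idle {idle_w:.0f} W, work {work_w:.0f} W, inference power {work_w - idle_w:.0f} W")
    print(f"energy per inference ~ {(work_w - idle_w) * latency_ms:.2f} mJ")  # W * ms = mJ
```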
72 | ## Energy estimation 73 | 74 | Multiplying this power by the measured latency of a single inference gives the energy cost of a single inference. 75 | 76 | For example, the DeiT-Base model has a power of 26 W and a latency of 4.37 ms, so the energy cost is 113.62 mJ. 77 | 78 | --- 79 | 80 | ## AE workflow 81 | 82 | Follow these three steps to obtain both the latency and the power usage. 83 | Open two terminals on the same machine. 84 | 85 | * First, run `./power_monitor.sh > ./power_results/power_usage.csv` to monitor the power usage of GPU 0; the redirection saves the monitored power usage into `power_usage.csv`. 86 | * Then launch the latency measurement script, `python vit_infer.py -m model_name` or `python bert_infer.py -m model_name -s seq_length`, in *another terminal*. 87 | * Kill the power monitor script when the latency measurement finishes. 88 | 89 | Obtain the GPU power attributable to the workload by subtracting the idle power from the measured power. 90 | 91 | For example, the DeiT-Base model has a power of (87-61=26 W) and a latency of 4.37 ms, so the energy cost is 113.62 mJ. 92 | 93 | --- 94 | 95 | We refer to the following implementations: 96 | 1. [A simplified ViT implementation in PyTorch](https://github.com/lucidrains/vit-pytorch) 97 | 2. [BERT implementation from Nvidia](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/BERT) 98 | 3.
[Measurement codes from pre-rmsnorm-transformer](https://github.com/zixuanjiang/pre-rmsnorm-transformer) 99 | -------------------------------------------------------------------------------- /hardware_simulator/utils/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu 3 | # @Date: 2023-02-23 22:45:26 4 | # @Last Modified by: Hanqing Zhu 5 | # @Last Modified time: 2023-02-23 22:45:32 6 | """ 7 | Description: Modified based on torchpack 0.3.0 8 | Author: Jiaqi Gu (jqgu@utexas.edu) 9 | Date: 2021-06-06 01:46:57 10 | LastEditors: Jiaqi Gu (jqgu@utexas.edu) 11 | LastEditTime: 2021-06-06 01:46:57 12 | """ 13 | import hashlib 14 | import json 15 | import yaml 16 | import os 17 | from ast import literal_eval 18 | from typing import Any, Dict, List, Tuple, Union 19 | 20 | from multimethod import multimethod 21 | 22 | 23 | __all__ = [ 24 | "Config", 25 | "configs", 26 | ] 27 | 28 | 29 | class Config(dict): 30 | def __getattr__(self, key: str) -> Any: 31 | if key not in self: 32 | d = self 33 | ## try hierarchical access 34 | keys = key.split(".") 35 | for k in keys: 36 | if k not in d: 37 | raise AttributeError(key) 38 | d = d[k] 39 | return d 40 | else: 41 | return self[key] 42 | 43 | def __setattr__(self, key: str, value: Any) -> None: 44 | self[key] = value 45 | 46 | def __delattr__(self, key: str) -> None: 47 | del self[key] 48 | 49 | def load(self, fpath: str, *, recursive: bool = False) -> None: 50 | if not os.path.exists(fpath): 51 | raise FileNotFoundError(fpath) 52 | fpaths = [fpath] 53 | if recursive: 54 | while fpath: 55 | fpath = os.path.dirname(fpath) 56 | for fname in ["default.yaml", "default.yml"]: 57 | fpaths.append(os.path.join(fpath, fname)) 58 | for fpath in reversed(fpaths): 59 | if os.path.exists(fpath): 60 | with open(fpath, "r") as f: 61 | cfg_dict = yaml.safe_load(f) 62 | self.update(cfg_dict) 63 | 64 | def reload(self, fpath: str, *, recursive: bool = False) -> None: 65 | self.clear() 66 | self.load(fpath, recursive=recursive) 67 | 68 | @multimethod 69 | def update(self, other: Dict) -> None: 70 | for key, value in other.items(): 71 | if isinstance(value, dict): 72 | if key not in self or not isinstance(self[key], Config): 73 | self[key] = Config() 74 | self[key].update(value) 75 | else: 76 | self[key] = value 77 | 78 | @multimethod 79 | def update(self, opts: Union[List, Tuple]) -> None: 80 | index = 0 81 | while index < len(opts): 82 | opt = opts[index] 83 | if opt.startswith("--"): 84 | opt = opt[2:] 85 | if "=" in opt: 86 | key, value = opt.split("=", 1) 87 | index += 1 88 | else: 89 | key, value = opt, opts[index + 1] 90 | index += 2 91 | current = self 92 | subkeys = key.split(".") 93 | try: 94 | value = literal_eval(value) 95 | except: 96 | pass 97 | for subkey in subkeys[:-1]: 98 | current = current.setdefault(subkey, Config()) 99 | current[subkeys[-1]] = value 100 | 101 | def dict(self) -> Dict[str, Any]: 102 | configs = dict() 103 | for key, value in self.items(): 104 | if isinstance(value, Config): 105 | value = value.dict() 106 | configs[key] = value 107 | return configs 108 | 109 | def flat_dict(self) -> Dict[str, Any]: 110 | def _flatten_dict(dd, separator: str = "_", prefix: str = ""): 111 | return ( 112 | { 113 | prefix + separator + k if prefix else k: v 114 | for kk, vv in dd.items() 115 | for k, v in _flatten_dict(vv, separator, kk).items() 116 | } 117 | if isinstance(dd, dict) 118 | else {prefix: dd} 119 | ) 120 | 121 | return _flatten_dict(self.dict(), 
separator=".") 122 | 123 | def hash(self) -> str: 124 | buffer = json.dumps(self.dict(), sort_keys=True) 125 | return hashlib.sha256(buffer.encode()).hexdigest() 126 | 127 | def dump_to_yml(self, path: str) -> None: 128 | with open(path, "w") as f: 129 | yaml.safe_dump(self.dict(), f) 130 | 131 | def __str__(self) -> str: 132 | texts = [] 133 | for key, value in self.items(): 134 | if isinstance(value, Config): 135 | seperator = "\n" 136 | else: 137 | seperator = " " 138 | text = key + ":" + seperator + str(value) 139 | lines = text.split("\n") 140 | for k, line in enumerate(lines[1:]): 141 | lines[k + 1] = (" " * 2) + line 142 | texts.extend(lines) 143 | return "\n".join(texts) 144 | 145 | 146 | configs = Config() -------------------------------------------------------------------------------- /hardware_simulator/entry_energy_latency_workload.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu(hqzhu@utexas.edu) 3 | # @Date: 1969-12-31 18:00:00 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-12 21:02:24 6 | 7 | import os 8 | import csv 9 | import argparse 10 | from utils.general import ensure_dir 11 | from utils.config import configs 12 | from utils.model import modelParams 13 | 14 | from simulator_FFN import FFNPrediction 15 | from simulator_attn import attnPrediction 16 | 17 | def main(configs, model_name='deit-s', exp_name='compare_onn', optimize_flag='arch_opt', tokens=197, print_msg=False): 18 | # extraxt model workload charaterstics 19 | model_zoo = modelParams() 20 | ops_list = model_zoo.obtain_ops_list(model_name=model_name, tokens=tokens) 21 | 22 | if model_name == 'bert-l': 23 | factor = 2 24 | else: 25 | factor = 1 26 | 27 | sv_path = f"./results/{exp_name}/{model_name}_{tokens}_{configs.core.precision.in_bit}bit/{configs.core.type}_{optimize_flag}_{configs.arch.num_tiles}t_{configs.arch.num_pe_per_tile}c/" 28 | sv_sub_path = f"./results/{exp_name}/{model_name}_{tokens}_{configs.core.precision.in_bit}bit/{configs.core.type}_{optimize_flag}_{configs.arch.num_tiles}t_{configs.arch.num_pe_per_tile}c/modules/" 29 | 30 | ensure_dir(sv_path) 31 | ensure_dir(sv_sub_path) 32 | 33 | energy_sum = 0 34 | latency_sum = 0 35 | saved_arrays = [] 36 | for item in ops_list: 37 | idx = item["idx"] 38 | name = item["name"] 39 | type = item["type"] 40 | if type == "fc": 41 | predictor = FFNPrediction(item, configs) 42 | predictor.run(print_msg=print_msg) 43 | predictor.save(sv_name=name, sv_path=sv_sub_path) 44 | energy_cost = predictor.energy_dict['linear']['comp']['total'][0] + \ 45 | predictor.energy_dict['linear']['datamovement']['total'][0] 46 | 47 | latency_cost = predictor.latency_dict['linear']['total'][1] 48 | if not 'head' in name and not 'embed' in name: 49 | energy_cost *= 12 * factor 50 | latency_cost *= 12 * factor 51 | saved_arrays.append([name, energy_cost, latency_cost]) 52 | elif type == 'attn': 53 | if configs.core.type != 'mzi': 54 | predictor = attnPrediction(item, configs) 55 | predictor.run(print_msg=print_msg) 56 | predictor.save(sv_name=name, sv_path=sv_sub_path) 57 | energy_cost = predictor.energy_dict['Q*K^T']['comp']['total'][0] + predictor.energy_dict['Q*K^T']['datamovement']['total'][0] + \ 58 | predictor.energy_dict['S*V']['comp']['total'][0] + predictor.energy_dict['S*V']['datamovement']['total'][0] 59 | # print(predictor.energy_dict['linear']['comp']) 60 | latency_cost = predictor.latency_dict['Q*K^T']['total'][1] + 
predictor.latency_dict['S*V']['total'][1] 61 | energy_cost *= 12 * factor 62 | latency_cost *= 12 * factor 63 | saved_arrays.append([name, energy_cost, latency_cost]) 64 | else: 65 | raise NotImplementedError 66 | energy_sum += energy_cost 67 | latency_sum += latency_cost 68 | 69 | energy_others, latency_others = model_zoo.obtain_other_costs(model_name=model_name, tokens=tokens) 70 | saved_arrays.append(["others", energy_others, latency_others]) 71 | energy_sum += energy_others 72 | latency_sum += latency_others 73 | 74 | def __save_csv(sv_name, total, arrays): 75 | with open(sv_name, 'w') as csvfile: 76 | writer = csv.writer(csvfile) 77 | writer.writerow(['', 'energy (mJ)', 'latency (ms)']) 78 | writer.writerow(total) 79 | for each in arrays: 80 | writer.writerow(each) 81 | __save_csv(os.path.join(sv_path, 'total.csv'), [ 82 | 'total', energy_sum, latency_sum], saved_arrays) 83 | 84 | 85 | if __name__ == "__main__": 86 | parser = argparse.ArgumentParser() 87 | parser.add_argument("-c", "--config", default=".params.yaml", 88 | metavar="FILE", help="config file") 89 | parser.add_argument("-m", "--model_name", default="deit-s", 90 | help="model") 91 | parser.add_argument("-t", "--tokens", default=197, 92 | help="tokens or sequence length") 93 | parser.add_argument("-o", "--optimize_flag", default="arch_opt", 94 | help="optimize flag for DOTA") 95 | parser.add_argument("-e", "--exp_name", default="compare_onn", 96 | help="experiments name") 97 | args, opts = parser.parse_known_args() 98 | configs.load(args.config, recursive=True) 99 | configs.update(opts) 100 | 101 | if configs.core.type == "dota": 102 | # three different optimize flag 103 | # broadcast 104 | # crossbar 105 | # arch-opt 106 | assert args.optimize_flag in ["broadcast", "crossbar", "arch_opt"] 107 | configs.arch.disable_crossbar_topology = 1 if args.optimize_flag == "broadcast" else 0 108 | if args.optimize_flag == "arch_opt": 109 | configs.arch.adc_share_flag = 1 110 | configs.arch.time_accum_factor = 3 111 | configs.arch.input_mod_sharing_flag = 1 112 | else: 113 | configs.arch.adc_share_flag = 0 114 | configs.arch.time_accum_factor = 1 115 | configs.arch.input_mod_sharing_flag = 0 116 | elif configs.core.type == 'mrrbank' or configs.core.type == 'mzi': 117 | configs.arch.weight_reuse_factor = -1 # fully weight-stationary flow 118 | args.optimize_flag = 'broadcast' 119 | else: 120 | raise ValueError(f"Got unsupportted core type {configs.core.type}") 121 | print(f"Report energy and latency estimation for {args.model_name}_{args.tokens}_{configs.core.precision.in_bit}bit on {configs.core.type}_{args.optimize_flag}_{configs.arch.num_tiles}t_{configs.arch.num_pe_per_tile}c") 122 | 123 | main(configs=configs, model_name=args.model_name, exp_name=args.exp_name, optimize_flag=args.optimize_flag, tokens=int(args.tokens)) 124 | 125 | sv_path = f"./results/{args.exp_name}/{args.model_name}_{args.tokens}_{configs.core.precision.in_bit}bit/{configs.core.type}_{args.optimize_flag}_{configs.arch.num_tiles}t_{configs.arch.num_pe_per_tile}c" 126 | 127 | print(f'Finish and save report to {sv_path}') 128 | print('-'*20) -------------------------------------------------------------------------------- /hardware_simulator/readme.md: -------------------------------------------------------------------------------- 1 | # Hardware simulator for our photonic Transformer accelerator 2 | 3 | This contains the hardware simulator for our photonic Transformer accelerator, DOTA, in our lightning-transformer work. 
4 | Our simulator is based on behavior-level simulation. 5 | 6 | We support simulating our DOTA-B/L variants in 4-bit/8-bit work modes, and we also support simulating the photonic baselines, MRR bank and MZI. 7 | 8 | --- 9 | 10 | ## Code structures 11 | 12 | * `./hardware/`. This directory contains the modeling of the photonic tensor cores, including our dynamically-operated crossbar-style PTC and two baselines: MRR bank and MZI. 13 | * `./params/`. 14 | * `./params/device_params/` This directory contains the detailed accelerator params as well as all the device parameters. 15 | * DOTA-B: A 4-tile variant of our DOTA photonic Transformer accelerator. 16 | * DOTA-L: An 8-tile variant of our DOTA photonic Transformer accelerator. 17 | * MZI: A 2-tile variant of the MZI mesh. 18 | * MRR bank: A 7-tile variant of the MRR bank. 19 | * *NOTE: we keep DOTA-B, MZI, and MRR bank under the same area budget for a fair comparison.* 20 | 21 | * `entry_area_power_profile.py`. The Python file you can launch to profile the area and power of the accelerator. 22 | * `entry_energy_latency_workload.py`. The Python file you can launch to profile the energy and latency when running one workload on a given accelerator. 23 | * `/results/`. The generated results will be dumped into this directory. 24 | * `/utils/`. Utility functions. 25 | 26 | ## AE exp1: Simulate the area and power of our photonic accelerator. 27 | 28 | ### Single run via `entry_area_power_profile.py` 29 | 30 | To simulate the area and power, run 31 | ``` 32 | exp='area_power_profile_single' # exp name you give 33 | config='./params/device_params/Dota_B_4bit.yaml' # the param file of the given photonic accelerator 34 | 35 | python entry_area_power_profile.py \ 36 | -e ${exp} \ 37 | --config ${config} 38 | ``` 39 | 40 | It will generate the area and power report under `./results/exp_name_you_give/accelerator_name/`. It contains two CSV files for the area and power estimation. 41 | 42 | For example, if you run 43 | ``` 44 | python entry_area_power_profile.py -e area_power_profile_single --config ./params/device_params/Dota_B_4bit.yaml 45 | ``` 46 | You will get the area and power report under `./results/area_power_profile_single/dota_4t_2c_4bit/`. 47 | The area report should look like 48 | 49 | |dota |area (mm^2) |percentage (%)| 50 | |-------------|--------------------|--------------| 51 | |total |60.329395086 |1 | 52 | |laser |0.72 |1.19 | 53 | |DAC |15.84 |26.26 | 54 | |MZM |7.59416832 |12.59 | 55 | |ADC |1.6416 |2.72 | 56 | |TIA |0.0576 |0.1 | 57 | |photonic_core|11.318291999999998 |18.76 | 58 | |adder |0.051199999999999996|0.08 | 59 | |mem |14.695398766000002 |24.36 | 60 | |micro_comb |8.411135999999999 |13.94 | 61 | 62 | 63 | *Note that we only provide the area report (no power report) for the MZI and MRR baselines.* 64 |
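The `percentage (%)` column is each component's share of the total accelerator area, with the `total` row as the reference. For example, the DAC row above corresponds to 15.84 mm^2 / 60.33 mm^2 ≈ 26.26 %, and the laser row to 0.72 mm^2 / 60.33 mm^2 ≈ 1.19 %; the per-component percentages sum to 100 %.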
65 | ### Batch run via `./scripts/area_power_all.sh` 66 | 67 | We provide one script to run the area and power estimation for all photonic accelerator variants: DOTA-B-4/8bit, DOTA-L-4/8bit, MRR-4/8bit, and MZI-4/8bit. It generates results under `./results/area_power_all/`. 68 | 69 | 70 | ## AE exp2: Simulate the energy and latency when running a workload on the photonic system. 71 | 72 | ### Single run via `entry_energy_latency_workload.py` 73 | 74 | To simulate the energy and latency for a given Transformer workload (DeiT-T/S/B, BERT-B/L in our work), run 75 | ``` 76 | exp='energy_latency_single_workload' # exp name 77 | model_name='deit-t' # model name 78 | tokens=197 # number of tokens, 197 for deit; you can define the number of tokens for bert 79 | onn_params='./params/device_params/Dota_B_4bit.yaml' 80 | # choose the onn accelerator params from 81 | # config_dict=( 82 | # ['dota_b_4bit']='./params/device_params/Dota_B_4bit.yaml' 83 | # ['dota_b_8bit']='./params/device_params/Dota_B_8bit.yaml' 84 | # ['dota_l_4bit']='./params/device_params/Dota_L_4bit.yaml' 85 | # ['dota_l_8bit']='./params/device_params/Dota_L_8bit.yaml' 86 | # ['mrr_4bit']='./params/device_params/Bs_mrr_bank_4bit.yaml' 87 | # ['mrr_8bit']='./params/device_params/Bs_mrr_bank_8bit.yaml' 88 | # ['mzi_4bit']='./params/device_params/Bs_mzi_4bit.yaml' 89 | # ['mzi_8bit']='./params/device_params/Bs_mzi_8bit.yaml' 90 | # ) 91 | 92 | python entry_energy_latency_workload.py \ 93 | -e ${exp} \ 94 | --tokens ${tokens} \ 95 | --model_name ${model_name} \ 96 | --config ${onn_params} 97 | ``` 98 | 99 | It will generate the energy and latency report under `./results/exp_name_you_give/accelerator_name/`. 100 | 101 | This directory contains a `total.csv` with the energy and latency estimation, which also has a breakdown over the different layer types, e.g., attention/FFN/QKV/head. 102 | 103 | We also provide a more detailed energy breakdown for the different layer types under `modules/` in this directory. 104 | It provides the energy breakdown across the different components (e.g., laser, DAC, ADC, data movement, etc.). 105 | 106 | You can change the arguments for `model_name` and the corresponding tokens. We support the following arguments for `model_name`: 107 | * deit-t 108 | * deit-s 109 | * deit-b 110 | * bert-b 111 | * bert-l 112 | 113 | The correct token number for DeiT on the ImageNet dataset is 197. For BERT, you can vary the number of tokens. 114 | 115 | You can also enable/disable the architecture-level optimizations for our DOTA by setting the argument 116 | ``` 117 | --optimize_flag arch_opt # set to crossbar to disable arch optimization 118 | ``` 119 | 120 | ### Batch run via `./scripts/energy_latency_all.sh` 121 | 122 | We provide one script to run the estimation for all workloads we used in our paper: 123 | * deit-t with tokens=197 124 | * deit-s with tokens=197 125 | * deit-b with tokens=197 126 | * bert-b with tokens=384 127 | * bert-l with tokens=320 128 | for the photonic accelerator variants DOTA-B-4/8bit, DOTA-L-4/8bit, MRR-4/8bit, and MZI-4/8bit. It generates results under `./results/energy_latency_all/` (a post-processing sketch is given at the end of this readme). 129 | 130 | *Note that we only provide reports on linear layers for MZI, since it cannot support attention efficiently due to the on-the-fly activation decomposition, which is extremely expensive.* 131 |
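As a convenience for post-processing a batch run, the per-workload `total.csv` files can be collected with a few lines of Python. The snippet below is only an illustrative sketch and is not shipped with the repo; it assumes the directory layout produced by `entry_energy_latency_workload.py` (exp name `energy_latency_all`, one sub-directory per model and per accelerator config) and the `total.csv` format written by that script (a header row, then the `total` row, then one row per layer type).

```python
# aggregate_results.py -- illustrative sketch only, not part of the repo
import csv
import glob
import os

# Each total.csv looks like:
#   ['', 'energy (mJ)', 'latency (ms)']
#   ['total', <energy>, <latency>]
#   ['qkv'/'attn'/'FFN1'/..., <energy>, <latency>]
for path in sorted(glob.glob("./results/energy_latency_all/*/*/total.csv")):
    with open(path) as f:
        rows = list(csv.reader(f))
    total_row = rows[1]  # the 'total' row written right after the header
    workload = os.path.relpath(os.path.dirname(path), "./results/energy_latency_all")
    print(f"{workload}: {float(total_row[1]):.4f} mJ, {float(total_row[2]):.6f} ms")
```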
-------------------------------------------------------------------------------- /hardware_simulator/hardware/photonic_core_base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu(hqzhu@utexas.edu) 3 | # @Date: 1969-12-31 18:00:00 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-08 17:23:00 6 | # Basic class for photonic cores 7 | from .ADC import ADC 8 | from .DAC import DAC 9 | 10 | __all__ = ["PhotonicCore"] 11 | 12 | class PhotonicCore(): 13 | def __init__(self, *args, **kwargs): 14 | super().__init__(*args, **kwargs) 15 | self.photonic_core_type = None 16 | self.width = None 17 | self.height = None 18 | 19 | ## obtain params for photonic devices 20 | def _obtain_laser_param(self, config=None): 21 | if config is not None: 22 | self.laser_power = config.power 23 | self.laser_length = config.length 24 | self.laser_width = config.width 25 | self.laser_area = self.laser_length * self.laser_width 26 | self.laser_wall_plug_eff = config.wall_plug_eff 27 | else: 28 | self.laser_power = 0.5 29 | self.laser_length = 400 30 | self.laser_width = 300 31 | self.laser_area = self.laser_length * self.laser_width 32 | self.laser_wall_plug_eff = 0.25 33 | 34 | def _obtain_micro_comb_param(self, config=None): 35 | if config is not None: 36 | self.micro_comb_length = config.length 37 | self.micro_comb_width = config.width 38 | else: 39 | self.micro_comb_length = 1184 40 | self.micro_comb_width = 1184 41 | self.micro_comb_area = self.micro_comb_length * self.micro_comb_width 42 | 43 | # modulator 44 | def _obtain_modulator_param(self, config=None): 45 | if config is not None: 46 | self.modulator_type = config.type 47 | self.modulator_energy_per_bit = config.energy_per_bit 48 | self.modulator_power_static = config.static_power 49 | self.modulator_length = config.length 50 | self.modulator_width = config.width 51 | self.modulator_insertion_loss = config.insertion_loss 52 | else: 53 | self.modulator_energy_per_bit = 400 54 | self.modulator_power_static = 0 55 | self.modulator_length = 300 56 | self.modulator_width = 50 57 | self.modulator_insertion_loss = 0.8 58 | 59 | # basic devices 60 | def _obtain_y_branch_param(self, config=None): 61 | if config is not None: 62 | self.y_branch_length = config.length 63 | self.y_branch_width = config.width 64 | self.y_branch_insertion_loss = config.insertion_loss 65 | else: 66 | self.y_branch_length = 75 67 | self.y_branch_width = 3.9 68 | self.y_branch_insertion_loss = 0.1 69 | 70 | def _obtain_photodetector_param(self, config=None): 71 | if config is not None: 72 | self.photo_detector_power = config.power 73 | self.photo_detector_length = config.length 74 | self.photo_detector_width = config.width 75 | self.photo_detector_sensitivity = config.sensitivity 76 | else: 77 | self.photo_detector_power = 2.8 78 | self.photo_detector_length = 40 79 | self.photo_detector_width = 40 80 | self.photo_detector_sensitivity = -25 81 | 82 | def _obtain_direction_coupler_param(self, config=None): 83 | if config is not None: 84 | self.direction_coupler_length = config.length 85 | self.direction_coupler_width = config.width 86 | self.direction_coupler_insertion_loss = config.insertion_loss 87 | else: 88 |
self.direction_coupler_length = 75 89 | self.direction_coupler_width = 10 90 | self.direction_coupler_insertion_loss = 0.3 91 | 92 | def _obtain_phase_shifter_param(self, config=None): 93 | if config is not None: 94 | self.phase_shifter_power_dynamic = config.dynamic_power 95 | self.phase_shifter_power_static = config.static_power 96 | self.phase_shifter_length = config.length 97 | self.phase_shifter_width = config.width 98 | self.phase_shifter_insertion_loss = config.insertion_loss 99 | # self.phase_shifter_programming_time = config.programming_time 100 | else: 101 | self.phase_shifter_power_dynamic = 0 102 | self.phase_shifter_power_static = 0 103 | self.phase_shifter_length = 200 104 | self.phase_shifter_width = 34 105 | self.phase_shifter_insertion_loss = 0.2 106 | # self.phase_shifter_programming_time = 10 # ns based on NEOM 107 | 108 | 109 | def _obtain_mrr_router_param(self, config=None): 110 | if config is not None: 111 | self.mrr_router_power = config.static_power 112 | self.mrr_router_length = config.length 113 | self.mrr_router_width = config.width 114 | self.mrr_router_insertion_loss = config.insertion_loss 115 | else: 116 | self.mrr_router_power = 2.4 117 | self.mrr_router_length = 20 118 | self.mrr_router_width = 20 119 | self.mrr_router_insertion_loss = 0.25 120 | 121 | def _obtain_TIA_param(self, config=None): 122 | if config is not None: 123 | self.TIA_power = config.power 124 | self.TIA_area = config.area 125 | else: 126 | raise NotImplementedError 127 | self.TIA_power = 3 128 | self.TIA_area = 5200 129 | 130 | def _obtain_ADC_param(self, config=None): 131 | if config is not None: 132 | ADC_choice = config.choice 133 | self.core_ADC_sharing_factor = config.sharing_factor 134 | self.ADC = ADC(ADC_choice) 135 | else: 136 | raise NotImplementedError 137 | 138 | def _obtain_DAC_param(self, config=None): 139 | if config is not None: 140 | DAC_choice = config.choice 141 | self.DAC = DAC(DAC_choice) 142 | else: 143 | raise NotImplementedError 144 | 145 | 146 | ## calculate area, insertion loss and energy cost 147 | def cal_insertion_loss(self): 148 | raise NotImplementedError 149 | 150 | def cal_TX_energy(self): 151 | raise NotImplementedError 152 | 153 | def cal_D2A_energy(self): 154 | raise NotImplementedError 155 | 156 | def cal_RX_energy(self): 157 | raise NotImplementedError 158 | 159 | def cal_A2D_energy(self): 160 | raise NotImplementedError 161 | 162 | def cal_comp_energy(self): 163 | raise NotImplementedError 164 | 165 | def cal_laser_energy(self): 166 | raise NotImplementedError 167 | 168 | def cal_core_area(self): 169 | raise NotImplementedError 170 | 171 | def cal_core_power(self): 172 | raise NotImplementedError 173 | -------------------------------------------------------------------------------- /hardware_simulator/utils/model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu(hqzhu@utexas.edu) 3 | # @Date: 1969-12-31 18:00:00 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-11 02:41:52 6 | from utils.config import configs 7 | from utils.cal_flops_for_transformer import get_infer_ops 8 | 9 | model_zoo ={ 10 | 'deit-t': {'patch': 16, 'depth': 12, 'embed_dim': 192, 'num_heads': 3, 'mlp_ratio': 4, 'tokens': 197}, 11 | 'deit-s': {'patch': 16, 'depth': 12, 'embed_dim': 384, 'num_heads': 6, 'mlp_ratio': 4, 'tokens': 197}, 12 | 'deit-b': {'patch': 16, 'depth': 12, 'embed_dim': 768, 'num_heads': 12, 'mlp_ratio': 4, 'tokens': 197}, 13 | 
'bert-b': {'depth': 12, 'embed_dim': 768, 'num_heads': 12, 'mlp_ratio': 4, 'tokens': 384}, 14 | 'bert-l': {'depth': 24, 'embed_dim': 1024, 'num_heads': 16, 'mlp_ratio': 4, 'tokens': 320}, 15 | } 16 | 17 | 18 | class modelParams(): 19 | # generate op list based on model param 20 | def __init__(self) -> None: 21 | super().__init__() 22 | 23 | def obtain_other_costs(self, model_name='deit-t', tokens=None): 24 | """Function to return estimated energy and latency for non-GEMM ops""" 25 | energy, latency = 0, 0 26 | 27 | tokens = model_zoo[model_name]["tokens"] if tokens is None else tokens 28 | softmax_ops, layer_norm_ops, residual_ops, activation_ops = get_infer_ops( 29 | h_d=model_zoo[model_name]["embed_dim"], 30 | l_s=model_zoo[model_name]["depth"], 31 | seq= tokens, 32 | heads=model_zoo[model_name]["num_heads"], 33 | head_size=model_zoo[model_name]["embed_dim"] //model_zoo[model_name]["num_heads"] 34 | ) 35 | bits = 4 # default is 4 bits 36 | 37 | # softmax estimation use 38 | # "high-speed and low-complexity architecture for softmax function in deep learning,” in 2018 IEEE asia pacific conference on circuits and systems (APCCAS 39 | softmax_energy_byte = 51.6 / 44.8 * 1e-9 # mJ/Byte 40 | # other uses mac * ops 41 | LAYER_NORM_FLOPS = 5 42 | # GELU: 0.5 * x * (1 + tanh(sqrt(2 / np.pi) * (x + 0.044715 * pow(x, 3)))) 43 | ACTIVATION_FLOPS = 8 44 | 45 | comp_energy = (activation_ops*ACTIVATION_FLOPS + layer_norm_ops * LAYER_NORM_FLOPS + residual_ops) * 100 * 1e-12 + softmax_energy_byte * softmax_ops * bits /8 46 | datamovement_energy = (activation_ops + residual_ops + layer_norm_ops + softmax_ops) * 1.655e-9 * bits / 16 * 2 47 | energy = comp_energy + datamovement_energy 48 | 49 | # latency: 50 | # estimated as memory access latency since all activations are stored on-chip 51 | bandwidth_sram = 1 / 0.604347 * 64 * 64* 1024 * 1024 * 1024 * 8 52 | clock_frequency = 500 * 1e6 53 | latency = (softmax_ops + layer_norm_ops + residual_ops + activation_ops ) * bits / bandwidth_sram 54 | 55 | return energy, latency 56 | 57 | def obtain_ops_list(self, model_name='deit-t', tokens=None): 58 | """Function to return the GEMM workloads dict""" 59 | ops_list = [] 60 | if 'deit' in model_name: 61 | model_params = model_zoo[model_name] 62 | patch = model_params['patch'] 63 | depth = model_params['depth'] 64 | embed_dim = model_params['embed_dim'] 65 | num_heads = model_params['num_heads'] 66 | mlp_ratio = model_params['mlp_ratio'] 67 | num_classes = 1000 68 | tokens = tokens if tokens is not None else model_params['tokens'] 69 | idx = 0 70 | # deit family 71 | # first a 3 by 3 conv 72 | ops_list.append( 73 | {"idx": idx, "name": 'embed', "type": "fc", "in_features": 3*patch*patch, "out_features": embed_dim, "bs": 196} 74 | ) 75 | idx += 1 76 | # atten block 77 | ops_list.append( 78 | {"idx": idx, "name": 'qkv', "type": "fc", "in_features": embed_dim, "out_features": embed_dim*3, "bs": tokens} 79 | ) 80 | idx += 1 81 | ops_list.append( 82 | {"idx": idx, "name": 'attn', "type": "attn", "num_heads": num_heads, "embed_dim": embed_dim, "num_tokens": tokens} 83 | ) 84 | idx += 1 85 | ops_list.append( 86 | {"idx": idx, "name": 'proj', "type": "fc", "in_features": embed_dim, "out_features": embed_dim, "bs": tokens} 87 | ) 88 | idx += 1 89 | ops_list.append( 90 | {"idx": idx, "name": 'FFN1', "type": "fc", "in_features": embed_dim, "out_features": embed_dim*mlp_ratio, "bs": tokens} 91 | ) 92 | idx += 1 93 | ops_list.append( 94 | {"idx": idx, "name": 'FFN2', "type": "fc", "in_features": embed_dim*mlp_ratio, 
"out_features": embed_dim, "bs": tokens} 95 | ) 96 | idx += 1 97 | ops_list.append( 98 | {"idx": idx, "name": 'head', "type": "fc", "in_features": embed_dim, "out_features": num_classes, "bs": 1} 99 | ) 100 | elif 'bert' in model_name: 101 | model_params = model_zoo[model_name] 102 | depth = model_params['depth'] 103 | embed_dim = model_params['embed_dim'] 104 | num_heads = model_params['num_heads'] 105 | mlp_ratio = model_params['mlp_ratio'] 106 | num_classes = 2 107 | tokens = tokens if tokens is not None else model_params['tokens'] 108 | idx = 0 109 | ops_list.append( 110 | {"idx": idx, "name": 'qkv', "type": "fc", "in_features": embed_dim, "out_features": embed_dim*3, "bs": tokens} 111 | ) 112 | idx += 1 113 | ops_list.append( 114 | {"idx": idx, "name": 'attn', "type": "attn", "num_heads": num_heads, "embed_dim": embed_dim, "num_tokens": tokens} 115 | ) 116 | idx += 1 117 | ops_list.append( 118 | {"idx": idx, "name": 'proj', "type": "fc", "in_features": embed_dim, "out_features": embed_dim, "bs": tokens} 119 | ) 120 | idx += 1 121 | ops_list.append( 122 | {"idx": idx, "name": 'FFN1', "type": "fc", "in_features": embed_dim, "out_features": embed_dim*mlp_ratio, "bs": tokens} 123 | ) 124 | idx += 1 125 | ops_list.append( 126 | {"idx": idx, "name": 'FFN2', "type": "fc", "in_features": embed_dim*mlp_ratio, "out_features": embed_dim, "bs": tokens} 127 | ) 128 | idx += 1 129 | ops_list.append( 130 | {"idx": idx, "name": 'head', "type": "fc", "in_features": embed_dim, "out_features": num_classes, "bs": 1} 131 | ) 132 | 133 | return ops_list 134 | 135 | if __name__ == "__main__": 136 | test = modelParams() 137 | ops_list = test.obtain_ops_list('bert-l', tokens=384) 138 | print(ops_list) 139 | 140 | test.obtain_other_costs('bert-l', tokens=384) -------------------------------------------------------------------------------- /software_model/resmlp_models.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved. 3 | import torch 4 | import torch.nn as nn 5 | from functools import partial 6 | 7 | from timm.models.vision_transformer import Mlp, PatchEmbed , _cfg 8 | from timm.models.registry import register_model 9 | from timm.models.layers import trunc_normal_, DropPath 10 | 11 | 12 | __all__ = [ 13 | 'resmlp_12', 'resmlp_24', 'resmlp_36', 'resmlpB_24' 14 | ] 15 | 16 | class Affine(nn.Module): 17 | def __init__(self, dim): 18 | super().__init__() 19 | self.alpha = nn.Parameter(torch.ones(dim)) 20 | self.beta = nn.Parameter(torch.zeros(dim)) 21 | 22 | def forward(self, x): 23 | return self.alpha * x + self.beta 24 | 25 | class layers_scale_mlp_blocks(nn.Module): 26 | 27 | def __init__(self, dim, drop=0., drop_path=0., act_layer=nn.GELU,init_values=1e-4,num_patches = 196): 28 | super().__init__() 29 | self.norm1 = Affine(dim) 30 | self.attn = nn.Linear(num_patches, num_patches) 31 | self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() 32 | self.norm2 = Affine(dim) 33 | self.mlp = Mlp(in_features=dim, hidden_features=int(4.0 * dim), act_layer=act_layer, drop=drop) 34 | self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True) 35 | self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True) 36 | 37 | def forward(self, x): 38 | x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x).transpose(1,2)).transpose(1,2)) 39 | x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) 40 | return x 41 | 42 | 43 | class resmlp_models(nn.Module): 44 | 45 | def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,drop_rate=0., 46 | Patch_layer=PatchEmbed,act_layer=nn.GELU, 47 | drop_path_rate=0.0,init_scale=1e-4): 48 | super().__init__() 49 | 50 | 51 | 52 | self.num_classes = num_classes 53 | self.num_features = self.embed_dim = embed_dim 54 | 55 | self.patch_embed = Patch_layer( 56 | img_size=img_size, patch_size=patch_size, in_chans=int(in_chans), embed_dim=embed_dim) 57 | num_patches = self.patch_embed.num_patches 58 | dpr = [drop_path_rate for i in range(depth)] 59 | 60 | self.blocks = nn.ModuleList([ 61 | layers_scale_mlp_blocks( 62 | dim=embed_dim,drop=drop_rate,drop_path=dpr[i], 63 | act_layer=act_layer,init_values=init_scale, 64 | num_patches=num_patches) 65 | for i in range(depth)]) 66 | 67 | 68 | self.norm = Affine(embed_dim) 69 | 70 | 71 | 72 | self.feature_info = [dict(num_chs=embed_dim, reduction=0, module='head')] 73 | self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() 74 | self.apply(self._init_weights) 75 | 76 | def _init_weights(self, m): 77 | if isinstance(m, nn.Linear): 78 | trunc_normal_(m.weight, std=0.02) 79 | if m.bias is not None: 80 | nn.init.constant_(m.bias, 0) 81 | elif isinstance(m, nn.LayerNorm): 82 | nn.init.constant_(m.bias, 0) 83 | nn.init.constant_(m.weight, 1.0) 84 | 85 | 86 | 87 | def get_classifier(self): 88 | return self.head 89 | 90 | def reset_classifier(self, num_classes, global_pool=''): 91 | self.num_classes = num_classes 92 | self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() 93 | 94 | def forward_features(self, x): 95 | B = x.shape[0] 96 | 97 | x = self.patch_embed(x) 98 | 99 | for i , blk in enumerate(self.blocks): 100 | x = blk(x) 101 | 102 | x = self.norm(x) 103 | x = x.mean(dim=1).reshape(B,1,-1) 104 | 105 | return x[:, 0] 106 | 107 | def forward(self, x): 108 | x = self.forward_features(x) 109 | x = self.head(x) 110 | return x 111 | 112 | @register_model 113 | def resmlp_12(pretrained=False,dist=False, **kwargs): 114 | model = resmlp_models( 115 | patch_size=16, embed_dim=384, depth=12, 116 | Patch_layer=PatchEmbed, 117 | init_scale=0.1,**kwargs) 118 | 119 | model.default_cfg = _cfg() 120 | if pretrained: 121 | if dist: 122 | url_path = "https://dl.fbaipublicfiles.com/deit/resmlp_12_dist.pth" 123 | else: 124 | url_path = "https://dl.fbaipublicfiles.com/deit/resmlp_12_no_dist.pth" 125 | checkpoint = torch.hub.load_state_dict_from_url( 126 | url=url_path, 127 | map_location="cpu", check_hash=True 128 | ) 129 | 130 | model.load_state_dict(checkpoint) 131 | return model 132 | 133 | @register_model 134 | def resmlp_24(pretrained=False,dist=False,dino=False, **kwargs): 135 | model = resmlp_models( 136 | patch_size=16, embed_dim=384, depth=24, 137 | Patch_layer=PatchEmbed, 138 | init_scale=1e-5,**kwargs) 139 | model.default_cfg = _cfg() 140 | if pretrained: 141 | if dist: 142 | url_path = 
"https://dl.fbaipublicfiles.com/deit/resmlp_24_dist.pth" 143 | elif dino: 144 | url_path = "https://dl.fbaipublicfiles.com/deit/resmlp_24_dino.pth" 145 | else: 146 | url_path = "https://dl.fbaipublicfiles.com/deit/resmlp_24_no_dist.pth" 147 | checkpoint = torch.hub.load_state_dict_from_url( 148 | url=url_path, 149 | map_location="cpu", check_hash=True 150 | ) 151 | 152 | model.load_state_dict(checkpoint) 153 | return model 154 | 155 | @register_model 156 | def resmlp_36(pretrained=False,dist=False, **kwargs): 157 | model = resmlp_models( 158 | patch_size=16, embed_dim=384, depth=36, 159 | Patch_layer=PatchEmbed, 160 | init_scale=1e-6,**kwargs) 161 | model.default_cfg = _cfg() 162 | if pretrained: 163 | if dist: 164 | url_path = "https://dl.fbaipublicfiles.com/deit/resmlp_36_dist.pth" 165 | else: 166 | url_path = "https://dl.fbaipublicfiles.com/deit/resmlp_36_no_dist.pth" 167 | checkpoint = torch.hub.load_state_dict_from_url( 168 | url=url_path, 169 | map_location="cpu", check_hash=True 170 | ) 171 | 172 | model.load_state_dict(checkpoint) 173 | return model 174 | 175 | @register_model 176 | def resmlpB_24(pretrained=False,dist=False, in_22k = False, **kwargs): 177 | model = resmlp_models( 178 | patch_size=8, embed_dim=768, depth=24, 179 | Patch_layer=PatchEmbed, 180 | init_scale=1e-6,**kwargs) 181 | model.default_cfg = _cfg() 182 | if pretrained: 183 | if dist: 184 | url_path = "https://dl.fbaipublicfiles.com/deit/resmlpB_24_dist.pth" 185 | elif in_22k: 186 | url_path = "https://dl.fbaipublicfiles.com/deit/resmlpB_24_22k.pth" 187 | else: 188 | url_path = "https://dl.fbaipublicfiles.com/deit/resmlpB_24_no_dist.pth" 189 | 190 | checkpoint = torch.hub.load_state_dict_from_url( 191 | url=url_path, 192 | map_location="cpu", check_hash=True 193 | ) 194 | 195 | model.load_state_dict(checkpoint) 196 | 197 | return model 198 | -------------------------------------------------------------------------------- /software_model/models.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved. 
3 | import torch 4 | import torch.nn as nn 5 | from functools import partial 6 | 7 | from timm.models.vision_transformer import VisionTransformer, _cfg 8 | from timm.models.registry import register_model 9 | from timm.models.layers import trunc_normal_ 10 | 11 | 12 | __all__ = [ 13 | 'deit_tiny_patch16_224', 'deit_small_patch16_224', 'deit_base_patch16_224', 14 | 'deit_tiny_distilled_patch16_224', 'deit_small_distilled_patch16_224', 15 | 'deit_base_distilled_patch16_224', 'deit_base_patch16_384', 16 | 'deit_base_distilled_patch16_384', 17 | ] 18 | 19 | 20 | class DistilledVisionTransformer(VisionTransformer): 21 | def __init__(self, *args, **kwargs): 22 | super().__init__(*args, **kwargs) 23 | self.dist_token = nn.Parameter(torch.zeros(1, 1, self.embed_dim)) 24 | num_patches = self.patch_embed.num_patches 25 | self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 2, self.embed_dim)) 26 | self.head_dist = nn.Linear(self.embed_dim, self.num_classes) if self.num_classes > 0 else nn.Identity() 27 | 28 | trunc_normal_(self.dist_token, std=.02) 29 | trunc_normal_(self.pos_embed, std=.02) 30 | self.head_dist.apply(self._init_weights) 31 | 32 | def forward_features(self, x): 33 | # taken from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py 34 | # with slight modifications to add the dist_token 35 | B = x.shape[0] 36 | x = self.patch_embed(x) 37 | 38 | cls_tokens = self.cls_token.expand(B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks 39 | dist_token = self.dist_token.expand(B, -1, -1) 40 | x = torch.cat((cls_tokens, dist_token, x), dim=1) 41 | 42 | x = x + self.pos_embed 43 | x = self.pos_drop(x) 44 | 45 | for blk in self.blocks: 46 | x = blk(x) 47 | 48 | x = self.norm(x) 49 | return x[:, 0], x[:, 1] 50 | 51 | def forward(self, x): 52 | x, x_dist = self.forward_features(x) 53 | x = self.head(x) 54 | x_dist = self.head_dist(x_dist) 55 | if self.training: 56 | return x, x_dist 57 | else: 58 | # during inference, return the average of both classifier predictions 59 | return (x + x_dist) / 2 60 | 61 | 62 | @register_model 63 | def deit_tiny_patch16_224(pretrained=False, **kwargs): 64 | model = VisionTransformer( 65 | patch_size=16, embed_dim=192, depth=12, num_heads=3, mlp_ratio=4, qkv_bias=True, 66 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 67 | model.default_cfg = _cfg() 68 | if pretrained: 69 | checkpoint = torch.hub.load_state_dict_from_url( 70 | url="https://dl.fbaipublicfiles.com/deit/deit_tiny_patch16_224-a1311bcf.pth", 71 | map_location="cpu", check_hash=True 72 | ) 73 | model.load_state_dict(checkpoint["model"]) 74 | return model 75 | 76 | 77 | @register_model 78 | def deit_small_patch16_224(pretrained=False, **kwargs): 79 | model = VisionTransformer( 80 | patch_size=16, embed_dim=384, depth=12, num_heads=6, mlp_ratio=4, qkv_bias=True, 81 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 82 | model.default_cfg = _cfg() 83 | if pretrained: 84 | checkpoint = torch.hub.load_state_dict_from_url( 85 | url="https://dl.fbaipublicfiles.com/deit/deit_small_patch16_224-cd65a155.pth", 86 | map_location="cpu", check_hash=True 87 | ) 88 | model.load_state_dict(checkpoint["model"]) 89 | return model 90 | 91 | 92 | @register_model 93 | def deit_base_patch16_224(pretrained=False, **kwargs): 94 | model = VisionTransformer( 95 | patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, 96 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 97 | model.default_cfg = _cfg() 98 | if pretrained: 99 | 
checkpoint = torch.hub.load_state_dict_from_url( 100 | url="https://dl.fbaipublicfiles.com/deit/deit_base_patch16_224-b5f2ef4d.pth", 101 | map_location="cpu", check_hash=True 102 | ) 103 | model.load_state_dict(checkpoint["model"]) 104 | return model 105 | 106 | 107 | @register_model 108 | def deit_tiny_distilled_patch16_224(pretrained=False, **kwargs): 109 | model = DistilledVisionTransformer( 110 | patch_size=16, embed_dim=192, depth=12, num_heads=3, mlp_ratio=4, qkv_bias=True, 111 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 112 | model.default_cfg = _cfg() 113 | if pretrained: 114 | checkpoint = torch.hub.load_state_dict_from_url( 115 | url="https://dl.fbaipublicfiles.com/deit/deit_tiny_distilled_patch16_224-b40b3cf7.pth", 116 | map_location="cpu", check_hash=True 117 | ) 118 | model.load_state_dict(checkpoint["model"]) 119 | return model 120 | 121 | 122 | @register_model 123 | def deit_small_distilled_patch16_224(pretrained=False, **kwargs): 124 | model = DistilledVisionTransformer( 125 | patch_size=16, embed_dim=384, depth=12, num_heads=6, mlp_ratio=4, qkv_bias=True, 126 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 127 | model.default_cfg = _cfg() 128 | if pretrained: 129 | checkpoint = torch.hub.load_state_dict_from_url( 130 | url="https://dl.fbaipublicfiles.com/deit/deit_small_distilled_patch16_224-649709d9.pth", 131 | map_location="cpu", check_hash=True 132 | ) 133 | model.load_state_dict(checkpoint["model"]) 134 | return model 135 | 136 | 137 | @register_model 138 | def deit_base_distilled_patch16_224(pretrained=False, **kwargs): 139 | model = DistilledVisionTransformer( 140 | patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, 141 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 142 | model.default_cfg = _cfg() 143 | if pretrained: 144 | checkpoint = torch.hub.load_state_dict_from_url( 145 | url="https://dl.fbaipublicfiles.com/deit/deit_base_distilled_patch16_224-df68dfff.pth", 146 | map_location="cpu", check_hash=True 147 | ) 148 | model.load_state_dict(checkpoint["model"]) 149 | return model 150 | 151 | 152 | # tested on this model 153 | @register_model 154 | def deit_base_patch16_384(pretrained=False, **kwargs): 155 | model = VisionTransformer( 156 | img_size=384, patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, 157 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 158 | model.default_cfg = _cfg() 159 | if pretrained: 160 | checkpoint = torch.hub.load_state_dict_from_url( 161 | url="https://dl.fbaipublicfiles.com/deit/deit_base_patch16_384-8de9b5d1.pth", 162 | map_location="cpu", check_hash=True 163 | ) 164 | model.load_state_dict(checkpoint["model"]) 165 | return model 166 | 167 | 168 | @register_model 169 | def deit_base_distilled_patch16_384(pretrained=False, **kwargs): 170 | model = DistilledVisionTransformer( 171 | img_size=384, patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, 172 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 173 | model.default_cfg = _cfg() 174 | if pretrained: 175 | checkpoint = torch.hub.load_state_dict_from_url( 176 | url="https://dl.fbaipublicfiles.com/deit/deit_base_distilled_patch16_384-d0272ac0.pth", 177 | map_location="cpu", check_hash=True 178 | ) 179 | model.load_state_dict(checkpoint["model"]) 180 | return model 181 | -------------------------------------------------------------------------------- /software_model/ops/_quant_base.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu 3 | # @Date: 2023-01-03 21:20:31 4 | # @Last Modified by: Hanqing Zhu 5 | # @Last Modified time: 2023-03-30 03:50:36 6 | """ 7 | Quantized modules: the base class 8 | """ 9 | import torch 10 | import torch.nn as nn 11 | from torch.nn.parameter import Parameter 12 | 13 | import math 14 | from enum import Enum 15 | 16 | __all__ = ['Qmodes', '_Conv2dQ', '_LinearQ', '_ActQ', 17 | 'truncation', 'get_sparsity_mask', 'FunStopGradient', 'round_pass', 'grad_scale'] 18 | 19 | 20 | class Qmodes(Enum): 21 | layer_wise = 1 22 | kernel_wise = 2 23 | 24 | 25 | def grad_scale(x, scale): 26 | y = x 27 | y_grad = x * scale 28 | return y.detach() - y_grad.detach() + y_grad 29 | 30 | 31 | def get_sparsity_mask(param, sparsity): 32 | bottomk, _ = torch.topk(param.abs().view(-1), int(sparsity * param.numel()), largest=False, sorted=True) 33 | threshold = bottomk.data[-1] # This is the largest element from the group of elements that we prune away 34 | return torch.gt(torch.abs(param), threshold).type(param.type()) 35 | 36 | 37 | def round_pass(x): 38 | y = x.round() 39 | y_grad = x 40 | return y.detach() - y_grad.detach() + y_grad 41 | 42 | 43 | class FunStopGradient(torch.autograd.Function): 44 | 45 | @staticmethod 46 | def forward(ctx, weight, stopGradientMask): 47 | ctx.save_for_backward(stopGradientMask) 48 | return weight 49 | 50 | @staticmethod 51 | def backward(ctx, grad_outputs): 52 | stopGradientMask, = ctx.saved_tensors 53 | grad_inputs = grad_outputs * stopGradientMask 54 | return grad_inputs, None 55 | 56 | 57 | def log_shift(value_fp): 58 | value_shift = 2 ** (torch.log2(value_fp).ceil()) 59 | return value_shift 60 | 61 | 62 | def clamp(input, min, max, inplace=False): 63 | if inplace: 64 | input.clamp_(min, max) 65 | return input 66 | return torch.clamp(input, min, max) 67 | 68 | 69 | def get_quantized_range(num_bits, signed=True): 70 | if signed: 71 | n = 2 ** (num_bits - 1) 72 | return -n, n - 1 73 | return 0, 2 ** num_bits - 1 74 | 75 | 76 | def linear_quantize(input, scale_factor, inplace=False): 77 | if inplace: 78 | input.mul_(scale_factor).round_() 79 | return input 80 | return torch.round(scale_factor * input) 81 | 82 | 83 | def linear_quantize_clamp(input, scale_factor, clamp_min, clamp_max, inplace=False): 84 | output = linear_quantize(input, scale_factor, inplace) 85 | return clamp(output, clamp_min, clamp_max, inplace) 86 | 87 | 88 | def linear_dequantize(input, scale_factor, inplace=False): 89 | if inplace: 90 | input.div_(scale_factor) 91 | return input 92 | return input / scale_factor 93 | 94 | 95 | def truncation(fp_data, nbits=8): 96 | il = torch.log2(torch.max(fp_data.max(), fp_data.min().abs())) + 1 97 | il = math.ceil(il - 1e-5) 98 | qcode = nbits - il 99 | scale_factor = 2 ** qcode 100 | clamp_min, clamp_max = get_quantized_range(nbits, signed=True) 101 | q_data = linear_quantize_clamp(fp_data, scale_factor, clamp_min, clamp_max) 102 | q_data = linear_dequantize(q_data, scale_factor) 103 | return q_data, qcode 104 | 105 | 106 | def get_default_kwargs_q(kwargs_q, layer_type): 107 | default = { 108 | 'nbits': 4 109 | } 110 | if isinstance(layer_type, _Conv2dQ): 111 | default.update({ 112 | 'mode': Qmodes.layer_wise}) 113 | elif isinstance(layer_type, _LinearQ): 114 | pass 115 | elif isinstance(layer_type, _ActQ): 116 | pass 117 | # default.update({ 118 | # 'signed': 'Auto'}) 119 | else: 120 | assert NotImplementedError 121 | return 122 | for k, v 
in default.items(): 123 | if k not in kwargs_q: 124 | kwargs_q[k] = v 125 | return kwargs_q 126 | 127 | 128 | class _Conv2dQ(nn.Conv2d): 129 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, 130 | padding=0, dilation=1, groups=1, bias=True, **kwargs_q): 131 | super(_Conv2dQ, self).__init__(in_channels, out_channels, kernel_size, stride=stride, 132 | padding=padding, dilation=dilation, groups=groups, bias=bias) 133 | self.kwargs_q = get_default_kwargs_q(kwargs_q, layer_type=self) 134 | self.nbits = kwargs_q['nbits'] 135 | if self.nbits < 0: 136 | self.register_parameter('alpha', None) 137 | return 138 | self.q_mode = kwargs_q['mode'] 139 | if self.q_mode == Qmodes.kernel_wise: 140 | self.alpha = Parameter(torch.Tensor(out_channels)) 141 | else: # layer-wise quantization 142 | self.alpha = Parameter(torch.Tensor(1)) 143 | self.register_buffer('init_state', torch.zeros(1)) 144 | 145 | def add_param(self, param_k, param_v): 146 | self.kwargs_q[param_k] = param_v 147 | 148 | def set_bit(self, nbits): 149 | self.kwargs_q['nbits'] = nbits 150 | 151 | def extra_repr(self): 152 | s_prefix = super(_Conv2dQ, self).extra_repr() 153 | if self.alpha is None: 154 | return '{}, fake'.format(s_prefix) 155 | return '{}, {}'.format(s_prefix, self.kwargs_q) 156 | 157 | 158 | class _LinearQ(nn.Linear): 159 | def __init__(self, in_features, out_features, bias=True, **kwargs_q): 160 | super(_LinearQ, self).__init__(in_features=in_features, out_features=out_features, bias=bias) 161 | self.kwargs_q = get_default_kwargs_q(kwargs_q, layer_type=self) 162 | self.nbits = kwargs_q['nbits'] 163 | if self.nbits < 0: 164 | self.register_parameter('alpha', None) 165 | return 166 | self.q_mode = kwargs_q['mode'] 167 | self.alpha = Parameter(torch.Tensor(1)) 168 | if self.q_mode == Qmodes.kernel_wise: 169 | self.alpha = Parameter(torch.Tensor(out_features)) 170 | self.register_buffer('init_state', torch.zeros(1)) 171 | 172 | def add_param(self, param_k, param_v): 173 | self.kwargs_q[param_k] = param_v 174 | 175 | def extra_repr(self): 176 | s_prefix = super(_LinearQ, self).extra_repr() 177 | if self.alpha is None: 178 | return '{}, fake'.format(s_prefix) 179 | return '{}, {}'.format(s_prefix, self.kwargs_q) 180 | 181 | 182 | class _ActQ(nn.Module): 183 | def __init__(self, in_features, **kwargs_q): 184 | super(_ActQ, self).__init__() 185 | self.kwargs_q = get_default_kwargs_q(kwargs_q, layer_type=self) 186 | self.nbits = kwargs_q['nbits'] 187 | if self.nbits < 0: 188 | self.register_parameter('alpha', None) 189 | # self.register_parameter('zero_point', None) 190 | return 191 | # self.signed = kwargs_q['signed'] 192 | self.q_mode = kwargs_q['mode'] 193 | # print(kwargs_q) 194 | self.offset = kwargs_q['offset'] 195 | self.zero_point = None 196 | if self.q_mode == Qmodes.kernel_wise: 197 | self.alpha = Parameter(torch.Tensor(in_features)) 198 | if self.offset: 199 | self.zero_point = Parameter(torch.Tensor(in_features)) 200 | torch.nn.init.zeros_(self.zero_point) 201 | else: 202 | self.alpha = Parameter(torch.Tensor(1)) 203 | if self.offset: 204 | self.zero_point = Parameter(torch.Tensor([0])) 205 | # self.zero_point = Parameter(torch.Tensor([0])) 206 | self.register_buffer('init_state', torch.zeros(1)) 207 | self.register_buffer('signed', torch.zeros(1)) 208 | 209 | def add_param(self, param_k, param_v): 210 | self.kwargs_q[param_k] = param_v 211 | 212 | def set_bit(self, nbits): 213 | self.kwargs_q['nbits'] = nbits 214 | 215 | def extra_repr(self): 216 | # s_prefix = super(_ActQ, self).extra_repr() 217 | 
if self.alpha is None: 218 | return 'fake' 219 | return '{}'.format(self.kwargs_q) -------------------------------------------------------------------------------- /software_model/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved. 3 | """ 4 | Misc functions, including distributed helpers. 5 | 6 | Mostly copy-paste from torchvision references. 7 | """ 8 | import io 9 | import os 10 | import time 11 | from collections import defaultdict, deque 12 | import datetime 13 | 14 | import torch 15 | import torch.distributed as dist 16 | 17 | 18 | class SmoothedValue(object): 19 | """Track a series of values and provide access to smoothed values over a 20 | window or the global series average. 21 | """ 22 | 23 | def __init__(self, window_size=20, fmt=None): 24 | if fmt is None: 25 | fmt = "{median:.4f} ({global_avg:.4f})" 26 | self.deque = deque(maxlen=window_size) 27 | self.total = 0.0 28 | self.count = 0 29 | self.fmt = fmt 30 | 31 | def update(self, value, n=1): 32 | self.deque.append(value) 33 | self.count += n 34 | self.total += value * n 35 | 36 | def synchronize_between_processes(self): 37 | """ 38 | Warning: does not synchronize the deque! 39 | """ 40 | if not is_dist_avail_and_initialized(): 41 | return 42 | t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda') 43 | dist.barrier() 44 | dist.all_reduce(t) 45 | t = t.tolist() 46 | self.count = int(t[0]) 47 | self.total = t[1] 48 | 49 | @property 50 | def median(self): 51 | d = torch.tensor(list(self.deque)) 52 | return d.median().item() 53 | 54 | @property 55 | def avg(self): 56 | d = torch.tensor(list(self.deque), dtype=torch.float32) 57 | return d.mean().item() 58 | 59 | @property 60 | def global_avg(self): 61 | return self.total / self.count 62 | 63 | @property 64 | def max(self): 65 | return max(self.deque) 66 | 67 | @property 68 | def value(self): 69 | return self.deque[-1] 70 | 71 | def __str__(self): 72 | return self.fmt.format( 73 | median=self.median, 74 | avg=self.avg, 75 | global_avg=self.global_avg, 76 | max=self.max, 77 | value=self.value) 78 | 79 | 80 | class MetricLogger(object): 81 | def __init__(self, delimiter="\t"): 82 | self.meters = defaultdict(SmoothedValue) 83 | self.delimiter = delimiter 84 | 85 | def update(self, **kwargs): 86 | for k, v in kwargs.items(): 87 | if isinstance(v, torch.Tensor): 88 | v = v.item() 89 | assert isinstance(v, (float, int)) 90 | self.meters[k].update(v) 91 | 92 | def __getattr__(self, attr): 93 | if attr in self.meters: 94 | return self.meters[attr] 95 | if attr in self.__dict__: 96 | return self.__dict__[attr] 97 | raise AttributeError("'{}' object has no attribute '{}'".format( 98 | type(self).__name__, attr)) 99 | 100 | def __str__(self): 101 | loss_str = [] 102 | for name, meter in self.meters.items(): 103 | loss_str.append( 104 | "{}: {}".format(name, str(meter)) 105 | ) 106 | return self.delimiter.join(loss_str) 107 | 108 | def synchronize_between_processes(self): 109 | for meter in self.meters.values(): 110 | meter.synchronize_between_processes() 111 | 112 | def add_meter(self, name, meter): 113 | self.meters[name] = meter 114 | 115 | def log_every(self, iterable, print_freq, header=None): 116 | i = 0 117 | if not header: 118 | header = '' 119 | start_time = time.time() 120 | end = time.time() 121 | iter_time = SmoothedValue(fmt='{avg:.4f}') 122 | data_time = SmoothedValue(fmt='{avg:.4f}') 123 | space_fmt = ':' + str(len(str(len(iterable)))) + 'd' 
124 | log_msg = [ 125 | header, 126 | '[{0' + space_fmt + '}/{1}]', 127 | 'eta: {eta}', 128 | '{meters}', 129 | 'time: {time}', 130 | 'data: {data}' 131 | ] 132 | if torch.cuda.is_available(): 133 | log_msg.append('max mem: {memory:.0f}') 134 | log_msg = self.delimiter.join(log_msg) 135 | MB = 1024.0 * 1024.0 136 | for obj in iterable: 137 | data_time.update(time.time() - end) 138 | yield obj 139 | iter_time.update(time.time() - end) 140 | if i % print_freq == 0 or i == len(iterable) - 1: 141 | eta_seconds = iter_time.global_avg * (len(iterable) - i) 142 | eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) 143 | if torch.cuda.is_available(): 144 | print(log_msg.format( 145 | i, len(iterable), eta=eta_string, 146 | meters=str(self), 147 | time=str(iter_time), data=str(data_time), 148 | memory=torch.cuda.max_memory_allocated() / MB)) 149 | else: 150 | print(log_msg.format( 151 | i, len(iterable), eta=eta_string, 152 | meters=str(self), 153 | time=str(iter_time), data=str(data_time))) 154 | i += 1 155 | end = time.time() 156 | total_time = time.time() - start_time 157 | total_time_str = str(datetime.timedelta(seconds=int(total_time))) 158 | print('{} Total time: {} ({:.4f} s / it)'.format( 159 | header, total_time_str, total_time / len(iterable))) 160 | 161 | 162 | def _load_checkpoint_for_ema(model_ema, checkpoint): 163 | """ 164 | Workaround for ModelEma._load_checkpoint to accept an already-loaded object 165 | """ 166 | mem_file = io.BytesIO() 167 | torch.save({'state_dict_ema':checkpoint}, mem_file) 168 | mem_file.seek(0) 169 | model_ema._load_checkpoint(mem_file) 170 | 171 | 172 | def setup_for_distributed(is_master): 173 | """ 174 | This function disables printing when not in master process 175 | """ 176 | import builtins as __builtin__ 177 | builtin_print = __builtin__.print 178 | 179 | def print(*args, **kwargs): 180 | force = kwargs.pop('force', False) 181 | if is_master or force: 182 | builtin_print(*args, **kwargs) 183 | 184 | __builtin__.print = print 185 | 186 | 187 | def is_dist_avail_and_initialized(): 188 | if not dist.is_available(): 189 | return False 190 | if not dist.is_initialized(): 191 | return False 192 | return True 193 | 194 | 195 | def get_world_size(): 196 | if not is_dist_avail_and_initialized(): 197 | return 1 198 | return dist.get_world_size() 199 | 200 | 201 | def get_rank(): 202 | if not is_dist_avail_and_initialized(): 203 | return 0 204 | return dist.get_rank() 205 | 206 | 207 | def is_main_process(): 208 | return get_rank() == 0 209 | 210 | 211 | def save_on_master(*args, **kwargs): 212 | if is_main_process(): 213 | torch.save(*args, **kwargs) 214 | 215 | 216 | def init_distributed_mode(args): 217 | if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: 218 | args.rank = int(os.environ["RANK"]) 219 | args.world_size = int(os.environ['WORLD_SIZE']) 220 | args.gpu = int(os.environ['LOCAL_RANK']) 221 | elif 'SLURM_PROCID' in os.environ: 222 | args.rank = int(os.environ['SLURM_PROCID']) 223 | args.gpu = args.rank % torch.cuda.device_count() 224 | else: 225 | print('Not using distributed mode') 226 | args.distributed = False 227 | return 228 | 229 | args.distributed = True 230 | 231 | torch.cuda.set_device(args.gpu) 232 | args.dist_backend = 'nccl' 233 | print('| distributed init (rank {}): {}'.format( 234 | args.rank, args.dist_url), flush=True) 235 | torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, 236 | world_size=args.world_size, rank=args.rank) 237 | torch.distributed.barrier() 238 | 
setup_for_distributed(args.rank == 0) 239 | -------------------------------------------------------------------------------- /hardware_simulator/utils/cal_flops_for_transformer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu(hqzhu@utexas.edu) 3 | # @Date: 1969-12-31 18:00:00 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-09 01:18:58 6 | """Computes the flops needed for training/running transformer networks. 7 | https://github.com/google-research/electra/blob/master/flops_computation.py 8 | """ 9 | 10 | # We checked this code with TensorFlow"s FLOPs counting, although we had to 11 | # correct for this issue: https://github.com/tensorflow/tensorflow/issues/22071 12 | # Assumptions going into the FLOPs counting 13 | # - An "operation" is a mathematical operation, not a machine instruction. So 14 | # an "exp" takes one opp like and add, even though in practice an exp 15 | # might be slower. This is not too bad an assumption because 16 | # matrix-multiplies dominate the compute for most models, so minor details 17 | # about activation functions don"t matter too much. Similarly, we count 18 | # matrix-multiplies as 2*m*n flops instead of m*n, as one might if 19 | # if considering fused multiply-add ops. 20 | # - Backward pass takes the same number of FLOPs as forward pass. No exactly 21 | # right (e.g., for softmax cross entropy loss the backward pass is faster). 22 | # Importantly, it really is the same for matrix-multiplies, which is most of 23 | # the compute anyway. 24 | # - We assume "dense" embedding lookups (i.e., multiplication by a one-hot 25 | # vector). On some hardware accelerators, these dense operations are 26 | # actually faster than sparse lookups. 27 | # Please open a github issue if you spot a problem with this code! 28 | 29 | # I am not sure if the below constants are 100% right, but they are only applied 30 | # to O(hidden_size) activations, which is generally a lot less compute than the 31 | # matrix-multiplies, which are O(hidden_size^2), so they don't affect the total 32 | # number of FLOPs much. 
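# Worked example of the 2*m*n matrix-multiply convention described above:
# projecting s=128 tokens of hidden size h=768 through an h x h weight matrix
# counts as 2 * s * h * h = 2 * 128 * 768 * 768 ~= 1.51e8 FLOPs (~0.15 GFLOPs).
# The per-token block costs assembled in get_block_flops() below are likewise
# multiplied by the sequence length s.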
33 | 34 | # random number, >=, multiply activations by dropout mask, multiply activations 35 | # by correction (1 / (1 - dropout_rate)) 36 | DROPOUT_FLOPS = 4 37 | 38 | # compute mean activation (sum), computate variance of activation 39 | # (square and sum), bias (add), scale (multiply) 40 | LAYER_NORM_FLOPS = 5 41 | 42 | # GELU: 0.5 * x * (1 + tanh(sqrt(2 / np.pi) * (x + 0.044715 * pow(x, 3)))) 43 | ACTIVATION_FLOPS = 8 44 | 45 | # max/substract (for stability), exp, sum, divide 46 | SOFTMAX_FLOPS = 5 47 | 48 | __all__ = [ 49 | "get_infer_ops" 50 | ] 51 | 52 | class TransformerHparams(object): 53 | """Computes the train/inference FLOPs for transformers.""" 54 | 55 | def __init__(self, h, l, s=512, v=30522, e=None, i=None, heads=None, 56 | head_size=None, output_frac=0.15625, sparse_embed_lookup=False, 57 | decoder=False): 58 | self.h = h # hidden size 59 | self.l = l # number of layers 60 | self.s = s # sequence length 61 | self.v = v # vocab size 62 | self.e = h if e is None else e # embedding size 63 | self.i = h * 4 if i is None else i # intermediate size 64 | self.kqv = h if head_size is None else head_size * heads # attn proj sizes 65 | # attention heads 66 | self.heads = max(h // 64, 1) if heads is None else heads 67 | self.output_frac = output_frac # percent of tokens using an output softmax 68 | self.sparse_embed_lookup = sparse_embed_lookup # sparse embedding lookups 69 | self.decoder = decoder # decoder has extra attn to encoder states 70 | 71 | self.residual_flops = 0 72 | self.activation_flops = 0 73 | self.layer_norm_flops = 0 74 | self.softmax_flops = 0 75 | 76 | def get_block_flops(self): 77 | """Get the forward-pass FLOPs for a single transformer block.""" 78 | attn_mul = 2 if self.decoder else 1 79 | block_flops = dict( 80 | kqv=3 * 2 * self.h * self.kqv * attn_mul, 81 | kqv_bias=3 * self.kqv * attn_mul, 82 | attention_scores=2 * self.kqv * self.s * attn_mul, 83 | attn_softmax=SOFTMAX_FLOPS * self.s * self.heads * attn_mul, 84 | attention_dropout=DROPOUT_FLOPS * self.s * self.heads * attn_mul, 85 | attention_scale=self.s * self.heads * attn_mul, 86 | attention_weighted_avg_values=2 * self.h * self.s * attn_mul, 87 | attn_output=2 * self.h * self.h * attn_mul, 88 | attn_output_bias=self.h * attn_mul, 89 | attn_output_dropout=DROPOUT_FLOPS * self.h * attn_mul, 90 | attn_output_residual=self.h * attn_mul, 91 | attn_output_layer_norm=LAYER_NORM_FLOPS * attn_mul, 92 | intermediate=2 * self.h * self.i, 93 | intermediate_act=ACTIVATION_FLOPS * self.i, 94 | intermediate_bias=self.i, 95 | output=2 * self.h * self.i, 96 | output_bias=self.h, 97 | output_dropout=DROPOUT_FLOPS * self.h, 98 | output_residual=self.h, 99 | output_layer_norm=LAYER_NORM_FLOPS * self.h, 100 | ) 101 | 102 | self.softmax_flops += self.s * self.s * self.heads * attn_mul # tokens * tokens * head 103 | self.residual_flops += self.s * (self.h + self.h) # tokens * hidden size 104 | self.layer_norm_flops += self.s * (self.h + 1) # tokens * hidden_size 105 | self.activation_flops += self.s * self.i # GELU tokens * hidden_size * 4 106 | 107 | return sum(block_flops.values()) * self.s 108 | 109 | def get_embedding_flops(self, output=False): 110 | """Get the forward-pass FLOPs the transformer inputs or output softmax.""" 111 | embedding_flops = {} 112 | if output or (not self.sparse_embed_lookup): 113 | embedding_flops["main_multiply"] = 2 * self.e * self.v 114 | # input embedding post-processing 115 | if not output: 116 | embedding_flops.update(dict( 117 | tok_type_and_position=2 * self.e * (self.s + 2), 118 | 
add_tok_type_and_position=2 * self.e, 119 | emb_layer_norm=LAYER_NORM_FLOPS * self.e, 120 | emb_dropout=DROPOUT_FLOPS * self.e 121 | )) 122 | # projection layer if e != h 123 | if self.e != self.h or output: 124 | embedding_flops.update(dict( 125 | hidden_kernel=2 * self.h * self.e, 126 | hidden_bias=self.e if output else self.h 127 | )) 128 | # extra hidden layer and output softmax 129 | if output: 130 | embedding_flops.update(dict( 131 | hidden_activation=ACTIVATION_FLOPS * self.e, 132 | hidden_layernorm=LAYER_NORM_FLOPS * self.e, 133 | output_softmax=SOFTMAX_FLOPS * self.v, 134 | output_target_word=2 * self.v 135 | )) 136 | return self.output_frac * sum(embedding_flops.values()) * self.s 137 | return sum(embedding_flops.values()) * self.s 138 | 139 | def get_binary_classification_flops(self): 140 | classification_flops = dict( 141 | hidden=2 * self.h * self.h, 142 | hidden_bias=self.h, 143 | hidden_act=ACTIVATION_FLOPS * self.h, 144 | logits=2 * self.h 145 | ) 146 | return sum(classification_flops.values()) * self.s 147 | 148 | def get_train_flops(self, batch_size, train_steps, discriminator=False): 149 | """Get the FLOPs for pre-training the transformer.""" 150 | # 2* for forward/backward pass 151 | return 2 * batch_size * train_steps * ( 152 | (self.l * self.get_block_flops()) + 153 | self.get_embedding_flops(output=False) + 154 | (self.get_binary_classification_flops() if discriminator else 155 | self.get_embedding_flops(output=True)) 156 | ) 157 | 158 | def get_infer_flops(self): 159 | """Get the FLOPs for running inference with the transformer on a 160 | classification task.""" 161 | (self.l * self.get_block_flops()) + self.get_embedding_flops(output=False) + self.get_binary_classification_flops() 162 | 163 | 164 | def get_electra_train_flops( 165 | h_d, l_d, h_g, l_g, batch_size, train_steps, tied_embeddings, 166 | e=None, s=512, output_frac=0.15625): 167 | """Get the FLOPs needed for pre-training ELECTRA.""" 168 | if e is None: 169 | e = h_d 170 | disc = TransformerHparams( 171 | h_d, l_d, s=s, e=e, 172 | output_frac=output_frac).get_train_flops(batch_size, train_steps, True) 173 | gen = TransformerHparams( 174 | h_g, l_g, s=s, e=e if tied_embeddings else None, 175 | output_frac=output_frac).get_train_flops(batch_size, train_steps) 176 | return disc + gen 177 | 178 | def get_infer_ops( 179 | h_d, l_s, seq, heads, head_size=64 180 | ): 181 | """Get the ops needed for Transformer inference. 
Softmax, layernorm, residual add, activation""" 182 | estimator = TransformerHparams(h=h_d, l=l_s, s=seq, heads=heads, head_size=head_size) 183 | estimator.get_infer_flops() 184 | 185 | return estimator.softmax_flops, estimator.layer_norm_flops, estimator.residual_flops, estimator.activation_flops 186 | -------------------------------------------------------------------------------- /profile/benckmark_logs/deit-s_power.csv: -------------------------------------------------------------------------------- 1 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 2 | # Idx W C C % % % % % % MHz MHz 3 | 0 61 36 52 0 0 0 0 0 0 1593 210 4 | 0 61 36 52 0 0 0 0 0 0 1593 210 5 | 0 61 36 52 0 0 0 0 0 0 1593 210 6 | 0 61 36 52 0 0 0 0 0 0 1593 210 7 | 0 61 36 52 0 0 0 0 0 0 1593 210 8 | 0 61 36 52 0 0 0 0 0 0 1593 210 9 | 0 72 37 52 36 1 0 0 0 0 1593 855 10 | 0 72 37 52 40 1 0 0 0 0 1593 855 11 | 0 72 37 52 39 1 0 0 0 0 1593 855 12 | 0 72 37 53 40 1 0 0 0 0 1593 855 13 | 0 72 37 52 40 1 0 0 0 0 1593 855 14 | 0 72 37 52 39 1 0 0 0 0 1593 855 15 | 0 72 37 53 39 1 0 0 0 0 1593 855 16 | 0 73 37 53 40 1 0 0 0 0 1593 855 17 | 0 72 37 53 40 1 0 0 0 0 1593 855 18 | 0 72 37 53 39 1 0 0 0 0 1593 855 19 | 0 72 37 53 37 1 0 0 0 0 1593 840 20 | 0 72 37 53 39 1 0 0 0 0 1593 870 21 | 0 72 37 53 39 1 0 0 0 0 1593 870 22 | 0 72 37 53 39 1 0 0 0 0 1593 855 23 | 0 72 37 53 38 1 0 0 0 0 1593 840 24 | 0 72 37 54 38 1 0 0 0 0 1593 825 25 | 0 72 37 53 40 1 0 0 0 0 1593 855 26 | 0 72 37 54 40 1 0 0 0 0 1593 855 27 | 0 72 37 54 41 1 0 0 0 0 1593 855 28 | 0 72 37 53 40 1 0 0 0 0 1593 855 29 | 0 72 37 54 40 1 0 0 0 0 1593 855 30 | 0 72 37 54 39 1 0 0 0 0 1593 855 31 | 0 72 37 53 39 1 0 0 0 0 1593 840 32 | 0 72 37 54 39 1 0 0 0 0 1593 840 33 | 0 72 37 53 40 1 0 0 0 0 1593 840 34 | 0 72 37 54 39 1 0 0 0 0 1593 870 35 | 0 72 37 53 39 1 0 0 0 0 1593 870 36 | 0 73 37 54 39 1 0 0 0 0 1593 870 37 | 0 72 37 52 39 1 0 0 0 0 1593 870 38 | 0 73 37 53 39 1 0 0 0 0 1593 870 39 | 0 72 37 54 38 1 0 0 0 0 1593 855 40 | 0 72 37 53 39 1 0 0 0 0 1593 840 41 | 0 72 37 53 39 1 0 0 0 0 1593 870 42 | 0 72 37 53 39 1 0 0 0 0 1593 870 43 | 0 72 37 53 39 1 0 0 0 0 1593 870 44 | 0 72 37 53 40 1 0 0 0 0 1593 855 45 | 0 73 37 54 40 1 0 0 0 0 1593 855 46 | 0 72 37 53 40 1 0 0 0 0 1593 855 47 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 48 | # Idx W C C % % % % % % MHz MHz 49 | 0 72 37 54 39 1 0 0 0 0 1593 840 50 | 0 72 37 54 39 1 0 0 0 0 1593 840 51 | 0 72 37 54 39 1 0 0 0 0 1593 840 52 | 0 72 37 53 40 1 0 0 0 0 1593 840 53 | 0 72 37 54 39 1 0 0 0 0 1593 870 54 | 0 73 37 54 39 1 0 0 0 0 1593 870 55 | 0 73 37 54 39 1 0 0 0 0 1593 870 56 | 0 72 37 53 39 1 0 0 0 0 1593 870 57 | 0 72 37 53 39 1 0 0 0 0 1593 870 58 | 0 72 37 53 39 1 0 0 0 0 1593 855 59 | 0 72 37 53 39 1 0 0 0 0 1593 855 60 | 0 72 37 53 38 1 0 0 0 0 1593 855 61 | 0 72 37 54 39 1 0 0 0 0 1593 840 62 | 0 72 37 53 39 1 0 0 0 0 1593 840 63 | 0 72 37 54 39 1 0 0 0 0 1593 840 64 | 0 72 37 53 39 1 0 0 0 0 1593 855 65 | 0 73 37 53 40 1 0 0 0 0 1593 855 66 | 0 72 37 53 40 1 0 0 0 0 1593 855 67 | 0 73 37 53 40 1 0 0 0 0 1593 855 68 | 0 72 37 54 37 1 0 0 0 0 1593 855 69 | 0 72 37 53 39 1 0 0 0 0 1593 840 70 | 0 72 37 53 39 1 0 0 0 0 1593 840 71 | 0 72 37 53 39 1 0 0 0 0 1593 870 72 | 0 72 37 53 37 1 0 0 0 0 1593 870 73 | 0 72 37 53 39 1 0 0 0 0 1593 840 74 | 0 72 37 54 39 1 0 0 0 0 1593 840 75 | 0 72 37 53 39 1 0 0 0 0 1593 840 76 | 0 72 37 53 40 1 0 0 0 0 1593 855 77 | 0 72 37 54 40 1 0 0 0 0 1593 855 78 | 0 73 37 53 40 1 0 0 0 0 1593 855 79 | 0 72 37 53 40 1 0 0 0 0 1593 855 80 | 0 72 37 53 
38 1 0 0 0 0 1593 840 81 | 0 72 37 53 39 1 0 0 0 0 1593 840 82 | 0 72 37 53 39 1 0 0 0 0 1593 840 83 | 0 72 37 53 39 1 0 0 0 0 1593 840 84 | 0 72 37 53 40 1 0 0 0 0 1593 825 85 | 0 72 37 53 39 1 0 0 0 0 1593 825 86 | 0 72 37 53 39 1 0 0 0 0 1593 825 87 | 0 72 37 53 41 1 0 0 0 0 1593 855 88 | 0 72 37 53 40 1 0 0 0 0 1593 855 89 | 0 73 37 53 40 1 0 0 0 0 1593 855 90 | 0 72 37 53 39 1 0 0 0 0 1593 855 91 | 0 73 37 53 39 1 0 0 0 0 1593 855 92 | 0 72 37 53 38 1 0 0 0 0 1593 840 93 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 94 | # Idx W C C % % % % % % MHz MHz 95 | 0 72 37 53 38 1 0 0 0 0 1593 840 96 | 0 72 37 53 40 1 0 0 0 0 1593 825 97 | 0 72 37 54 39 1 0 0 0 0 1593 825 98 | 0 72 37 53 39 1 0 0 0 0 1593 825 99 | 0 72 37 53 40 1 0 0 0 0 1593 825 100 | 0 72 37 53 41 1 0 0 0 0 1593 855 101 | 0 72 37 53 39 1 0 0 0 0 1593 855 102 | 0 73 37 53 40 1 0 0 0 0 1593 855 103 | 0 72 37 53 40 1 0 0 0 0 1593 855 104 | 0 72 37 53 40 1 0 0 0 0 1593 855 105 | 0 72 37 53 40 1 0 0 0 0 1593 855 106 | 0 72 37 53 39 1 0 0 0 0 1593 840 107 | 0 72 37 53 39 1 0 0 0 0 1593 840 108 | 0 73 37 53 40 1 0 0 0 0 1593 855 109 | 0 73 37 53 40 1 0 0 0 0 1593 855 110 | 0 72 37 53 39 1 0 0 0 0 1593 840 111 | 0 72 37 53 39 1 0 0 0 0 1593 870 112 | 0 64 37 53 15 0 0 0 0 0 1593 870 113 | 0 64 37 54 0 0 0 0 0 0 1593 825 114 | 0 61 37 54 0 0 0 0 0 0 1593 240 115 | 0 61 37 53 0 0 0 0 0 0 1593 210 116 | -------------------------------------------------------------------------------- /profile/benckmark_logs/bert-b-128_power.csv: -------------------------------------------------------------------------------- 1 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 2 | # Idx W C C % % % % % % MHz MHz 3 | 0 62 38 55 0 0 0 0 0 0 1593 210 4 | 0 62 38 55 0 0 0 0 0 0 1593 210 5 | 0 62 38 55 0 0 0 0 0 0 1593 210 6 | 0 62 38 55 0 0 0 0 0 0 1593 210 7 | 0 62 38 55 0 0 0 0 0 0 1593 210 8 | 0 62 38 55 0 0 0 0 0 0 1593 825 9 | 0 81 39 54 40 3 0 0 0 0 1593 960 10 | 0 81 39 54 40 3 0 0 0 0 1593 975 11 | 0 81 39 55 40 3 0 0 0 0 1593 975 12 | 0 81 39 55 40 3 0 0 0 0 1593 975 13 | 0 81 39 54 40 3 0 0 0 0 1593 975 14 | 0 81 39 54 40 3 0 0 0 0 1593 975 15 | 0 80 39 55 40 3 0 0 0 0 1593 975 16 | 0 81 39 55 39 2 0 0 0 0 1593 975 17 | 0 80 39 55 39 2 0 0 0 0 1593 975 18 | 0 81 39 55 40 2 0 0 0 0 1593 960 19 | 0 81 39 55 40 3 0 0 0 0 1593 975 20 | 0 81 39 54 40 3 0 0 0 0 1593 975 21 | 0 81 39 55 40 3 0 0 0 0 1593 975 22 | 0 81 39 55 40 3 0 0 0 0 1593 975 23 | 0 81 39 55 40 3 0 0 0 0 1593 975 24 | 0 81 39 55 40 3 0 0 0 0 1593 975 25 | 0 81 39 55 40 3 0 0 0 0 1593 975 26 | 0 81 39 54 40 3 0 0 0 0 1593 975 27 | 0 81 39 54 40 3 0 0 0 0 1593 975 28 | 0 81 39 54 40 3 0 0 0 0 1593 975 29 | 0 81 39 55 40 3 0 0 0 0 1593 975 30 | 0 81 39 55 40 3 0 0 0 0 1593 975 31 | 0 81 39 55 40 3 0 0 0 0 1593 975 32 | 0 81 39 55 40 3 0 0 0 0 1593 975 33 | 0 81 39 54 40 3 0 0 0 0 1593 975 34 | 0 81 39 55 40 3 0 0 0 0 1593 975 35 | 0 81 39 55 40 3 0 0 0 0 1593 975 36 | 0 81 39 54 40 3 0 0 0 0 1593 975 37 | 0 78 39 55 27 2 0 0 0 0 1593 945 38 | 0 81 39 55 40 3 0 0 0 0 1593 975 39 | 0 81 39 55 40 3 0 0 0 0 1593 975 40 | 0 81 39 55 40 3 0 0 0 0 1593 975 41 | 0 81 39 55 40 3 0 0 0 0 1593 975 42 | 0 81 39 55 40 3 0 0 0 0 1593 975 43 | 0 81 39 55 40 3 0 0 0 0 1593 975 44 | 0 81 39 54 40 3 0 0 0 0 1593 975 45 | 0 81 39 55 40 3 0 0 0 0 1593 975 46 | 0 81 39 55 39 3 0 0 0 0 1593 975 47 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 48 | # Idx W C C % % % % % % MHz MHz 49 | 0 81 39 54 39 3 0 0 0 0 1593 975 50 | 0 81 39 55 39 3 0 0 0 0 1593 975 51 | 0 81 39 55 39 3 0 0 0 
0 1593 975 52 | 0 81 39 55 39 2 0 0 0 0 1593 975 53 | 0 81 39 55 39 2 0 0 0 0 1593 960 54 | 0 81 39 55 39 2 0 0 0 0 1593 960 55 | 0 81 39 55 40 3 0 0 0 0 1593 960 56 | 0 81 39 54 40 3 0 0 0 0 1593 975 57 | 0 81 39 55 40 3 0 0 0 0 1593 975 58 | 0 81 39 55 40 3 0 0 0 0 1593 975 59 | 0 81 39 54 40 3 0 0 0 0 1593 975 60 | 0 81 39 55 39 3 0 0 0 0 1593 975 61 | 0 81 39 55 40 3 0 0 0 0 1593 975 62 | 0 81 39 55 40 3 0 0 0 0 1593 975 63 | 0 81 39 54 39 3 0 0 0 0 1593 975 64 | 0 81 39 55 40 3 0 0 0 0 1593 975 65 | 0 81 39 55 40 3 0 0 0 0 1593 975 66 | 0 81 39 55 40 3 0 0 0 0 1593 975 67 | 0 81 39 54 40 3 0 0 0 0 1593 975 68 | 0 81 39 54 39 2 0 0 0 0 1593 975 69 | 0 71 39 54 39 3 0 0 0 0 1593 945 70 | 0 81 39 55 40 3 0 0 0 0 1593 975 71 | 0 81 39 55 39 3 0 0 0 0 1593 975 72 | 0 81 39 55 40 3 0 0 0 0 1593 975 73 | 0 81 39 55 40 3 0 0 0 0 1593 975 74 | 0 81 39 55 40 3 0 0 0 0 1593 975 75 | 0 81 39 55 40 3 0 0 0 0 1593 975 76 | 0 81 39 56 40 3 0 0 0 0 1593 975 77 | 0 81 39 55 40 3 0 0 0 0 1593 975 78 | 0 81 39 54 39 3 0 0 0 0 1593 975 79 | 0 81 39 55 40 3 0 0 0 0 1593 975 80 | 0 81 39 54 40 3 0 0 0 0 1593 975 81 | 0 81 39 55 40 3 0 0 0 0 1593 975 82 | 0 81 39 55 40 3 0 0 0 0 1593 975 83 | 0 81 39 54 40 3 0 0 0 0 1593 975 84 | 0 81 39 55 40 3 0 0 0 0 1593 975 85 | 0 81 39 54 39 3 0 0 0 0 1593 975 86 | 0 81 39 56 39 2 0 0 0 0 1593 975 87 | 0 81 39 55 40 2 0 0 0 0 1593 960 88 | 0 81 39 55 39 2 0 0 0 0 1593 960 89 | 0 81 39 55 40 3 0 0 0 0 1593 960 90 | 0 81 39 55 40 3 0 0 0 0 1593 975 91 | 0 81 39 55 40 3 0 0 0 0 1593 975 92 | 0 80 39 55 38 2 0 0 0 0 1593 975 93 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 94 | # Idx W C C % % % % % % MHz MHz 95 | 0 80 39 55 38 2 0 0 0 0 1593 960 96 | 0 80 39 56 39 2 0 0 0 0 1593 945 97 | 0 80 39 56 39 2 0 0 0 0 1593 945 98 | 0 81 39 55 40 3 0 0 0 0 1593 975 99 | 0 81 40 55 40 3 0 0 0 0 1593 975 100 | 0 81 40 56 40 3 0 0 0 0 1593 975 101 | 0 81 39 55 39 3 0 0 0 0 1593 975 102 | 0 80 39 55 40 3 0 0 0 0 1593 975 103 | 0 80 39 55 38 2 0 0 0 0 1593 975 104 | 0 80 39 56 39 2 0 0 0 0 1593 945 105 | 0 81 39 55 40 2 0 0 0 0 1593 945 106 | 0 81 40 55 40 3 0 0 0 0 1593 975 107 | 0 81 40 55 39 2 0 0 0 0 1593 975 108 | 0 81 40 55 40 3 0 0 0 0 1593 975 109 | 0 81 40 55 40 3 0 0 0 0 1593 975 110 | 0 81 40 54 40 3 0 0 0 0 1593 975 111 | 0 81 40 55 40 3 0 0 0 0 1593 975 112 | 0 65 39 55 7 0 0 0 0 0 1593 825 113 | 0 65 39 55 0 0 0 0 0 0 1593 825 114 | 0 62 39 55 0 0 0 0 0 0 1593 330 115 | 0 62 39 55 0 0 0 0 0 0 1593 210 116 | 0 62 38 55 0 0 0 0 0 0 1593 210 117 | -------------------------------------------------------------------------------- /profile/benckmark_logs/bert-b-384_power.csv: -------------------------------------------------------------------------------- 1 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 2 | # Idx W C C % % % % % % MHz MHz 3 | 0 62 38 55 0 0 0 0 0 0 1593 210 4 | 0 62 38 55 0 0 0 0 0 0 1593 210 5 | 0 62 38 55 0 0 0 0 0 0 1593 210 6 | 0 62 38 55 0 0 0 0 0 0 1593 210 7 | 0 62 38 55 0 0 0 0 0 0 1593 210 8 | 0 65 38 55 0 0 0 0 0 0 1593 825 9 | 0 113 41 55 39 3 0 0 0 0 1593 1290 10 | 0 113 41 55 38 3 0 0 0 0 1593 1290 11 | 0 111 41 56 40 3 0 0 0 0 1593 1245 12 | 0 115 41 56 40 4 0 0 0 0 1593 1290 13 | 0 115 41 56 40 4 0 0 0 0 1593 1290 14 | 0 112 41 56 40 4 0 0 0 0 1593 1275 15 | 0 113 41 56 40 3 0 0 0 0 1593 1275 16 | 0 114 41 56 40 4 0 0 0 0 1593 1290 17 | 0 116 41 55 40 4 0 0 0 0 1593 1290 18 | 0 116 41 56 40 4 0 0 0 0 1593 1290 19 | 0 116 41 55 40 4 0 0 0 0 1593 1290 20 | 0 115 41 55 40 4 0 0 0 0 1593 1290 21 | 0 116 41 56 40 4 0 0 0 0 1593 
1290 22 | 0 116 42 55 40 4 0 0 0 0 1593 1290 23 | 0 116 42 55 40 4 0 0 0 0 1593 1290 24 | 0 116 42 56 40 4 0 0 0 0 1593 1290 25 | 0 114 42 56 38 3 0 0 0 0 1593 1290 26 | 0 116 42 55 40 4 0 0 0 0 1593 1290 27 | 0 116 42 56 40 4 0 0 0 0 1593 1290 28 | 0 115 42 56 40 4 0 0 0 0 1593 1290 29 | 0 114 42 56 38 3 0 0 0 0 1593 1290 30 | 0 111 41 56 39 3 0 0 0 0 1593 1260 31 | 0 116 42 56 40 4 0 0 0 0 1593 1290 32 | 0 116 42 56 40 4 0 0 0 0 1593 1290 33 | 0 116 42 56 40 4 0 0 0 0 1593 1290 34 | 0 116 42 56 40 4 0 0 0 0 1593 1290 35 | 0 116 42 56 40 4 0 0 0 0 1593 1290 36 | 0 115 42 56 40 4 0 0 0 0 1593 1290 37 | 0 116 42 56 39 3 0 0 0 0 1593 1290 38 | 0 116 42 56 40 4 0 0 0 0 1593 1290 39 | 0 116 42 56 40 4 0 0 0 0 1593 1290 40 | 0 114 42 56 38 3 0 0 0 0 1593 1290 41 | 0 116 42 56 39 3 0 0 0 0 1593 1290 42 | 0 116 42 56 40 4 0 0 0 0 1593 1290 43 | 0 116 42 56 40 4 0 0 0 0 1593 1290 44 | 0 116 42 56 40 4 0 0 0 0 1593 1290 45 | 0 115 42 56 39 3 0 0 0 0 1593 1290 46 | 0 115 42 56 35 3 0 0 0 0 1593 1290 47 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 48 | # Idx W C C % % % % % % MHz MHz 49 | 0 113 42 56 40 4 0 0 0 0 1593 1275 50 | 0 113 42 56 40 4 0 0 0 0 1593 1275 51 | 0 113 42 56 40 3 0 0 0 0 1593 1275 52 | 0 116 42 56 39 4 0 0 0 0 1593 1290 53 | 0 116 42 56 40 4 0 0 0 0 1593 1290 54 | 0 114 42 56 38 3 0 0 0 0 1593 1290 55 | 0 114 42 56 38 3 0 0 0 0 1593 1290 56 | 0 109 42 56 40 3 0 0 0 0 1593 1245 57 | 0 111 42 56 40 3 0 0 0 0 1593 1245 58 | 0 116 42 56 40 4 0 0 0 0 1593 1290 59 | 0 116 42 56 40 4 0 0 0 0 1593 1290 60 | 0 115 42 56 40 4 0 0 0 0 1593 1290 61 | 0 116 42 56 40 4 0 0 0 0 1593 1290 62 | 0 112 42 56 38 3 0 0 0 0 1593 1275 63 | 0 111 42 56 39 3 0 0 0 0 1593 1260 64 | 0 116 42 56 40 4 0 0 0 0 1593 1290 65 | 0 116 42 56 40 4 0 0 0 0 1593 1290 66 | 0 115 42 55 40 4 0 0 0 0 1593 1290 67 | 0 116 42 55 40 4 0 0 0 0 1593 1290 68 | 0 116 42 56 40 4 0 0 0 0 1593 1290 69 | 0 116 42 55 40 4 0 0 0 0 1593 1290 70 | 0 113 42 55 40 4 0 0 0 0 1593 1290 71 | 0 116 42 55 40 4 0 0 0 0 1593 1290 72 | 0 116 42 55 40 4 0 0 0 0 1593 1290 73 | 0 116 42 55 40 4 0 0 0 0 1593 1290 74 | 0 112 42 55 39 3 0 0 0 0 1593 1275 75 | 0 111 42 56 39 3 0 0 0 0 1593 1260 76 | 0 109 42 55 39 3 0 0 0 0 1593 1245 77 | 0 109 42 55 40 3 0 0 0 0 1593 1245 78 | 0 84 41 55 39 3 0 0 0 0 1593 1245 79 | 0 116 42 55 40 4 0 0 0 0 1593 1290 80 | 0 116 42 56 40 4 0 0 0 0 1593 1290 81 | 0 116 42 56 40 4 0 0 0 0 1593 1290 82 | 0 116 42 55 40 4 0 0 0 0 1593 1290 83 | 0 116 42 56 40 4 0 0 0 0 1593 1290 84 | 0 115 42 56 40 4 0 0 0 0 1593 1290 85 | 0 116 42 56 40 4 0 0 0 0 1593 1290 86 | 0 110 42 56 39 3 0 0 0 0 1593 1260 87 | 0 116 42 56 40 4 0 0 0 0 1593 1290 88 | 0 116 42 56 40 4 0 0 0 0 1593 1290 89 | 0 116 42 56 40 4 0 0 0 0 1593 1290 90 | 0 112 42 56 39 3 0 0 0 0 1593 1275 91 | 0 111 42 56 39 3 0 0 0 0 1593 1260 92 | 0 109 42 56 40 3 0 0 0 0 1593 1245 93 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 94 | # Idx W C C % % % % % % MHz MHz 95 | 0 109 42 56 40 3 0 0 0 0 1593 1245 96 | 0 111 42 56 41 4 0 0 0 0 1593 1290 97 | 0 116 42 56 40 4 0 0 0 0 1593 1290 98 | 0 116 42 56 40 4 0 0 0 0 1593 1290 99 | 0 115 42 56 40 4 0 0 0 0 1593 1290 100 | 0 116 42 56 39 4 0 0 0 0 1593 1290 101 | 0 116 42 56 40 4 0 0 0 0 1593 1290 102 | 0 116 42 56 40 4 0 0 0 0 1593 1290 103 | 0 114 42 55 40 4 0 0 0 0 1593 1290 104 | 0 112 42 56 39 3 0 0 0 0 1593 1275 105 | 0 111 42 57 39 3 0 0 0 0 1593 1260 106 | 0 110 42 56 40 3 0 0 0 0 1593 1245 107 | 0 110 42 56 40 3 0 0 0 0 1593 1245 108 | 0 109 42 56 40 3 0 0 0 0 1593 1245 109 | 0 109 42 56 40 3 0 0 
0 0 1593 1245 110 | 0 103 42 56 40 3 0 0 0 0 1593 1230 111 | 0 116 42 56 40 4 0 0 0 0 1593 1290 112 | 0 74 40 56 16 0 0 0 0 0 1593 1290 113 | 0 63 40 56 0 0 0 0 0 0 1593 555 114 | 0 63 39 56 0 0 0 0 0 0 1593 345 115 | 0 62 39 56 0 0 0 0 0 0 1593 225 116 | 0 62 39 56 0 0 0 0 0 0 1593 210 117 | -------------------------------------------------------------------------------- /profile/benckmark_logs/deit-t_power.csv: -------------------------------------------------------------------------------- 1 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 2 | # Idx W C C % % % % % % MHz MHz 3 | 0 61 36 52 0 0 0 0 0 0 1593 210 4 | 0 61 36 52 0 0 0 0 0 0 1593 210 5 | 0 61 36 53 0 0 0 0 0 0 1593 210 6 | 0 61 36 52 0 0 0 0 0 0 1593 210 7 | 0 61 36 53 0 0 0 0 0 0 1593 210 8 | 0 61 36 52 0 0 0 0 0 0 1593 210 9 | 0 61 36 52 0 0 0 0 0 0 1593 210 10 | 0 68 37 52 32 0 0 0 0 0 1593 825 11 | 0 68 37 54 37 0 0 0 0 0 1593 825 12 | 0 68 37 52 38 0 0 0 0 0 1593 825 13 | 0 68 37 53 38 0 0 0 0 0 1593 825 14 | 0 68 37 53 37 0 0 0 0 0 1593 825 15 | 0 68 37 54 37 0 0 0 0 0 1593 825 16 | 0 68 37 54 38 0 0 0 0 0 1593 825 17 | 0 68 37 54 38 0 0 0 0 0 1593 825 18 | 0 68 37 53 38 0 0 0 0 0 1593 825 19 | 0 68 37 52 38 0 0 0 0 0 1593 825 20 | 0 68 37 52 38 0 0 0 0 0 1593 825 21 | 0 68 37 53 38 0 0 0 0 0 1593 825 22 | 0 68 37 53 37 0 0 0 0 0 1593 825 23 | 0 68 37 53 38 0 0 0 0 0 1593 825 24 | 0 68 37 54 38 0 0 0 0 0 1593 825 25 | 0 68 37 53 38 0 0 0 0 0 1593 825 26 | 0 68 37 53 38 0 0 0 0 0 1593 825 27 | 0 68 37 53 38 0 0 0 0 0 1593 825 28 | 0 68 37 53 37 0 0 0 0 0 1593 825 29 | 0 68 37 53 37 0 0 0 0 0 1593 825 30 | 0 68 37 53 37 0 0 0 0 0 1593 825 31 | 0 68 37 53 37 0 0 0 0 0 1593 825 32 | 0 68 37 53 38 0 0 0 0 0 1593 825 33 | 0 68 37 53 25 0 0 0 0 0 1593 825 34 | 0 68 37 53 38 0 0 0 0 0 1593 825 35 | 0 68 37 54 38 0 0 0 0 0 1593 825 36 | 0 68 37 53 38 0 0 0 0 0 1593 825 37 | 0 68 37 53 38 0 0 0 0 0 1593 825 38 | 0 68 37 53 38 0 0 0 0 0 1593 825 39 | 0 68 37 53 38 0 0 0 0 0 1593 825 40 | 0 68 37 53 38 0 0 0 0 0 1593 825 41 | 0 68 37 53 38 0 0 0 0 0 1593 825 42 | 0 68 37 53 37 0 0 0 0 0 1593 825 43 | 0 68 37 53 37 0 0 0 0 0 1593 825 44 | 0 68 37 53 37 0 0 0 0 0 1593 825 45 | 0 68 37 54 37 0 0 0 0 0 1593 825 46 | 0 68 37 53 37 0 0 0 0 0 1593 825 47 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 48 | # Idx W C C % % % % % % MHz MHz 49 | 0 68 37 53 37 0 0 0 0 0 1593 825 50 | 0 68 37 53 37 0 0 0 0 0 1593 825 51 | 0 68 37 53 38 0 0 0 0 0 1593 825 52 | 0 68 37 53 38 0 0 0 0 0 1593 825 53 | 0 68 37 53 37 0 0 0 0 0 1593 825 54 | 0 68 37 53 37 0 0 0 0 0 1593 825 55 | 0 68 37 53 37 0 0 0 0 0 1593 825 56 | 0 68 37 53 38 0 0 0 0 0 1593 825 57 | 0 68 37 53 37 0 0 0 0 0 1593 825 58 | 0 68 37 53 37 0 0 0 0 0 1593 825 59 | 0 68 37 53 38 0 0 0 0 0 1593 825 60 | 0 68 37 53 38 0 0 0 0 0 1593 825 61 | 0 68 37 53 37 0 0 0 0 0 1593 825 62 | 0 68 37 53 37 0 0 0 0 0 1593 825 63 | 0 68 37 53 37 0 0 0 0 0 1593 825 64 | 0 68 37 53 37 0 0 0 0 0 1593 825 65 | 0 68 37 53 22 0 0 0 0 0 1593 825 66 | 0 68 37 53 37 0 0 0 0 0 1593 825 67 | 0 68 37 53 37 0 0 0 0 0 1593 825 68 | 0 68 37 53 37 0 0 0 0 0 1593 825 69 | 0 68 37 53 37 0 0 0 0 0 1593 825 70 | 0 68 37 53 37 0 0 0 0 0 1593 825 71 | 0 68 37 53 38 0 0 0 0 0 1593 825 72 | 0 68 37 53 38 0 0 0 0 0 1593 825 73 | 0 68 37 53 38 0 0 0 0 0 1593 825 74 | 0 68 37 53 38 0 0 0 0 0 1593 825 75 | 0 68 37 53 38 0 0 0 0 0 1593 825 76 | 0 68 37 53 38 0 0 0 0 0 1593 825 77 | 0 68 37 53 37 0 0 0 0 0 1593 825 78 | 0 68 37 53 37 0 0 0 0 0 1593 825 79 | 0 68 37 53 37 0 0 0 0 0 1593 825 80 | 0 68 37 53 37 0 0 0 0 0 
1593 825 81 | 0 68 37 53 37 0 0 0 0 0 1593 825 82 | 0 68 37 53 38 0 0 0 0 0 1593 825 83 | 0 68 37 53 37 0 0 0 0 0 1593 825 84 | 0 68 37 53 37 0 0 0 0 0 1593 825 85 | 0 68 37 53 38 0 0 0 0 0 1593 825 86 | 0 68 37 53 37 0 0 0 0 0 1593 825 87 | 0 68 37 53 37 0 0 0 0 0 1593 825 88 | 0 68 37 53 37 0 0 0 0 0 1593 825 89 | 0 68 37 53 38 0 0 0 0 0 1593 825 90 | 0 68 37 53 37 0 0 0 0 0 1593 825 91 | 0 68 37 53 37 0 0 0 0 0 1593 825 92 | 0 68 37 53 37 0 0 0 0 0 1593 825 93 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 94 | # Idx W C C % % % % % % MHz MHz 95 | 0 68 37 53 37 0 0 0 0 0 1593 825 96 | 0 68 37 53 37 0 0 0 0 0 1593 825 97 | 0 68 37 53 37 0 0 0 0 0 1593 825 98 | 0 68 37 53 37 0 0 0 0 0 1593 825 99 | 0 68 37 53 37 0 0 0 0 0 1593 825 100 | 0 68 37 53 37 0 0 0 0 0 1593 825 101 | 0 68 37 53 37 0 0 0 0 0 1593 825 102 | 0 68 37 53 37 0 0 0 0 0 1593 825 103 | 0 68 37 53 37 0 0 0 0 0 1593 825 104 | 0 68 37 53 38 0 0 0 0 0 1593 825 105 | 0 68 37 53 37 0 0 0 0 0 1593 825 106 | 0 68 37 53 37 0 0 0 0 0 1593 825 107 | 0 68 37 53 37 0 0 0 0 0 1593 825 108 | 0 68 37 53 37 0 0 0 0 0 1593 825 109 | 0 68 37 53 38 0 0 0 0 0 1593 825 110 | 0 68 37 53 37 0 0 0 0 0 1593 825 111 | 0 68 37 53 37 0 0 0 0 0 1593 825 112 | 0 68 37 53 38 0 0 0 0 0 1593 825 113 | 0 64 37 53 37 0 0 0 0 0 1593 825 114 | 0 64 37 53 0 0 0 0 0 0 1593 825 115 | 0 64 37 53 0 0 0 0 0 0 1593 825 116 | 0 61 37 53 0 0 0 0 0 0 1593 240 117 | 0 61 37 53 0 0 0 0 0 0 1593 210 118 | -------------------------------------------------------------------------------- /profile/benckmark_logs/bert-l-320_power.csv: -------------------------------------------------------------------------------- 1 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 2 | # Idx W C C % % % % % % MHz MHz 3 | 0 61 36 53 0 0 0 0 0 0 1593 210 4 | 0 61 36 53 0 0 0 0 0 0 1593 210 5 | 0 61 36 52 0 0 0 0 0 0 1593 210 6 | 0 61 36 53 0 0 0 0 0 0 1593 210 7 | 0 61 36 52 0 0 0 0 0 0 1593 210 8 | 0 61 36 53 0 0 0 0 0 0 1593 210 9 | 0 64 38 54 33 0 0 0 0 0 1593 825 10 | 0 153 41 53 41 7 0 0 0 0 1593 1410 11 | 0 150 41 54 40 6 0 0 0 0 1593 1410 12 | 0 153 41 54 41 7 0 0 0 0 1593 1410 13 | 0 153 42 54 41 7 0 0 0 0 1593 1410 14 | 0 154 42 54 41 7 0 0 0 0 1593 1410 15 | 0 152 42 55 41 6 0 0 0 0 1593 1410 16 | 0 154 42 54 41 7 0 0 0 0 1593 1410 17 | 0 154 42 54 42 7 0 0 0 0 1593 1410 18 | 0 153 42 54 41 7 0 0 0 0 1593 1410 19 | 0 154 42 55 42 7 0 0 0 0 1593 1410 20 | 0 151 42 55 41 7 0 0 0 0 1593 1410 21 | 0 154 42 55 42 7 0 0 0 0 1593 1410 22 | 0 155 42 55 42 7 0 0 0 0 1593 1410 23 | 0 153 42 55 42 7 0 0 0 0 1593 1410 24 | 0 153 42 55 42 7 0 0 0 0 1593 1410 25 | 0 154 42 55 42 7 0 0 0 0 1593 1410 26 | 0 154 42 55 41 7 0 0 0 0 1593 1410 27 | 0 153 42 55 42 7 0 0 0 0 1593 1410 28 | 0 154 42 55 42 7 0 0 0 0 1593 1410 29 | 0 154 42 56 42 7 0 0 0 0 1593 1410 30 | 0 153 42 55 41 7 0 0 0 0 1593 1410 31 | 0 153 42 55 42 7 0 0 0 0 1593 1410 32 | 0 152 42 55 42 7 0 0 0 0 1593 1410 33 | 0 155 42 56 42 7 0 0 0 0 1593 1410 34 | 0 152 42 55 41 7 0 0 0 0 1593 1410 35 | 0 153 42 56 41 7 0 0 0 0 1593 1410 36 | 0 153 42 56 41 7 0 0 0 0 1593 1410 37 | 0 154 42 55 41 7 0 0 0 0 1593 1410 38 | 0 153 42 56 41 7 0 0 0 0 1593 1410 39 | 0 153 42 56 41 7 0 0 0 0 1593 1410 40 | 0 154 42 56 41 6 0 0 0 0 1593 1410 41 | 0 153 43 55 41 7 0 0 0 0 1593 1410 42 | 0 153 42 56 41 6 0 0 0 0 1593 1410 43 | 0 154 43 55 42 7 0 0 0 0 1593 1410 44 | 0 154 43 55 41 7 0 0 0 0 1593 1410 45 | 0 154 43 56 42 7 0 0 0 0 1593 1410 46 | 0 154 43 56 41 7 0 0 0 0 1593 1410 47 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 48 
| # Idx W C C % % % % % % MHz MHz 49 | 0 155 43 55 41 7 0 0 0 0 1593 1410 50 | 0 154 43 56 42 7 0 0 0 0 1593 1410 51 | 0 155 43 55 41 7 0 0 0 0 1593 1410 52 | 0 154 43 56 41 7 0 0 0 0 1593 1410 53 | 0 154 43 56 42 7 0 0 0 0 1593 1410 54 | 0 154 43 56 41 7 0 0 0 0 1593 1410 55 | 0 154 43 56 42 7 0 0 0 0 1593 1410 56 | 0 154 43 56 42 7 0 0 0 0 1593 1410 57 | 0 155 43 55 42 7 0 0 0 0 1593 1410 58 | 0 154 43 55 41 7 0 0 0 0 1593 1410 59 | 0 153 43 56 42 7 0 0 0 0 1593 1410 60 | 0 154 43 55 41 7 0 0 0 0 1593 1410 61 | 0 154 43 55 41 7 0 0 0 0 1593 1410 62 | 0 154 43 56 42 7 0 0 0 0 1593 1410 63 | 0 154 43 55 42 7 0 0 0 0 1593 1410 64 | 0 155 43 56 42 7 0 0 0 0 1593 1410 65 | 0 154 43 56 42 7 0 0 0 0 1593 1410 66 | 0 154 43 56 42 7 0 0 0 0 1593 1410 67 | 0 154 43 55 42 7 0 0 0 0 1593 1410 68 | 0 154 43 56 41 7 0 0 0 0 1593 1410 69 | 0 154 43 56 42 7 0 0 0 0 1593 1410 70 | 0 155 43 55 42 7 0 0 0 0 1593 1410 71 | 0 155 43 56 41 7 0 0 0 0 1593 1410 72 | 0 154 43 55 42 7 0 0 0 0 1593 1410 73 | 0 153 43 56 41 7 0 0 0 0 1593 1410 74 | 0 153 43 56 41 6 0 0 0 0 1593 1410 75 | 0 153 43 56 41 6 0 0 0 0 1593 1410 76 | 0 153 43 56 41 6 0 0 0 0 1593 1410 77 | 0 155 43 56 41 7 0 0 0 0 1593 1410 78 | 0 155 43 56 42 7 0 0 0 0 1593 1410 79 | 0 154 43 55 41 7 0 0 0 0 1593 1410 80 | 0 155 43 56 42 7 0 0 0 0 1593 1410 81 | 0 154 43 56 42 7 0 0 0 0 1593 1410 82 | 0 155 43 56 41 7 0 0 0 0 1593 1410 83 | 0 154 43 55 42 7 0 0 0 0 1593 1410 84 | 0 155 43 55 42 7 0 0 0 0 1593 1410 85 | 0 154 43 56 42 7 0 0 0 0 1593 1410 86 | 0 155 43 55 42 7 0 0 0 0 1593 1410 87 | 0 154 43 56 41 7 0 0 0 0 1593 1410 88 | 0 155 43 55 41 6 0 0 0 0 1593 1410 89 | 0 155 43 56 42 7 0 0 0 0 1593 1410 90 | 0 155 43 55 42 7 0 0 0 0 1593 1410 91 | 0 153 43 56 41 7 0 0 0 0 1593 1410 92 | 0 155 43 56 41 7 0 0 0 0 1593 1410 93 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 94 | # Idx W C C % % % % % % MHz MHz 95 | 0 155 43 56 42 7 0 0 0 0 1593 1410 96 | 0 155 43 56 42 7 0 0 0 0 1593 1410 97 | 0 154 43 56 42 7 0 0 0 0 1593 1410 98 | 0 155 43 56 42 7 0 0 0 0 1593 1410 99 | 0 155 43 56 42 7 0 0 0 0 1593 1410 100 | 0 155 43 56 42 7 0 0 0 0 1593 1410 101 | 0 154 43 57 41 7 0 0 0 0 1593 1410 102 | 0 155 43 56 41 7 0 0 0 0 1593 1410 103 | 0 155 43 57 41 7 0 0 0 0 1593 1410 104 | 0 155 43 56 41 7 0 0 0 0 1593 1410 105 | 0 154 43 56 41 7 0 0 0 0 1593 1410 106 | 0 155 43 56 41 7 0 0 0 0 1593 1410 107 | 0 154 43 56 41 7 0 0 0 0 1593 1410 108 | 0 153 43 56 41 7 0 0 0 0 1593 1410 109 | 0 153 43 56 41 6 0 0 0 0 1593 1410 110 | 0 153 43 55 41 6 0 0 0 0 1593 1410 111 | 0 154 43 56 41 6 0 0 0 0 1593 1410 112 | 0 153 44 57 41 6 0 0 0 0 1593 1410 113 | 0 85 42 56 41 7 0 0 0 0 1593 1410 114 | 0 82 40 55 0 0 0 0 0 0 1593 1410 115 | 0 64 39 55 0 0 0 0 0 0 1593 825 116 | 0 62 38 55 0 0 0 0 0 0 1593 345 117 | 0 62 38 55 0 0 0 0 0 0 1593 225 118 | 0 61 38 55 0 0 0 0 0 0 1593 210 119 | 0 62 38 55 0 0 0 0 0 0 1593 210 120 | 0 62 38 55 0 0 0 0 0 0 1593 210 121 | 0 62 38 55 0 0 0 0 0 0 1593 210 122 | -------------------------------------------------------------------------------- /profile/benckmark_logs/deit-b-power.csv: -------------------------------------------------------------------------------- 1 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 2 | # Idx W C C % % % % % % MHz MHz 3 | 0 61 37 53 0 0 0 0 0 0 1593 210 4 | 0 61 37 53 0 0 0 0 0 0 1593 210 5 | 0 61 37 53 0 0 0 0 0 0 1593 210 6 | 0 61 37 54 0 0 0 0 0 0 1593 210 7 | 0 61 37 54 0 0 0 0 0 0 1593 210 8 | 0 61 37 54 0 0 0 0 0 0 1593 210 9 | 0 61 37 53 0 0 0 0 0 0 1593 825 10 | 0 86 38 54 41 3 0 0 0 0 
1593 1080 11 | 0 86 38 54 39 3 0 0 0 0 1593 1080 12 | 0 87 38 53 40 3 0 0 0 0 1593 1065 13 | 0 87 38 53 40 3 0 0 0 0 1593 1065 14 | 0 86 38 53 40 3 0 0 0 0 1593 1065 15 | 0 86 38 53 39 3 0 0 0 0 1593 1065 16 | 0 87 38 53 39 3 0 0 0 0 1593 1065 17 | 0 86 38 53 40 3 0 0 0 0 1593 1065 18 | 0 86 38 53 39 3 0 0 0 0 1593 1065 19 | 0 87 38 53 40 3 0 0 0 0 1593 1065 20 | 0 87 38 53 39 3 0 0 0 0 1593 1065 21 | 0 86 38 53 40 3 0 0 0 0 1593 1065 22 | 0 87 38 53 40 3 0 0 0 0 1593 1065 23 | 0 86 38 53 40 3 0 0 0 0 1593 1065 24 | 0 87 38 53 39 3 0 0 0 0 1593 1065 25 | 0 86 38 53 40 3 0 0 0 0 1593 1050 26 | 0 86 38 53 40 3 0 0 0 0 1593 1065 27 | 0 86 38 53 39 3 0 0 0 0 1593 1065 28 | 0 87 38 53 39 3 0 0 0 0 1593 1065 29 | 0 86 38 53 39 3 0 0 0 0 1593 1065 30 | 0 86 38 53 40 3 0 0 0 0 1593 1065 31 | 0 86 38 54 39 3 0 0 0 0 1593 1050 32 | 0 87 38 53 40 3 0 0 0 0 1593 1065 33 | 0 86 38 53 40 3 0 0 0 0 1593 1065 34 | 0 87 38 54 40 3 0 0 0 0 1593 1065 35 | 0 86 38 53 39 3 0 0 0 0 1593 1065 36 | 0 86 38 54 40 3 0 0 0 0 1593 1065 37 | 0 87 38 54 40 3 0 0 0 0 1593 1065 38 | 0 87 38 53 40 3 0 0 0 0 1593 1065 39 | 0 87 38 53 39 3 0 0 0 0 1593 1065 40 | 0 86 38 53 39 3 0 0 0 0 1593 1065 41 | 0 87 38 53 40 3 0 0 0 0 1593 1065 42 | 0 87 38 54 40 3 0 0 0 0 1593 1065 43 | 0 87 38 54 39 3 0 0 0 0 1593 1065 44 | 0 87 38 53 40 3 0 0 0 0 1593 1065 45 | 0 87 38 53 39 3 0 0 0 0 1593 1065 46 | 0 87 38 53 40 3 0 0 0 0 1593 1065 47 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 48 | # Idx W C C % % % % % % MHz MHz 49 | 0 84 38 53 39 3 0 0 0 0 1593 1065 50 | 0 87 38 53 40 3 0 0 0 0 1593 1065 51 | 0 86 38 54 39 3 0 0 0 0 1593 1065 52 | 0 86 38 53 39 3 0 0 0 0 1593 1065 53 | 0 87 38 54 40 3 0 0 0 0 1593 1065 54 | 0 86 38 54 39 3 0 0 0 0 1593 1065 55 | 0 87 38 54 39 3 0 0 0 0 1593 1065 56 | 0 87 38 53 40 3 0 0 0 0 1593 1065 57 | 0 87 38 54 40 3 0 0 0 0 1593 1065 58 | 0 87 38 54 39 3 0 0 0 0 1593 1065 59 | 0 86 38 54 39 3 0 0 0 0 1593 1050 60 | 0 86 38 54 39 3 0 0 0 0 1593 1065 61 | 0 86 38 54 39 3 0 0 0 0 1593 1065 62 | 0 86 38 53 40 3 0 0 0 0 1593 1050 63 | 0 86 38 54 40 3 0 0 0 0 1593 1050 64 | 0 86 38 54 39 3 0 0 0 0 1593 1065 65 | 0 87 38 54 39 3 0 0 0 0 1593 1065 66 | 0 86 38 54 41 3 0 0 0 0 1593 1050 67 | 0 87 38 54 39 3 0 0 0 0 1593 1065 68 | 0 87 38 53 39 3 0 0 0 0 1593 1065 69 | 0 87 38 53 39 3 0 0 0 0 1593 1065 70 | 0 86 38 54 39 3 0 0 0 0 1593 1065 71 | 0 86 38 54 39 3 0 0 0 0 1593 1065 72 | 0 87 38 53 39 3 0 0 0 0 1593 1065 73 | 0 87 38 54 39 3 0 0 0 0 1593 1065 74 | 0 87 38 53 39 3 0 0 0 0 1593 1065 75 | 0 87 38 54 39 3 0 0 0 0 1593 1065 76 | 0 87 38 54 39 3 0 0 0 0 1593 1065 77 | 0 87 38 54 39 3 0 0 0 0 1593 1065 78 | 0 87 38 53 39 3 0 0 0 0 1593 1065 79 | 0 87 38 54 39 3 0 0 0 0 1593 1065 80 | 0 87 38 54 39 3 0 0 0 0 1593 1065 81 | 0 87 38 54 39 3 0 0 0 0 1593 1065 82 | 0 86 38 54 39 3 0 0 0 0 1593 1065 83 | 0 87 38 54 39 3 0 0 0 0 1593 1065 84 | 0 87 38 54 39 3 0 0 0 0 1593 1065 85 | 0 87 38 54 40 3 0 0 0 0 1593 1065 86 | 0 87 38 53 39 3 0 0 0 0 1593 1065 87 | 0 87 38 54 39 3 0 0 0 0 1593 1065 88 | 0 87 38 53 39 3 0 0 0 0 1593 1065 89 | 0 86 38 54 40 3 0 0 0 0 1593 1065 90 | 0 87 38 54 39 3 0 0 0 0 1593 1065 91 | 0 86 38 55 40 3 0 0 0 0 1593 1050 92 | 0 87 38 53 40 3 0 0 0 0 1593 1050 93 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 94 | # Idx W C C % % % % % % MHz MHz 95 | 0 86 38 54 40 3 0 0 0 0 1593 1050 96 | 0 86 38 54 40 3 0 0 0 0 1593 1050 97 | 0 86 38 54 39 3 0 0 0 0 1593 1050 98 | 0 86 38 54 41 3 0 0 0 0 1593 1050 99 | 0 87 38 54 39 3 0 0 0 0 1593 1065 100 | 0 86 38 54 39 3 0 0 0 0 1593 
1065 101 | 0 87 38 54 39 3 0 0 0 0 1593 1065 102 | 0 87 38 55 39 3 0 0 0 0 1593 1065 103 | 0 86 38 54 39 3 0 0 0 0 1593 1065 104 | 0 87 38 55 39 3 0 0 0 0 1593 1065 105 | 0 87 38 55 39 3 0 0 0 0 1593 1065 106 | 0 87 38 54 40 3 0 0 0 0 1593 1065 107 | 0 87 38 54 39 3 0 0 0 0 1593 1065 108 | 0 86 38 54 39 3 0 0 0 0 1593 1065 109 | 0 87 38 54 40 3 0 0 0 0 1593 1065 110 | 0 87 38 54 39 3 0 0 0 0 1593 1065 111 | 0 87 38 54 39 3 0 0 0 0 1593 1065 112 | 0 87 38 55 39 3 0 0 0 0 1593 1065 113 | 0 64 37 53 23 0 0 0 0 0 1593 840 114 | 0 64 37 53 0 0 0 0 0 0 1593 825 115 | 0 62 37 53 0 0 0 0 0 0 1593 375 116 | 0 61 37 53 0 0 0 0 0 0 1593 240 117 | 0 61 37 53 0 0 0 0 0 0 1593 210 118 | 0 61 37 53 0 0 0 0 0 0 1593 210 119 | 0 61 37 53 0 0 0 0 0 0 1593 210 120 | 0 61 37 53 0 0 0 0 0 0 1593 210 121 | 0 61 37 53 0 0 0 0 0 0 1593 210 122 | 0 61 37 53 0 0 0 0 0 0 1593 210 123 | 0 61 37 53 0 0 0 0 0 0 1593 210 124 | 0 61 37 53 0 0 0 0 0 0 1593 210 125 | 0 61 37 53 0 0 0 0 0 0 1593 210 126 | 0 61 37 53 0 0 0 0 0 0 1593 210 127 | 0 61 37 53 0 0 0 0 0 0 1593 210 128 | 0 61 37 53 0 0 0 0 0 0 1593 210 129 | 0 61 37 53 0 0 0 0 0 0 1593 210 130 | 0 61 37 53 0 0 0 0 0 0 1593 210 131 | 0 61 37 53 0 0 0 0 0 0 1593 210 132 | 0 61 37 53 0 0 0 0 0 0 1593 210 133 | -------------------------------------------------------------------------------- /software_model/readme.md: -------------------------------------------------------------------------------- 1 | # Train and Inference of DeiT on our lightining-transformer 2 | 3 | --- 4 | 5 | For deit, we built on the official implementation (https://github.com/facebookresearch/deit). 6 | 7 | To model the inference accuracy on our photonic accelerator, we explictly inject the analytic transformation of our photonic tensor core during computation. We consider several nonidealties and inject them during inference, including **input encoding magnitude varation**, **input encoding phase variaion**, **output computation variation**, and **WDM dispersion introduced by multiple wavelength**. 8 | 9 | Please ensure that you have install the required dependencies following instructions in `../readme.md`, before you run jobs. 10 | 11 | --- 12 | 13 | ## Structures 14 | Our code is built upon the offical [DeiT](https://github.com/facebookresearch/deit). 15 | * `./models/quant_vit.py`. The ViT model definition with quantization and analytic transformation of our PTC computation considering different noise resources. 16 | * `./ops/`. Useful utils functions, including the implemented learned-step-size quantization [LSQ](https://github.com/hustzxd/LSQuantization) for transformer quantization. 17 | * `main.py`. The main python file. 18 | * `/scripts/`. This folder contains the scripts for implementing noise-aware training of low-bit DeiT models and testing inference accuracy. 19 | 20 | 21 | 22 | ## Data preparation 23 | 24 | ### Dataset 25 | Download and extract ImageNet train and val images from http://image-net.org/. 26 | The directory structure is the standard layout for the torchvision [`datasets.ImageFolder`](https://pytorch.org/docs/stable/torchvision/datasets.html#imagefolder), and the training and validation data is expected to be in the `train/` folder and `val` folder respectively: 27 | 28 | ``` 29 | /path/to/imagenet/ 30 | train/ 31 | class1/ 32 | img1.jpeg 33 | class2/ 34 | img2.jpeg 35 | val/ 36 | class1/ 37 | img3.jpeg 38 | class2/ 39 | img4.jpeg 40 | ``` 41 | ### Pretrained checkpoints 42 | Download baseline DeiT models pretrained on ImageNet 2012 and put in the `pretrained` directory. 
### Pretrained checkpoints
Download the baseline DeiT models pretrained on ImageNet 2012 and put them in the `pretrained/` directory.

| name | acc@1 | acc@5 | #params | url |
| --- | --- | --- | --- | --- |
| DeiT-tiny | 72.2 | 91.1 | 5M | [model](https://dl.fbaipublicfiles.com/deit/deit_tiny_patch16_224-a1311bcf.pth) |
| DeiT-small | 79.9 | 95.0 | 22M | [model](https://dl.fbaipublicfiles.com/deit/deit_small_patch16_224-cd65a155.pth) |
| DeiT-base | 81.8 | 95.6 | 86M | [model](https://dl.fbaipublicfiles.com/deit/deit_base_patch16_224-b5f2ef4d.pth) |

```
mkdir pretrained
curl -o ./pretrained/deit_tiny_patch16_224-a1311bcf.pth https://dl.fbaipublicfiles.com/deit/deit_tiny_patch16_224-a1311bcf.pth
curl -o ./pretrained/deit_small_patch16_224-cd65a155.pth https://dl.fbaipublicfiles.com/deit/deit_small_patch16_224-cd65a155.pth
curl -o ./pretrained/deit_base_patch16_224-b5f2ef4d.pth https://dl.fbaipublicfiles.com/deit/deit_base_patch16_224-b5f2ef4d.pth
```

### Provided checkpoint for 4-bit DeiT-T
Training DeiT may take days given the quantization and the dedicated photonic-tensor-core computation model.

We provide our 4-bit DeiT-T checkpoint so you can quickly perform evaluation and reproduce our results.
The model is available at this [Google Drive link](https://drive.google.com/uc?id=1EZjEnkqyBaBU8pUrYqNLTYMq4Mn0cbKV).

```
mkdir resumed_ckpt
gdown https://drive.google.com/uc?id=1EZjEnkqyBaBU8pUrYqNLTYMq4Mn0cbKV -O resumed_ckpt/
```

## How to use

### Noise-aware training with a pretrained checkpoint

Train a quantized DeiT model using `./scripts/train_quant_transformer_with_noise.sh`, setting the bit precision, input noise std, output noise std, and other training settings.

Replace the path in `--data-path /path/to/imagenet/data` with the path where you put ImageNet.

`--finetune pretrained/deit_tiny_patch16_224-a1311bcf.pth` should point to the downloaded pretrained model.

```
wbits=4
abits=4
id=4bit
lr=5e-4
weight_decay=1e-8
batch_size=512
epochs=300
port=47771
headwise=1
input_noise_std=0.03
output_noise_std=0.05

torchrun \
    --master_port ${port} \
    --nproc_per_node=4 main.py \
    --model deit_tiny_patch16_224_quant \
    --drop-path 0 \
    --batch-size ${batch_size} \
    --lr ${lr} \
    --min-lr 0 \
    --epochs ${epochs} \
    --warmup-epochs 0 \
    --weight-decay ${weight_decay} \
    --wbits ${wbits} \
    --abits ${abits} \
    --dist-eval \
    --output_dir test/deit_tiny_${id}/${wbits}w${abits}a_bs${batch_size}_baselr${lr}_weightdecay${weight_decay}_ft${epochs}_headwise${headwise}_noise_i_${input_noise_std}_o_${output_noise_std}_linear_noise \
    --finetune pretrained/deit_tiny_patch16_224-a1311bcf.pth \
    --data-path /path/to/imagenet/data \
    --headwise \
    --input_noise_std ${input_noise_std} \
    --output_noise_std ${output_noise_std} \
    --enable_linear_noise
```

### Evaluation of a trained model with noise injection

Test the inference accuracy of a trained DeiT model using `./scripts/evaluate_quant_transformer.sh`, setting the corresponding noise levels.

* input_noise_std: noise std of the input magnitude encoding. Default is 0.03.
* phase_noise_std: noise std of the input phase encoding. Default is $2^{\circ}$.
* output_noise_std: noise std of the computed outputs. Default is 0.05.
* num_wavelength: number of wavelengths used in the system; the wavelength-induced dispersion error is computed from it. Default is 12.
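The exact analytic PTC transformation is implemented in `./models/quant_vit.py`. As a rough mental model only (this sketch is ours, not the repo's code), the first three knobs can be viewed as Gaussian perturbations around an ideal linear operation, while num_wavelength and channel_spacing determine a deterministic dispersion error that is omitted here:

```
import math
import torch
import torch.nn.functional as F

def noisy_linear(x, weight, input_noise_std=0.03, phase_noise_std_deg=2.0, output_noise_std=0.05):
    # Illustrative sketch only -- NOT the implementation in quant_vit.py.
    # Input magnitude encoding noise: relative Gaussian error on the encoded operands.
    x = x * (1.0 + torch.randn_like(x) * input_noise_std)
    # Input phase encoding noise: a small random phase error attenuates the encoded value.
    phase_err = torch.randn_like(x) * (phase_noise_std_deg * math.pi / 180.0)
    x = x * torch.cos(phase_err)
    # Ideal computation y = x W^T, as a standard linear layer would produce.
    y = F.linear(x, weight)
    # Output (readout) noise: relative Gaussian error on the computed result.
    # WDM dispersion (num_wavelength, channel_spacing) would add a further
    # wavelength-dependent error term, which is omitted in this sketch.
    return y * (1.0 + torch.randn_like(y) * output_noise_std)
```

Because these perturbations are random, the AE scripts later in this readme repeat each measurement with several seeds and report the mean and std.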
Set `resumed_ckpt_path='./your/path/to/best_checkpoint.pth'` in the script.

```
exp='eval_accuracy'
wbits=4
abits=4
id=4bit
headwise=1

# noise settings
input_noise_std=0.03
output_noise_std=0.05
# the following settings are added for inference only
phase_noise_std=2
num_wavelength=12
channel_spacing=0.4
seed=0

resumed_ckpt_path='./your/path/to/best_checkpoint.pth'

for i in {1..1}
do
for input_noise_std in 0.03
do
CUDA_VISIBLE_DEVICES=0 python main.py --eval \
    --resume ${resumed_ckpt_path} \
    --model deit_tiny_patch16_224_quant \
    --drop-path 0 \
    --wbits ${wbits} \
    --abits ${abits} \
    --data-path /path/to/imagenet/data \
    --headwise \
    --input_noise_std ${input_noise_std} \
    --output_noise_std ${output_noise_std} \
    --phase_noise_std ${phase_noise_std} \
    --num_wavelength ${num_wavelength} \
    --channel_spacing ${channel_spacing} \
    --seed $((seed + i)) \
    --enable_wdm_noise \
    --enable_linear_noise
done
done
```
It will produce output like the following:
```
Test: Total time: 0:29:16 (3.3723 s / it)
* Acc@1 71.052 Acc@5 90.432 loss 1.287
Accuracy of the network on the 50000 test images: 71.1%
```

---

## AE experiments: Reproduce the reported results of the accuracy and robustness analysis

We test the robustness of models running on our photonic accelerator by sweeping various on-chip noise sources.
* input_noise_std: noise std of the input magnitude encoding. Default is 0.03.
* phase_noise_std: noise std of the input phase encoding. Default is $2^{\circ}$.
* output_noise_std: noise std of the computed outputs. Default is 0.05.
* num_wavelength: number of wavelengths used in the system; the wavelength-induced dispersion error is computed from it. Default is 12.

### Download our checkpoint

One trained 4-bit DeiT-T model is provided for quickly reproducing the results.

Download it as follows:
```
mkdir resumed_ckpt
gdown https://drive.google.com/uc?id=1EZjEnkqyBaBU8pUrYqNLTYMq4Mn0cbKV -O resumed_ckpt/
```
The checkpoint will be placed in `./resumed_ckpt/`.

### Launch jobs with noise sweeping

Run `./scripts/evaluate_quant_transformer_scan_noise.sh` to measure the accuracy at varying noise levels.
Each noise setting is evaluated three times with different seeds.

By uncommenting the corresponding line in the script, you can reproduce the experiments for sweeping the input noise std, the phase noise std, and the number of wavelengths.
```
for input_noise_std in 0.03 0.04 0.05 0.06 0.07 0.08 ## uncomment this line when scanning input noise
# for phase_noise_std in 2 3 4 5 6 7 ## uncomment this line when scanning phase noise
# for num_wavelength in 8 12 16 20 24 ## uncomment this line when scanning # wavelength
```

Redirect the output of `./scripts/evaluate_quant_transformer_scan_noise.sh` to a log file, then use the provided script `./scripts/process_output_logs.sh` to process the logs.
You will get the parsed accuracy as well as the mean and std in a CSV file.

```
./scripts/evaluate_quant_transformer_scan_noise.sh &> results.log # redirect results to a log file
./scripts/process_output_logs.sh # set the log file path, the number of iterations, and the number of noise values you sweep in the script
```
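If you want to double-check the aggregation performed by `process_output_logs.sh`, the same statistics can be recomputed directly from the redirected log with a short Python snippet (a sketch that assumes one `* Acc@1 ...` line per evaluation, repetitions looped outermost as in the scan script, and a configurable number of noise values; the shell script remains the reference):

```
import csv
import re
import statistics

log_text = open("results.log").read()      # the redirected log from the scan script
num_noise_levels = 6                       # e.g. input_noise_std in 0.03 ... 0.08

# Collect the Acc@1 values in the order they appear in the log.
accs = [float(m.group(1)) for m in re.finditer(r"\* Acc@1 (\d+\.\d+)", log_text)]
num_runs = len(accs) // num_noise_levels   # number of repetitions (3 by default)

with open("parsed_accuracy.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow([f"test{r + 1}" for r in range(num_runs)] + ["mean", "std"])
    for level in range(num_noise_levels):
        runs = [accs[r * num_noise_levels + level] for r in range(num_runs)]
        writer.writerow(runs + [statistics.mean(runs), statistics.stdev(runs)])
```

Either way, each output row corresponds to one noise value, in the format shown next.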
The expected results will look like the following:

```
test1,test2,test3,mean,std
71.174,71.014,70.99,71.05933333333333,0.10002666311206546
71.052,71.1,70.972,71.04133333333333,0.06466323014923916
71.034,70.924,70.924,70.96066666666667,0.06350852961085851
70.99,70.952,71.144,71.02866666666667,0.10167267741794891
71.206,70.82,71.184,71.07,0.21678560837842034
```
The first three columns are the accuracies of three runs with different seeds, followed by two columns with the mean and std of those three values.

Each row corresponds to a different noise value, swept as configured in `./scripts/evaluate_quant_transformer_scan_noise.sh`.

### Script: `./scripts/evaluate_quant_transformer_scan_noise.sh`
```
exp='eval_accuracy_scan_noise'
wbits=4
abits=4
id=4bit
headwise=1

# noise settings
input_noise_std=0.03
output_noise_std=0.05
# the following settings are added for inference only
phase_noise_std=2
num_wavelength=12
channel_spacing=0.4
seed=0

resumed_ckpt_path='./resumed_ckpt/best_checkpoint.pth'

for i in {1..3}
do
for input_noise_std in 0.03 0.04 0.05 0.06 0.07 0.08 ## uncomment this line when scanning input noise
# for phase_noise_std in 2 3 4 5 6 7 ## uncomment this line when scanning phase noise
# for num_wavelength in 8 12 16 20 24 ## uncomment this line when scanning # wavelength
do
CUDA_VISIBLE_DEVICES=0 python main.py --eval \
    --resume ${resumed_ckpt_path} \
    --model deit_tiny_patch16_224_quant \
    --drop-path 0 \
    --wbits ${wbits} \
    --abits ${abits} \
    --data-path /path/to/imagenet/data \
    --headwise \
    --input_noise_std ${input_noise_std} \
    --output_noise_std ${output_noise_std} \
    --phase_noise_std ${phase_noise_std} \
    --num_wavelength ${num_wavelength} \
    --channel_spacing ${channel_spacing} \
    --seed $((seed + i)) \
    --enable_wdm_noise \
    --enable_linear_noise
done
done
```
--------------------------------------------------------------------------------