├── profile ├── power_results │ ├── bert-l-320.csv │ └── power_usage.csv ├── power_monitor.sh ├── customized_layer.py ├── vit_infer.py ├── bert_infer.py ├── README.md └── benckmark_logs │ ├── deit-s_power.csv │ ├── bert-b-128_power.csv │ ├── bert-b-384_power.csv │ ├── deit-t_power.csv │ ├── bert-l-320_power.csv │ └── deit-b-power.csv ├── software_model ├── models │ └── __init__.py ├── ops │ ├── __init__.py │ ├── simulator.py │ └── _quant_base.py ├── hubconf.py ├── deit_t_sweep_wavelength.csv ├── deit_t_sweep_input_noise_std.csv ├── deit_t_sweep_phase_noise_std.csv ├── scripts │ ├── process_output_logs.sh │ ├── train_quant_transformer_with_noise.sh │ ├── evaluate_quant_transformer.sh │ └── evaluate_quant_transformer_scan_noise.sh ├── process_logs.py ├── samplers.py ├── losses.py ├── augment.py ├── engine.py ├── datasets.py ├── resmlp_models.py ├── models.py ├── utils.py └── readme.md ├── HPCA24_LT_poster_v1_02.pdf ├── hardware_simulator ├── params │ ├── models │ │ ├── bert_base.yaml │ │ ├── bert_large.yaml │ │ ├── deit_base.yaml │ │ ├── deit_small.yaml │ │ └── deit_tiny.yaml │ └── device_params │ │ ├── Dota_B_4bit.yaml │ │ ├── Dota_B_8bit.yaml │ │ ├── Dota_L_4bit.yaml │ │ ├── Dota_L_8bit.yaml │ │ ├── Bs_mrr_bank_4bit.yaml │ │ ├── Bs_mrr_bank_8bit.yaml │ │ ├── Bs_mzi_4bit.yaml │ │ ├── Bs_mzi_8bit.yaml │ │ └── default.yaml ├── hardware │ ├── __init__.py │ ├── SRAM.py │ ├── ADC.py │ ├── DAC.py │ └── photonic_core_base.py ├── scripts │ ├── energy_latency_onns_deit_t.sh │ ├── energy_latency_single.sh │ ├── energy_latency_onns_deit.sh │ ├── area_power_all.sh │ └── energy_latency_all.sh ├── utils │ ├── __init__.py │ ├── config.py │ ├── model.py │ └── cal_flops_for_transformer.py ├── entry_energy_latency_workload.py └── readme.md ├── readme.md └── .gitignore /profile/power_results/bert-l-320.csv: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /profile/power_monitor.sh: -------------------------------------------------------------------------------- 1 | nvidia-smi dmon -s puc -d 1 -i 0 -------------------------------------------------------------------------------- /software_model/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .quant_vit import * -------------------------------------------------------------------------------- /HPCA24_LT_poster_v1_02.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuhanqing/Lightening-Transformer-AE/HEAD/HPCA24_LT_poster_v1_02.pdf -------------------------------------------------------------------------------- /hardware_simulator/params/models/bert_base.yaml: -------------------------------------------------------------------------------- 1 | model 2 | name: "bert_base" 3 | depth: 12 4 | num_heads: 12 5 | embed_dim: 768 6 | mlp_ratio: 4 7 | tokens: 128 -------------------------------------------------------------------------------- /hardware_simulator/params/models/bert_large.yaml: -------------------------------------------------------------------------------- 1 | model 2 | name: "bert_large" 3 | depth: 24 4 | num_heads: 12 5 | embed_dim: 768 6 | mlp_ratio: 4 7 | tokens: 384 -------------------------------------------------------------------------------- /hardware_simulator/params/models/deit_base.yaml: -------------------------------------------------------------------------------- 1 | model 2 | name: "deit-s" 3 | 
patch: 16 4 | depth: 12 5 | embed_dim: 768 6 | num_heads: 12 7 | mlp_ratio: 4 8 | tokens: 197 -------------------------------------------------------------------------------- /hardware_simulator/params/models/deit_small.yaml: -------------------------------------------------------------------------------- 1 | model 2 | name: "deit-s" 3 | patch: 16 4 | depth: 12 5 | embed_dim: 368 6 | num_heads: 6 7 | mlp_ratio: 4 8 | tokens: 197 -------------------------------------------------------------------------------- /hardware_simulator/params/models/deit_tiny.yaml: -------------------------------------------------------------------------------- 1 | model 2 | name: "deit-t" 3 | patch: 16 4 | depth: 12 5 | embed_dim: 192 6 | num_heads: 3 7 | mlp_ratio: 4 8 | tokens: 197 -------------------------------------------------------------------------------- /software_model/ops/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu 3 | # @Date: 2023-01-02 21:13:44 4 | # @Last Modified by: Hanqing Zhu 5 | # @Last Modified time: 2023-03-28 16:41:34 6 | from .quantize import * 7 | from .simulator import * -------------------------------------------------------------------------------- /software_model/hubconf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved. 3 | from models import * 4 | from cait_models import * 5 | from resmlp_models import * 6 | #from patchconvnet_models import * 7 | 8 | dependencies = ["torch", "torchvision", "timm"] 9 | -------------------------------------------------------------------------------- /software_model/deit_t_sweep_wavelength.csv: -------------------------------------------------------------------------------- 1 | test1,test2,test3,mean,std 2 | 71.174,71.014,70.99,71.05933333333333,0.10002666311206546 3 | 71.052,71.1,70.972,71.04133333333333,0.06466323014923916 4 | 71.034,70.924,70.924,70.96066666666667,0.06350852961085851 5 | 70.99,70.952,71.144,71.02866666666667,0.10167267741794891 6 | 71.206,70.82,71.184,71.07,0.21678560837842034 7 | -------------------------------------------------------------------------------- /hardware_simulator/hardware/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu 3 | # @Date: 2023-02-25 11:30:16 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-08 00:46:22 6 | from .photonic_crossbar import * 7 | from .photonic_mrr_bank import * 8 | from .photonic_MZI import * 9 | from .SRAM import * 10 | from .ADC import * 11 | from .DAC import * -------------------------------------------------------------------------------- /software_model/deit_t_sweep_input_noise_std.csv: -------------------------------------------------------------------------------- 1 | test1,test2,test3,mean,std 2 | 71.052,71.1,70.972,71.04133333333333,0.06466323014923916 3 | 71.044,70.86,71.048,70.984,0.10740577265678043 4 | 71.002,70.762,70.938,70.90066666666667,0.12427925544246267 5 | 70.844,70.674,70.672,70.73,0.09873196037757545 6 | 70.724,70.532,70.518,70.59133333333334,0.11510574848083822 7 | 70.61,70.334,70.434,70.45933333333333,0.13973307888017394 8 | -------------------------------------------------------------------------------- /software_model/deit_t_sweep_phase_noise_std.csv: 
-------------------------------------------------------------------------------- 1 | test1,test2,test3,mean,std 2 | 71.052,71.1,70.972,71.04133333333333,0.06466323014923916 3 | 70.964,70.942,70.982,70.96266666666666,0.020033305601758828 4 | 71.12,70.934,71.062,71.03866666666667,0.09517002329165371 5 | 71.116,70.938,71.206,71.08666666666667,0.13638670511942627 6 | 71.086,70.884,71.13,71.03333333333333,0.13118434865994025 7 | 71.168,71.022,71.038,71.07600000000001,0.08007496487667297 8 | -------------------------------------------------------------------------------- /hardware_simulator/scripts/energy_latency_onns_deit_t.sh: -------------------------------------------------------------------------------- 1 | exp='energy_latency_compare_onns_deit_t' 2 | model_name='deit-t' 3 | tokens=197 4 | declare -A config_dict 5 | config_dict=( 6 | ['dota_b_4bit']='./params/device_params/Dota_B_4bit.yaml' 7 | # ['mrr_4bit']='./params/device_params/Bs_mrr_bank_4bit.yaml' 8 | # ['mzi_4bit']='./params/device_params/Bs_mzi_4bit.yaml' 9 | ) 10 | 11 | for key in "${!config_dict[@]}" 12 | do 13 | # Get the value associated with the key 14 | onn_params="${config_dict[$key]}" 15 | 16 | python entry_energy_latency_workload.py \ 17 | -e ${exp} \ 18 | --tokens ${tokens} \ 19 | --model_name ${model_name} \ 20 | --config ${onn_params} \ 21 | -o 'broadcast' 22 | done -------------------------------------------------------------------------------- /profile/power_results/power_usage.csv: -------------------------------------------------------------------------------- 1 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 2 | # Idx W C C % % % % % % MHz MHz 3 | 0 61 38 54 0 0 0 0 0 0 1593 210 4 | 0 61 38 54 0 0 0 0 0 0 1593 210 5 | 0 61 38 53 0 0 0 0 0 0 1593 210 6 | 0 61 38 53 0 0 0 0 0 0 1593 210 7 | 0 61 38 53 0 0 0 0 0 0 1593 210 8 | 0 61 38 54 0 0 0 0 0 0 1593 210 9 | 0 61 38 54 0 0 0 0 0 0 1593 210 10 | -------------------------------------------------------------------------------- /hardware_simulator/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu 3 | # @Date: 2023-02-23 22:51:07 4 | # @Last Modified by: Hanqing Zhu 5 | # @Last Modified time: 2023-02-23 22:51:23 6 | import importlib 7 | import os 8 | 9 | # automatically import any Python files in this directory 10 | for file in sorted(os.listdir(os.path.dirname(__file__))): 11 | if file.endswith(".py") and not file.startswith("_"): 12 | source = file[: file.find(".py")] 13 | module = importlib.import_module("utils." 
+ source) 14 | if "__all__" in module.__dict__: 15 | names = module.__dict__["__all__"] 16 | else: 17 | # import all names that do not begin with _ 18 | names = [x for x in module.__dict__ if not x.startswith("_")] 19 | globals().update({k: getattr(module, k) for k in names}) -------------------------------------------------------------------------------- /hardware_simulator/scripts/energy_latency_single.sh: -------------------------------------------------------------------------------- 1 | exp='energy_latency_single_workload' 2 | model_name='deit-t' 3 | tokens=197 4 | onn_params='./params/device_params/Dota_B_4bit.yaml' 5 | # choose onn accelerator params from 6 | # config_dict=( 7 | # ['dota_b_4bit']='./params/device_params/Dota_B_4bit.yaml' 8 | # ['dota_b_8bit']='./params/device_params/Dota_B_8bit.yaml' 9 | # ['dota_l_4bit']='./params/device_params/Dota_L_4bit.yaml' 10 | # ['dota_l_8bit']='./params/device_params/Dota_L_8bit.yaml' 11 | # ['mrr_4bit']='./params/device_params/Bs_mrr_bank_4bit.yaml' 12 | # ['mrr_8bit']='./params/device_params/Bs_mrr_bank_8bit.yaml' 13 | # ['mzi_4bit']='./params/device_params/Bs_mzi_4bit.yaml' 14 | # ['mzi_8bit']='./params/device_params/Bs_mzi_8bit.yaml' 15 | # ) 16 | 17 | 18 | python entry_energy_latency_workload.py \ 19 | -e ${exp} \ 20 | --tokens ${tokens} \ 21 | --model_name ${model_name} \ 22 | --config ${onn_params} 23 | -------------------------------------------------------------------------------- /hardware_simulator/params/device_params/Dota_B_4bit.yaml: -------------------------------------------------------------------------------- 1 | # @Author: Hanqing Zhu 2 | # @Date: 2023-02-23 22:53:17 3 | # @Last Modified by: Hanqing Zhu 4 | # @Last Modified time: 2023-03-24 23:07:35 5 | # device level 6 | 7 | core: 8 | type: "dota" 9 | width: 12 10 | height: 12 11 | num_wavelength: 12 12 | work_freq: 5 13 | interface: 14 | ADC: 15 | choice: 1 16 | sharing_factor: 1 17 | DAC: 18 | choice: 1 19 | TIA: 20 | power: 3 21 | area: 50 22 | precision: 23 | in_bit: 4 24 | w_bit: 4 25 | act_bit: 4 26 | 27 | arch: 28 | num_tiles: 4 29 | num_pe_per_tile: 2 30 | full_range_support_factor: 1 # pe support full range or not -> 2: one operand is positive only -> 4 all operands are positive only 31 | ### unique arch params for our DOTA 32 | time_accum_factor: 3 33 | input_mod_sharing_flag: 1 # whether input is globally shared cross tiles 34 | adc_share_flag: 1 # multiple PEs share one adc array -------------------------------------------------------------------------------- /hardware_simulator/params/device_params/Dota_B_8bit.yaml: -------------------------------------------------------------------------------- 1 | # @Author: Hanqing Zhu 2 | # @Date: 2023-02-23 22:53:17 3 | # @Last Modified by: Hanqing Zhu 4 | # @Last Modified time: 2023-03-24 23:07:35 5 | # device level 6 | 7 | core: 8 | type: "dota" 9 | width: 12 10 | height: 12 11 | num_wavelength: 12 12 | work_freq: 5 13 | interface: 14 | ADC: 15 | choice: 1 16 | sharing_factor: 1 17 | DAC: 18 | choice: 1 19 | TIA: 20 | power: 3 21 | area: 50 22 | precision: 23 | in_bit: 8 24 | w_bit: 8 25 | act_bit: 8 26 | 27 | arch: 28 | num_tiles: 4 29 | num_pe_per_tile: 2 30 | full_range_support_factor: 1 # pe support full range or not -> 2: one operand is positive only -> 4 all operands are positive only 31 | ### unique arch params for our DOTA 32 | time_accum_factor: 3 33 | input_mod_sharing_flag: 1 # whether input is globally shared cross tiles 34 | adc_share_flag: 1 # multiple PEs share one adc array 
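The `Dota_B_*.yaml` files above describe one DOTA accelerator configuration: a 12x12 photonic core driven by 12 wavelengths, 4 tiles with 2 PEs each, and a `work_freq` of 5 (GHz, matching the ADC/DAC models in `hardware/`). As a rough illustration of how such a config can be read outside the simulator, the sketch below loads the YAML and prints a few aggregate quantities. This is illustrative only: it assumes PyYAML is installed and that the script runs from `hardware_simulator/`; it does not reproduce the simulator's own cost model, which is driven through `entry_energy_latency_workload.py`.

```python
# Illustrative only: load a DOTA device config and report aggregate sizes.
# Assumes PyYAML is available and the working directory is hardware_simulator/.
import yaml

with open("params/device_params/Dota_B_8bit.yaml") as f:
    cfg = yaml.safe_load(f)

core, arch = cfg["core"], cfg["arch"]
prec = core["precision"]

num_pes = arch["num_tiles"] * arch["num_pe_per_tile"]   # 4 tiles x 2 PEs = 8 PEs
cycle_ns = 1.0 / core["work_freq"]                      # work_freq assumed to be in GHz -> 0.2 ns

print(f"{num_pes} PEs of size {core['height']}x{core['width']}, "
      f"{core['num_wavelength']} wavelengths, "
      f"{prec['in_bit']}b inputs / {prec['w_bit']}b weights, cycle {cycle_ns:.2f} ns")
```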
-------------------------------------------------------------------------------- /hardware_simulator/params/device_params/Dota_L_4bit.yaml: -------------------------------------------------------------------------------- 1 | # @Author: Hanqing Zhu 2 | # @Date: 2023-02-23 22:53:17 3 | # @Last Modified by: Hanqing Zhu 4 | # @Last Modified time: 2023-03-24 23:07:35 5 | # device level 6 | 7 | core: 8 | type: "dota" 9 | width: 12 10 | height: 12 11 | num_wavelength: 12 12 | work_freq: 5 13 | interface: 14 | ADC: 15 | choice: 1 16 | sharing_factor: 1 17 | DAC: 18 | choice: 1 19 | TIA: 20 | power: 3 21 | area: 50 22 | precision: 23 | in_bit: 4 24 | w_bit: 4 25 | act_bit: 4 26 | 27 | arch: 28 | num_tiles: 8 29 | num_pe_per_tile: 2 30 | full_range_support_factor: 1 # pe support full range or not -> 2: one operand is positive only -> 4 all operands are positive only 31 | ### unique arch params for our DOTA 32 | time_accum_factor: 3 33 | input_mod_sharing_flag: 1 # whether input is globally shared cross tiles 34 | adc_share_flag: 1 # multiple PEs share one adc array -------------------------------------------------------------------------------- /hardware_simulator/params/device_params/Dota_L_8bit.yaml: -------------------------------------------------------------------------------- 1 | # @Author: Hanqing Zhu 2 | # @Date: 2023-02-23 22:53:17 3 | # @Last Modified by: Hanqing Zhu 4 | # @Last Modified time: 2023-03-24 23:07:35 5 | # device level 6 | 7 | core: 8 | type: "dota" 9 | width: 12 10 | height: 12 11 | num_wavelength: 12 12 | work_freq: 5 13 | interface: 14 | ADC: 15 | choice: 1 16 | sharing_factor: 1 17 | DAC: 18 | choice: 1 19 | TIA: 20 | power: 3 21 | area: 50 22 | precision: 23 | in_bit: 8 24 | w_bit: 8 25 | act_bit: 8 26 | 27 | arch: 28 | num_tiles: 8 29 | num_pe_per_tile: 2 30 | full_range_support_factor: 1 # pe support full range or not -> 2: one operand is positive only -> 4 all operands are positive only 31 | ### unique arch params for our DOTA 32 | time_accum_factor: 3 33 | input_mod_sharing_flag: 1 # whether input is globally shared cross tiles 34 | adc_share_flag: 1 # multiple PEs share one adc array -------------------------------------------------------------------------------- /hardware_simulator/scripts/energy_latency_onns_deit.sh: -------------------------------------------------------------------------------- 1 | exp='energy_latency_onns_deit' 2 | # model_name='deit-t' 3 | tokens=197 4 | declare -A config_dict 5 | config_dict=( 6 | ['dota_b_4bit']='./params/device_params/Dota_B_4bit.yaml' 7 | ['mrr_4bit']='./params/device_params/Bs_mrr_bank_4bit.yaml' 8 | ['mzi_4bit']='./params/device_params/Bs_mzi_4bit.yaml' 9 | ['dota_b_8bit']='./params/device_params/Dota_B_8bit.yaml' 10 | ['mrr_8bit']='./params/device_params/Bs_mrr_bank_8bit.yaml' 11 | ['mzi_8bit']='./params/device_params/Bs_mzi_8bit.yaml' 12 | ) 13 | 14 | for model_name in 'deit-t' 'deit-b' 15 | do 16 | for key in "${!config_dict[@]}" 17 | do 18 | # Get the value associated with the key 19 | onn_params="${config_dict[$key]}" 20 | 21 | python entry_energy_latency_workload.py \ 22 | -e ${exp} \ 23 | --tokens ${tokens} \ 24 | --model_name ${model_name} \ 25 | --config ${onn_params} 26 | done 27 | done -------------------------------------------------------------------------------- /hardware_simulator/params/device_params/Bs_mrr_bank_4bit.yaml: -------------------------------------------------------------------------------- 1 | # @Author: Hanqing Zhu 2 | # @Date: 2023-02-23 22:53:17 3 | # @Last Modified by: Hanqing Zhu 4 | # 
@Last Modified time: 2023-03-20 15:23:26 5 | # device level 6 | device: 7 | mrr_modulator: 8 | type: 'ring' 9 | energy_per_bit: 42 10 | static_power: 1.2 #mW 11 | length: 9.66 12 | width: 9.66 13 | insertion_loss: 0.95 # db 14 | insertion_loss_uc: 0.1 # db uncoupled ring loss 15 | 16 | core: 17 | type: "mrrbank" 18 | width: 12 19 | height: 12 20 | num_wavelength: 12 21 | work_freq: 5 22 | interface: 23 | ADC: 24 | choice: 1 25 | sharing_factor: 1 26 | DAC: 27 | choice: 1 28 | TIA: 29 | power: 3 30 | area: 50 31 | precision: 32 | in_bit: 4 33 | w_bit: 4 34 | act_bit: 4 35 | 36 | arch: 37 | num_tiles: 7 38 | num_pe_per_tile: 2 39 | full_range_support_factor: 2 # add-drop ring, only support full-range weights 40 | weight_reuse_factor: -1 # set to -1 means fully weight-stationary dataflow -------------------------------------------------------------------------------- /hardware_simulator/params/device_params/Bs_mrr_bank_8bit.yaml: -------------------------------------------------------------------------------- 1 | # @Author: Hanqing Zhu 2 | # @Date: 2023-02-23 22:53:17 3 | # @Last Modified by: Hanqing Zhu 4 | # @Last Modified time: 2023-03-20 15:23:26 5 | # device level 6 | device: 7 | mrr_modulator: 8 | type: 'ring' 9 | energy_per_bit: 42 10 | static_power: 1.2 #mW 11 | length: 9.66 12 | width: 9.66 13 | insertion_loss: 0.95 # db 14 | insertion_loss_uc: 0.1 # db uncoupled ring loss 15 | 16 | core: 17 | type: "mrrbank" 18 | width: 12 19 | height: 12 20 | num_wavelength: 12 21 | work_freq: 5 22 | interface: 23 | ADC: 24 | choice: 1 25 | sharing_factor: 1 26 | DAC: 27 | choice: 1 28 | TIA: 29 | power: 3 30 | area: 50 31 | precision: 32 | in_bit: 8 33 | w_bit: 8 34 | act_bit: 8 35 | 36 | arch: 37 | num_tiles: 7 38 | num_pe_per_tile: 2 39 | full_range_support_factor: 2 # add-drop ring, only support full-range weights 40 | weight_reuse_factor: -1 # set to -1 means fully weight-stationary dataflow -------------------------------------------------------------------------------- /software_model/scripts/process_output_logs.sh: -------------------------------------------------------------------------------- 1 | # This is the scripts to process the saved log file from evaluate_quant_transformer_scan_noise.sh 2 | # It will generate a csv file to give you the accurcay mean and std of multiple runs 3 | 4 | # set the log file directory 5 | 6 | ## params when you parse logs for sweep_wavelength 7 | log_file='./logs/deit_t_sweep_input_noise_std.log' 8 | num_iters=3 # number of runs you launch for accurcay test 9 | num_vars=6 # how many variations you sweep 10 | 11 | # ## params when you parse logs for sweep input noise std 12 | # log_file='./logs/deit_t_sweep_input_noise_std.log' 13 | # num_iters=3 # number of runs you launch for accurcay test 14 | # num_vars=6 # how many variations you sweep 15 | 16 | # ## params when you parse logs for sweep input noise std 17 | # log_file='./logs/deit_t_sweep_phase_noise_std.log' 18 | # num_iters=3 # number of runs you launch for accurcay test 19 | # num_vars=6 # how many variations you sweep 20 | 21 | python process_logs.py \ 22 | --file ${log_file} \ 23 | --iters ${num_iters} \ 24 | --num_vars ${num_vars} -------------------------------------------------------------------------------- /hardware_simulator/scripts/area_power_all.sh: -------------------------------------------------------------------------------- 1 | ## scripts to generate area and power estimation of our optical accelerator system. 
2 | ## It will save the results to ./results/{exp_name}/ 3 | ## dota is our circuit 4 | ## we will also generate area report for the optical baselines 5 | 6 | declare -A config_dict 7 | config_dict=( 8 | ['dota_b_4bit']='./params/device_params/Dota_B_4bit.yaml' 9 | ['dota_b_8bit']='./params/device_params/Dota_B_8bit.yaml' 10 | ['dota_l_4bit']='./params/device_params/Dota_L_4bit.yaml' 11 | ['dota_l_8bit']='./params/device_params/Dota_L_8bit.yaml' 12 | ['mrr_4bit']='./params/device_params/Bs_mrr_bank_4bit.yaml' 13 | ['mzi_4bit']='./params/device_params/Bs_mzi_4bit.yaml' 14 | ) 15 | 16 | 17 | exp='area_power_all' 18 | 19 | # Iterate through the keys in the config_dict 20 | for key in "${!config_dict[@]}" 21 | do 22 | # Get the value associated with the key 23 | value="${config_dict[$key]}" 24 | 25 | # launch the are and power estimation .py 26 | python entry_area_power_profile.py \ 27 | -e ${exp} \ 28 | --config "$value" 29 | done -------------------------------------------------------------------------------- /hardware_simulator/params/device_params/Bs_mzi_4bit.yaml: -------------------------------------------------------------------------------- 1 | # @Author: Hanqing Zhu 2 | # @Date: 2023-02-23 22:53:17 3 | # @Last Modified by: Hanqing Zhu 4 | # @Last Modified time: 2023-03-20 15:23:26 5 | # device level 6 | device: 7 | mzi_modulator: # input modulation 8 | type: 'mzi' 9 | energy_per_bit: 450 # fJ/bit # 150 10 | static_power: 0 # 0 mW 11 | length: 260 12 | width: 20 13 | insertion_loss: 1.2 14 | mzi: # mzi for mzi mesh 15 | type: 'mzi' 16 | energy_per_bit: 450 # fJ/bit 17 | static_power: 0 # 0 mW 18 | length: 180 # phase shifter plus directional coupler plus spacing 19 | width: 100 20 | insertion_loss: 0.99 # two directional coupler and one phase shifter 21 | response_time: 2.0e-3 # 2us scale to ms 22 | 23 | core: 24 | type: "mzi" 25 | width: 12 26 | height: 12 27 | work_freq: 5 28 | interface: 29 | ADC: 30 | choice: 1 31 | sharing_factor: 1 32 | DAC: 33 | choice: 1 34 | TIA: 35 | power: 3 36 | area: 50 37 | precision: 38 | in_bit: 4 39 | w_bit: 4 40 | act_bit: 4 41 | 42 | arch: 43 | num_tiles: 4 44 | num_pe_per_tile: 2 45 | full_range_support_factor: 1 46 | weight_reuse_factor: -1 -------------------------------------------------------------------------------- /hardware_simulator/params/device_params/Bs_mzi_8bit.yaml: -------------------------------------------------------------------------------- 1 | # @Author: Hanqing Zhu 2 | # @Date: 2023-02-23 22:53:17 3 | # @Last Modified by: Hanqing Zhu 4 | # @Last Modified time: 2023-03-20 15:23:26 5 | # device level 6 | device: 7 | mzi_modulator: # input modulation 8 | type: 'mzi' 9 | energy_per_bit: 450 # fJ/bit # 150 10 | static_power: 0 # 0 mW 11 | length: 260 12 | width: 20 13 | insertion_loss: 1.2 14 | mzi: # mzi for mzi mesh 15 | type: 'mzi' 16 | energy_per_bit: 450 # fJ/bit 17 | static_power: 0 # 0 mW 18 | length: 180 # phase shifter plus directional coupler plus spacing 19 | width: 100 20 | insertion_loss: 0.99 # two directional coupler and one phase shifter 21 | response_time: 2.0e-3 # 2us scale to ms 22 | 23 | core: 24 | type: "mzi" 25 | width: 12 26 | height: 12 27 | work_freq: 5 28 | interface: 29 | ADC: 30 | choice: 1 31 | sharing_factor: 1 32 | DAC: 33 | choice: 1 34 | TIA: 35 | power: 3 36 | area: 50 37 | precision: 38 | in_bit: 8 39 | w_bit: 8 40 | act_bit: 8 41 | 42 | arch: 43 | num_tiles: 4 44 | num_pe_per_tile: 2 45 | full_range_support_factor: 1 46 | weight_reuse_factor: -1 
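One detail worth noting in the MZI baseline configs above: the mesh is reprogrammed through thermal phase shifters with a roughly 2 us response time (`response_time: 2.0e-3`, stored in ms per the inline comment), which is consistent with modeling the baseline as fully weight-stationary (`weight_reuse_factor: -1`). The sketch below is back-of-the-envelope arithmetic only, assuming `work_freq` is in GHz; it is not the simulator's latency model.

```python
# Illustrative arithmetic: how many compute cycles one MZI-mesh weight update
# spans, using the values from Bs_mzi_*.yaml.
response_time_ms = 2.0e-3          # response_time field (ms) -> 2 us
work_freq_ghz = 5                  # work_freq field, assumed GHz

response_time_s = response_time_ms * 1e-3
cycles_per_update = response_time_s * work_freq_ghz * 1e9
print(f"{cycles_per_update:.0f} compute cycles per mesh reprogramming")  # 10000
```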
-------------------------------------------------------------------------------- /software_model/scripts/train_quant_transformer_with_noise.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # @Author: Hanqing Zhu(hqzhu@utexas.edu) 3 | # @Date: 1969-12-31 18:00:00 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-09 03:03:06 6 | wbits=4 7 | abits=4 8 | id=4bit 9 | lr=5e-4 10 | weight_decay=1e-8 11 | batch_size=512 12 | epochs=300 13 | port=47771 14 | headwise=1 15 | input_noise_std=0.03 16 | output_noise_std=0.05 17 | 18 | torchrun \ 19 | --master_port ${port} \ 20 | --nproc_per_node=4 main.py \ 21 | --model deit_tiny_patch16_224_quant \ 22 | --drop-path 0 \ 23 | --batch-size ${batch_size} \ 24 | --lr ${lr} \ 25 | --min-lr 0 \ 26 | --epochs ${epochs} \ 27 | --warmup-epochs 0 \ 28 | --weight-decay ${weight_decay} \ 29 | --wbits ${wbits} \ 30 | --abits ${abits} \ 31 | --dist-eval \ 32 | --output_dir test/deit_tiny_${id}/${wbits}w${abits}a_bs${batch_size}_baselr${lr}_weightdecay${weight_decay}_ft${epochs}_headwise${headwise}_noise_i_${input_noise_std}_o_${output_noise_std}_linear_noise \ 33 | --finetune pretrained/deit_tiny_patch16_224-a1311bcf.pth \ 34 | --data-path /home/usr1/zixuan/ImageNet/data \ 35 | --headwise \ 36 | --input_noise_std ${input_noise_std} \ 37 | --output_noise_std ${output_noise_std} \ 38 | --enable_linear_noise -------------------------------------------------------------------------------- /hardware_simulator/scripts/energy_latency_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Define the experiment type 4 | exp='energy_latency_all' 5 | 6 | # Define the config_dict with possible values 7 | declare -A config_dict 8 | config_dict=( 9 | ['dota_b_4bit']='./params/device_params/Dota_B_4bit.yaml' 10 | ['dota_b_8bit']='./params/device_params/Dota_B_8bit.yaml' 11 | ['dota_l_4bit']='./params/device_params/Dota_L_4bit.yaml' 12 | ['dota_l_8bit']='./params/device_params/Dota_L_8bit.yaml' 13 | ) 14 | 15 | # Define the workload_dict with possible values 16 | declare -A workload_dict 17 | workload_dict=( 18 | ['deit-t']='197' 19 | ['deit-s']='197' 20 | ['deit-b']='197' 21 | ['bert-b']='128' 22 | ['bert-l']='320' 23 | ) 24 | 25 | # Loop through the workload_dict 26 | for model_name in "${!workload_dict[@]}" 27 | do 28 | # Get the value associated with the key 29 | tokens="${workload_dict[$model_name]}" 30 | 31 | # Loop through the config_dict 32 | for onn in "${!config_dict[@]}" 33 | do 34 | onn_params="${config_dict[$onn]}" 35 | 36 | # Call your Python script with the arguments 37 | python entry_energy_latency_workload.py \ 38 | -e "${exp}" \ 39 | --tokens "${tokens}" \ 40 | --model_name "${model_name}" \ 41 | --config "${onn_params}" 42 | done 43 | done -------------------------------------------------------------------------------- /software_model/scripts/evaluate_quant_transformer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # @Author: Hanqing Zhu 3 | # @Date: 2023-01-04 22:18:40 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-10 16:51:54 6 | exp='eval_accuracy' 7 | wbits=4 8 | abits=4 9 | id=4bit 10 | headwise=1 11 | 12 | # noise settings 13 | input_noise_std=0.03 14 | output_noise_std=0.05 15 | # following setting is added for inference only 16 | phase_noise_std=2 17 | num_wavelength=12 18 | channel_spacing=0.4 19 | 
seed=0 20 | 21 | resumed_ckpt_path='./resumed_ckpt/best_checkpoint.pth' 22 | 23 | for i in {1..1} 24 | do 25 | for input_noise_std in 0.03 26 | do 27 | CUDA_VISIBLE_DEVICES=0 python main.py --eval \ 28 | --resume ${resumed_ckpt_path} \ 29 | --model deit_tiny_patch16_224_quant \ 30 | --drop-path 0 \ 31 | --wbits ${wbits} \ 32 | --abits ${abits} \ 33 | --data-path /home/usr1/zixuan/ImageNet/data \ 34 | --headwise \ 35 | --input_noise_std ${input_noise_std} \ 36 | --output_noise_std ${output_noise_std} \ 37 | --phase_noise_std ${phase_noise_std} \ 38 | --num_wavelength ${num_wavelength} \ 39 | --channel_spacing ${channel_spacing} \ 40 | --seed ${seed+$i} \ 41 | --enable_wdm_noise \ 42 | --enable_linear_noise 43 | done 44 | done 45 | 46 | -------------------------------------------------------------------------------- /software_model/ops/simulator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu 3 | # @Date: 2023-03-28 15:37:29 4 | # @Last Modified by: Hanqing Zhu 5 | # @Last Modified time: 2023-03-28 18:58:41 6 | import math 7 | 8 | __all__ = ["cal_coupler_wdm_error_list"] 9 | 10 | def cal_coupler_wdm_error_list(num_wavelength, channel_spacing): 11 | channel_spacing = channel_spacing *1e-3 12 | error_list = [] # 2 * kappa - 1 13 | 14 | def coupling_length(w, g=100): 15 | a = -5.44 16 | b = 3.53 17 | c = 0.185 18 | d = 0.15 19 | 20 | L_c = (a * (w - 1.55) + b) * math.exp(g / 1000 / (c * (w - 1.55) + d)) 21 | 22 | return L_c 23 | odd_num_wavelength = True if num_wavelength % 2 == 1 else False 24 | 25 | for wave_length in range(num_wavelength): 26 | if odd_num_wavelength: 27 | wave_length = 1.55 + channel_spacing * (wave_length - (num_wavelength // 2)) 28 | else: 29 | if wave_length < num_wavelength // 2: 30 | wave_length = 1.55 + channel_spacing * (wave_length - (num_wavelength // 2)) 31 | else: 32 | wave_length = 1.55 + channel_spacing * (wave_length - (num_wavelength // 2) + 1) 33 | kappa = math.sin(math.pi / 4 * coupling_length(1.55) / coupling_length(wave_length)) ** 2 34 | error_list.append(2 * kappa - 1) 35 | 36 | return error_list 37 | -------------------------------------------------------------------------------- /software_model/scripts/evaluate_quant_transformer_scan_noise.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # @Author: Hanqing Zhu 3 | # @Date: 2023-01-04 22:18:40 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-10 16:52:06 6 | exp='eval_accuracy_scan_noise' 7 | wbits=4 8 | abits=4 9 | id=4bit 10 | headwise=1 11 | 12 | # noise settings 13 | input_noise_std=0.03 14 | output_noise_std=0.05 15 | # following setting is added for inference only 16 | phase_noise_std=2 17 | num_wavelength=12 18 | channel_spacing=0.4 19 | seed=0 20 | 21 | resumed_ckpt_path='./resumed_ckpt/best_checkpoint.pth' 22 | 23 | 24 | for i in {1..3} 25 | do 26 | # for input_noise_std in 0.03 0.04 0.05 0.06 0.07 0.08 ## uncomment this line when scanning input noise 27 | # for phase_noise_std in 2 3 4 5 6 7 ## uncomment this line when scanning phase noise 28 | for num_wavelength in 8 12 16 20 24 ## uncomment this line when scanning # wavelength 29 | do 30 | CUDA_VISIBLE_DEVICES=2 python main.py --eval \ 31 | --resume ${resumed_ckpt_path} \ 32 | --model deit_tiny_patch16_224_quant \ 33 | --drop-path 0 \ 34 | --wbits ${wbits} \ 35 | --abits ${abits} \ 36 | --data-path /home/usr1/zixuan/ImageNet/data \ 37 | --headwise \ 38 | 
--input_noise_std ${input_noise_std} \ 39 | --output_noise_std ${output_noise_std} \ 40 | --phase_noise_std ${phase_noise_std} \ 41 | --num_wavelength ${num_wavelength} \ 42 | --channel_spacing ${channel_spacing} \ 43 | --seed ${seed+$i} \ 44 | --enable_wdm_noise \ 45 | --enable_linear_noise 46 | done 47 | done 48 | 49 | -------------------------------------------------------------------------------- /profile/customized_layer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | 4 | """ 5 | three normalization variants without elementwise_affine transformation 6 | normlize along the last dimension 7 | """ 8 | 9 | decorator = torch.compile 10 | # decorator = torch.jit.script 11 | 12 | 13 | @decorator 14 | def layer_norm(x: Tensor, eps: float): 15 | x_mean = x.mean(dim=-1, keepdim=True) 16 | x_var = x.var(dim=-1, keepdim=True, correction=0) 17 | return (x - x_mean) * torch.rsqrt(x_var + eps) 18 | 19 | 20 | @decorator 21 | def rms_norm(x, eps: float): 22 | return x * torch.rsqrt(x.square().mean(dim=-1, keepdim=True) + eps) 23 | 24 | 25 | @decorator 26 | def crms_norm(x, eps: float): 27 | discarded_element = x.sum(dim=-1, keepdim=True) 28 | return x * torch.rsqrt((x.square().sum(dim=-1, keepdim=True) + discarded_element.square()) / (x.shape[-1] + 1) + eps) 29 | 30 | 31 | class CustomizedLayerNorm(torch.nn.LayerNorm): 32 | def forward(self, x: Tensor) -> Tensor: 33 | return layer_norm(x.float(), self.eps).type_as(x) 34 | 35 | 36 | class RMSNorm(torch.nn.LayerNorm): 37 | def forward(self, x: Tensor) -> Tensor: 38 | return rms_norm(x.float(), self.eps).type_as(x) 39 | 40 | 41 | class CRMSNorm(torch.nn.LayerNorm): 42 | def forward(self, x: Tensor) -> Tensor: 43 | return crms_norm(x.float(), self.eps).type_as(x) 44 | 45 | 46 | class LinearZeroMeanOutput(torch.nn.Linear): 47 | def forward(self, x): 48 | zero_mean_weight = self.weight - self.weight.mean(dim=0, keepdim=True) 49 | zero_mean_bias = self.bias - self.bias.mean() 50 | return torch.nn.functional.linear(x, zero_mean_weight, zero_mean_bias) 51 | -------------------------------------------------------------------------------- /hardware_simulator/hardware/SRAM.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu 3 | # @Date: 2023-03-05 19:39:10 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-08 17:16:44 6 | import math 7 | 8 | class SRAM: 9 | def __init__(self, size=2048) -> None: 10 | 11 | # the largest SRAM -> 2MB 12 | self.max_data = size * 1024 * 8 13 | 14 | # HBM to SRAM 15 | self.bandwidth_dram_to_sram = 1024 * 1024 * 1024 * 1024 * 8 # 1TB/s 16 | self.bandwidth_sram = 1 / 0.604347* 64 * 64 * 1024 * 1024 * 1024 * 8 # based on cacti simulation 17 | self.bandwidth_sram_to_rf = 1024 * 1024 * 1024 * 1024 * 8 * 100 # set to inifnity 18 | self.clock_frequency = 500 * 1e6 # 500MHz 19 | 20 | def preload_DRAM_SRAM(self, nums=0, bits=32, bandwidth_ratio=1): 21 | cycle = 0 22 | latency = nums * bits / (self.bandwidth_dram_to_sram * bandwidth_ratio) 23 | cycle = math.ceil(latency * self.clock_frequency) 24 | if nums * bits > self.max_data: 25 | print('Error: loading DRAM to SRAM exceeds SRAM size') 26 | else: 27 | latency = nums * bits / (self.bandwidth_dram_to_sram * bandwidth_ratio) 28 | cycle = math.ceil(latency * self.clock_frequency) 29 | 30 | return cycle 31 | 32 | def load_SRAM_RF(self, nums=0, bits=32, bandwidth_ratio=1): 33 | cycle = 0 
34 | latency = nums * bits / (self.bandwidth_sram_to_rf * bandwidth_ratio) 35 | cycle = math.ceil(latency * self.clock_frequency) 36 | return cycle 37 | 38 | def load_GB_SRAM(self, nums=0, bits=32, bandwidth_ratio=1): 39 | cycle = 0 40 | latency = nums * bits / (self.bandwidth_sram * bandwidth_ratio) 41 | cycle = math.ceil(latency * self.clock_frequency) 42 | return cycle -------------------------------------------------------------------------------- /software_model/process_logs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu 3 | # @Date: 2023-03-21 15:40:36 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-10 10:43:07 6 | import re 7 | import csv 8 | import statistics 9 | import argparse 10 | 11 | parser = argparse.ArgumentParser() 12 | 13 | parser.add_argument("-f", "--file", default="./robustness/sweep_phase_noise_deit_tiny_4bit.log", 14 | help="file") 15 | parser.add_argument("-i", "--iters", default=3, 16 | help="iterations") 17 | parser.add_argument("-n", "--num_vars", default=6, 18 | help="number of variations you sweep") 19 | 20 | args, opts = parser.parse_known_args() 21 | 22 | log_file = args.file 23 | num_iters = int(args.iters) 24 | num_variations = int(args.num_vars) 25 | 26 | 27 | with open(log_file, "r") as file: 28 | log_data = file.read() 29 | 30 | accuracy_pattern = r"\* Acc@1 (\d+\.\d+)" 31 | 32 | accuracy_matches = re.findall(accuracy_pattern, log_data) 33 | 34 | if accuracy_matches: 35 | accuracies = [float(match) for match in accuracy_matches] 36 | print(f"Accuracy: {accuracies}") 37 | else: 38 | print("Accuracy not found in log file.") 39 | 40 | indices = [x*num_variations for x in range(num_iters)] 41 | result = [] 42 | 43 | for i in range(num_variations): 44 | print("**", indices) 45 | tmp = [float(accuracy_matches[i]) for i in indices] 46 | mean = statistics.mean(tmp) 47 | std = statistics.stdev(tmp) 48 | tmp.extend([mean, std]) 49 | result.append(tmp) 50 | indices = [x + 1 for x in indices] 51 | 52 | filename = log_file.split("/")[-1].split(".")[0] + '.csv' 53 | 54 | def save_arrays_to_file(file_name, arrays): 55 | with open(file_name, mode='w', newline='') as file: 56 | writer = csv.writer(file) 57 | writer.writerow(['test1', 'test2', 'test3', 'mean', 'std']) 58 | for array in arrays: 59 | writer.writerow(array) 60 | 61 | save_arrays_to_file(filename, result) -------------------------------------------------------------------------------- /profile/vit_infer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu(hqzhu@utexas.edu) 3 | # @Date: 1969-12-31 18:00:00 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-09 23:05:25 6 | import argparse 7 | import torch 8 | import torch.utils.benchmark as benchmark 9 | from model import PreDefinedViT 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("-m", "--model_name", default="deit-s", 13 | help="model") 14 | 15 | args, opts = parser.parse_known_args() 16 | 17 | image_size = 224 18 | num_classes = 1000 19 | using_torch_compile = False 20 | device = torch.device('cuda') 21 | # device = torch.device('cpu') 22 | 23 | batch_size_list = [1] 24 | num_threads_list = [1] 25 | min_run_time = 100 26 | 27 | model_dict = { 28 | 'deit-t': ['Tiny', 16], 29 | 'deit-s': ['Small', 16], 30 | 'deit-b': ['Base', 16] 31 | } 32 | 33 | results = [] 34 | 35 | 
model_variant = model_dict[args.model_name] 36 | model_name, patch_size = model_variant 37 | for method in ['pre-ln']: 38 | raw_model = PreDefinedViT(image_size=image_size, patch_size=patch_size, num_classes=num_classes, variant=model_name, method=method).to(device) 39 | model = torch.compile(raw_model) if using_torch_compile else raw_model 40 | model.eval() 41 | 42 | with torch.no_grad(): 43 | with torch.cuda.amp.autocast(): 44 | for batch_size in batch_size_list: 45 | for num_threads in num_threads_list: 46 | x = torch.randn(batch_size, 3, image_size, image_size).to(device) 47 | result = benchmark.Timer(stmt='y = model(x)', 48 | setup='from __main__ import model', 49 | globals={'x': x}, 50 | num_threads=num_threads, 51 | sub_label=f'batch_size {batch_size} method {method}', 52 | description=model_name, 53 | ).blocked_autorange(min_run_time=min_run_time) 54 | results.append(result) 55 | print(result) 56 | 57 | compare = benchmark.Compare(results) 58 | compare.print() 59 | -------------------------------------------------------------------------------- /profile/bert_infer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu(hqzhu@utexas.edu) 3 | # @Date: 1969-12-31 18:00:00 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-09 23:42:51 6 | import argparse 7 | import torch 8 | import torch.utils.benchmark as benchmark 9 | from model import PreDefinedBERT 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("-m", "--model_name", default="deit-s", 13 | help="model") 14 | parser.add_argument("-s", "--seq_length", default=128, 15 | help="seq length") 16 | 17 | args, opts = parser.parse_known_args() 18 | 19 | vocab_size = 30528 20 | max_seq_length = 2048 21 | num_classes = 2 22 | using_torch_compile = False 23 | device = torch.device('cuda') 24 | # device = torch.device('cpu') 25 | 26 | batch_size_list = [1] 27 | num_threads_list = [1] 28 | min_run_time = 100 29 | 30 | model_dict = { 31 | 'bert-b': ['Base', 768], 32 | 'bert-l': ['Large', 1024] 33 | } 34 | 35 | results = [] 36 | 37 | model_name, emebedding_size = model_dict[args.model_name] 38 | seq_len = int(args.seq_length) 39 | for method in ['pre-ln']: 40 | raw_model = PreDefinedBERT(vocab_size=vocab_size, max_seq_length=max_seq_length, variant=model_name, method=method, num_classes=num_classes).to(device) 41 | model = torch.compile(raw_model) if using_torch_compile else raw_model 42 | model.eval() 43 | 44 | with torch.no_grad(): 45 | with torch.cuda.amp.autocast(): 46 | for batch_size in batch_size_list: 47 | for num_threads in num_threads_list: 48 | x = torch.randn(batch_size, seq_len, emebedding_size).to(device) 49 | result = benchmark.Timer(stmt='y = model(x)', 50 | setup='from __main__ import model', 51 | globals={'x': x}, 52 | num_threads=num_threads, 53 | sub_label=f'batch_size {batch_size} seq_len {seq_len}', 54 | description=model_name + ' ' + method, 55 | ).blocked_autorange(min_run_time=min_run_time) 56 | results.append(result) 57 | print(result) 58 | 59 | compare = benchmark.Compare(results) 60 | compare.print() 61 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Lightening-Transformer HPCA 2024 AE 2 | This contains the codebases for the main codebases of the paper "Lightening-Transformer: A Dynamically-operated Optically-interconnected Photonic Transformer 
Accelerator".

---

## Usage of the Provided Codebase

We provide three codebases:

* (1) Algorithm code for training/running models on our photonic accelerator, with the analytic transformation of our unique photonic tensor core embedded in the computation process. See `./software_model` for detailed implementation and usage, including the [DeiT](https://arxiv.org/abs/2012.12877) case.

* (2) A hardware simulator for estimating the energy and latency of running Transformers on our photonic accelerator. See `./hardware_simulator` for detailed implementation and usage.

* (3) Profiling code for measuring the latency and power usage of running Transformers on GPUs. See `./profile` for detailed implementation and usage. The implementation follows [NeurIPS'23, Pre-RMSNorm and Pre-CRMSNorm Transformers: Equivalent and Efficient Pre-LN Transformers](https://github.com/zixuanjiang/pre-rmsnorm-transformer).

---

## Required Dependencies

The DeiT code requires PyTorch, torchvision 0.8.1+, and [pytorch-image-models 0.3.2](https://github.com/rwightman/pytorch-image-models).

```
conda create -n test # create a virtual env
conda activate test # activate the test env
conda install pytorch torchvision torchaudio pytorch-cuda=your_cuda_version -c pytorch -c nvidia # install pytorch
pip install timm==0.3.2 torchpack packaging einops gdown
```

With torch 2.0+, you will encounter `ModuleNotFoundError: No module named 'torch._six'` raised from `/path_to_your_conda_envs/your_env_name/lib/python_version/site-packages/timm/models/layers/helpers.py`, because torch 2.0 no longer provides `torch._six`. Please replace the contents of helpers.py with the following:

```
from itertools import repeat
# from torch._six import container_abcs


# From PyTorch internals
def _ntuple(n):
    def parse(x):
        if isinstance(x, str):
            return x
        return tuple(repeat(x, n))
    return parse


to_1tuple = _ntuple(1)
to_2tuple = _ntuple(2)
to_3tuple = _ntuple(3)
to_4tuple = _ntuple(4)
to_ntuple = _ntuple
```

## Reference

[1] Hanqing Zhu, Jiaqi Gu, Hanrui Wang, Zixuan Jiang, Rongxing Tang, Zhekai Zhang, Chenghao Feng, Song Han, Ray T. Chen, and David Z. Pan. "Lightening-Transformer: A Dynamically-operated Optically-interconnected Photonic Transformer Accelerator", IEEE International Symposium on High-Performance Computer Architecture (HPCA'24).
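A small convenience related to the `torch._six` workaround above: the snippet below prints the path of the `helpers.py` file that needs to be replaced, without importing `timm` (a plain `import timm` would fail on torch 2.x before the patch is applied). It is an optional helper using only the standard library, not part of this repository.

```python
# Optional: locate timm's helpers.py without importing the package.
import importlib.util
import os

spec = importlib.util.find_spec("timm")
print(os.path.join(os.path.dirname(spec.origin), "models", "layers", "helpers.py"))
```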
56 | -------------------------------------------------------------------------------- /hardware_simulator/params/device_params/default.yaml: -------------------------------------------------------------------------------- 1 | # @Author: Hanqing Zhu 2 | # @Date: 1969-12-31 18:00:00 3 | # @Last Modified by: Hanqing Zhu 4 | # @Last Modified time: 2023-05-10 22:57:20 5 | # power in mW 6 | device: 7 | mzi_modulator: 8 | type: 'mzi' 9 | energy_per_bit: 450 # fJ/bit # 150 10 | static_power: 0 # 0 mW 11 | length: 260 12 | width: 20 13 | insertion_loss: 1.2 14 | mrr_modulator: 15 | type: 'ring' 16 | energy_per_bit: 42 # fJ/bit -> 42fJ/bit @ 40Gbit 17 | static_power: 1.2 #mW 18 | length: 9.66 19 | width: 9.66 20 | insertion_loss: 0.95 # insertion loss 21 | insertion_loss_uc: 0.1 # uncoupled insertion loss 22 | mrr_router: 23 | static_power: 0.275 24 | length: 4.8 25 | width: 4.8 26 | insertion_loss: 0.93 27 | phase_shifter: 28 | dynamic_power: 0 29 | static_power: 0 30 | insertion_loss: 0.33 31 | length: 100 32 | width: 45 33 | direction_coupler: 34 | insertion_loss: 0.33 35 | length: 5.25 36 | width: 2.4 37 | photo_detector: 38 | power: 1.1 39 | sensitivity: -25 #dbm 40 | length: 4 41 | width: 10 42 | mzi: 43 | type: 'mzi' 44 | energy_per_bit: 450 # fJ/bit 45 | static_power: 0 # 0 mW 46 | length: 180 47 | width: 100 48 | insertion_loss: 0.99 #two directional coupler 0.04 + 2 * 0.33 49 | response_time: 2.0e-3 # 2mus 50 | laser: 51 | power: 23.5 52 | length: 400 53 | width: 300 54 | wall_plug_eff: 0.2 55 | y_branch: 56 | insertion_loss: 0.1 57 | length: 1.8 58 | width: 1.3 59 | micro_comb: 60 | length: 1184 61 | width: 1184 62 | 63 | core: 64 | type: "dota" 65 | width: 12 66 | height: 12 67 | num_wavelength: 12 68 | work_freq: 5 69 | interface: 70 | ADC: 71 | choice: 1 72 | sharing_factor: 1 73 | DAC: 74 | choice: 1 75 | TIA: 76 | power: 3 77 | area: 50 78 | precision: 79 | in_bit: 4 80 | w_bit: 4 81 | act_bit: 4 82 | 83 | arch: 84 | num_tiles: 4 85 | num_pe_per_tile: 2 86 | full_range_support_factor: 1 87 | weight_reuse_factor: -1 88 | ### unique arch params for our DOTA 89 | time_accum_factor: 1 90 | input_mod_sharing_flag: 1 # whether input is globally shared cross tiles 91 | adc_share_flag: 1 # multiple PEs share one adc array 92 | datamovement: # datamovement cost from CACTI: leakage power * access time + dynamic energy per acess * cache access rate 93 | DRAM: 62.4e-9 94 | DRAM_GB: 62.4e-9 95 | GB2: 1.655e-9 # mJ/2byte: we divide the large global SRAM into 32KB banks 96 | GB1: 0.92e-9 # mJ/2byte 97 | NoC: 2.0e-9 # from eyerisis 98 | RF: 0.073e-9 # mJ/2byte 99 | memory_size: 100 | M2_buffer_size: 4096 -------------------------------------------------------------------------------- /software_model/samplers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved. 3 | import torch 4 | import torch.distributed as dist 5 | import math 6 | 7 | 8 | class RASampler(torch.utils.data.Sampler): 9 | """Sampler that restricts data loading to a subset of the dataset for distributed, 10 | with repeated augmentation. 
11 | It ensures that different each augmented version of a sample will be visible to a 12 | different process (GPU) 13 | Heavily based on torch.utils.data.DistributedSampler 14 | """ 15 | 16 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True, num_repeats: int = 3): 17 | if num_replicas is None: 18 | if not dist.is_available(): 19 | raise RuntimeError("Requires distributed package to be available") 20 | num_replicas = dist.get_world_size() 21 | if rank is None: 22 | if not dist.is_available(): 23 | raise RuntimeError("Requires distributed package to be available") 24 | rank = dist.get_rank() 25 | if num_repeats < 1: 26 | raise ValueError("num_repeats should be greater than 0") 27 | self.dataset = dataset 28 | self.num_replicas = num_replicas 29 | self.rank = rank 30 | self.num_repeats = num_repeats 31 | self.epoch = 0 32 | self.num_samples = int(math.ceil(len(self.dataset) * self.num_repeats / self.num_replicas)) 33 | self.total_size = self.num_samples * self.num_replicas 34 | # self.num_selected_samples = int(math.ceil(len(self.dataset) / self.num_replicas)) 35 | self.num_selected_samples = int(math.floor(len(self.dataset) // 256 * 256 / self.num_replicas)) 36 | self.shuffle = shuffle 37 | 38 | def __iter__(self): 39 | if self.shuffle: 40 | # deterministically shuffle based on epoch 41 | g = torch.Generator() 42 | g.manual_seed(self.epoch) 43 | indices = torch.randperm(len(self.dataset), generator=g) 44 | else: 45 | indices = torch.arange(start=0, end=len(self.dataset)) 46 | 47 | # add extra samples to make it evenly divisible 48 | indices = torch.repeat_interleave(indices, repeats=self.num_repeats, dim=0).tolist() 49 | padding_size: int = self.total_size - len(indices) 50 | if padding_size > 0: 51 | indices += indices[:padding_size] 52 | assert len(indices) == self.total_size 53 | 54 | # subsample 55 | indices = indices[self.rank:self.total_size:self.num_replicas] 56 | assert len(indices) == self.num_samples 57 | 58 | return iter(indices[:self.num_selected_samples]) 59 | 60 | def __len__(self): 61 | return self.num_selected_samples 62 | 63 | def set_epoch(self, epoch): 64 | self.epoch = epoch 65 | -------------------------------------------------------------------------------- /hardware_simulator/hardware/ADC.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu 3 | # @Date: 1969-12-31 18:00:00 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-10 23:43:45 6 | import logging 7 | 8 | 9 | ADC_list = { 10 | 1: {'area': 2850, 'prec': 8, 'power': 14.8, 'sample_rate': 10, 'type': 'sar'}, 11 | } 12 | 13 | class ADC(): 14 | def __init__(self, choice=1) -> None: 15 | self.ADC_choice = choice 16 | 17 | assert choice == 1 18 | 19 | # loaded ADC params 20 | # make it private 21 | self.__ADC_area = 0 22 | self.__ADC_prec = 0 23 | self.__ADC_power = 0 24 | self.__ADC_sample_rate = 0 25 | self.__ADC_type = None 26 | 27 | # obtain ADC param 28 | self._obatin_ADC_param() 29 | self.ADC_freq = self.__ADC_sample_rate # set to sample rate by default 30 | self.ADC_prec = self.__ADC_prec # set to sample rate by default 31 | 32 | def _obatin_ADC_param(self): 33 | if self.ADC_choice is not None: 34 | self.__chosen_ADC_list = ADC_list[self.ADC_choice] 35 | self.__ADC_area = self.__chosen_ADC_list['area'] 36 | self.__ADC_prec = self.__chosen_ADC_list['prec'] 37 | self.__ADC_power = self.__chosen_ADC_list['power'] 38 | self.__ADC_sample_rate = 
self.__chosen_ADC_list['sample_rate'] 39 | self.__ADC_type = self.__chosen_ADC_list['type'] 40 | else: 41 | raise NotImplementedError 42 | 43 | def set_ADC_work_freq(self, work_freq): 44 | if work_freq > self.__ADC_sample_rate: 45 | raise ValueError(f"Got required ADC work frequency {work_freq} exceeds the ADC frequency limit") 46 | self.ADC_freq = work_freq 47 | 48 | def set_ADC_work_prec(self, work_prec): 49 | if work_prec > self.__ADC_prec: 50 | raise ValueError(f"Got required ADC work precision {work_prec} exceeds the ADC precision limit") 51 | self.ADC_prec = work_prec 52 | 53 | def cal_ADC_param(self, print_msg=False): 54 | # convert power to desired freq and bit width 55 | if self.__ADC_type == "sar": 56 | # P \propto N 57 | self.ADC_power = self.__ADC_power * self.ADC_freq / \ 58 | self.__ADC_sample_rate * (self.ADC_prec / self.__ADC_prec) 59 | elif self.__ADC_type == "flash": 60 | # P \propto (2**N - 1) 61 | self.ADC_power = self.__ADC_power * self.ADC_freq / \ 62 | self.__ADC_sample_rate * \ 63 | ((2**self.ADC_prec - 1) / (2**self.__ADC_prec - 1)) 64 | 65 | self.ADC_area = self.__ADC_area 66 | 67 | if print_msg: 68 | logging.info('The %s-bit ADC power @%sGHz is %.2f mW', self.ADC_prec, self.ADC_freq, self.ADC_power) 69 | logging.info('The %s-bit ADC area is %.4f um^2', self.ADC_prec, self.ADC_area) 70 | 71 | 72 | if __name__ == "__main__": 73 | logging.basicConfig() 74 | logging.getLogger().setLevel(logging.INFO) 75 | test = ADC(choice=1) 76 | test.set_ADC_work_freq(4) 77 | test.set_ADC_work_prec(6) 78 | test.cal_ADC_param(print_msg=True) -------------------------------------------------------------------------------- /hardware_simulator/hardware/DAC.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu 3 | # @Date: 1969-12-31 18:00:00 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-10 23:43:37 6 | import logging 7 | 8 | 9 | # area: um^2, prec: bit, power: mw, sample_rate: GSample/s 10 | DAC_list = { 11 | 1: {'area': 11000, 'prec': 8, 'power': 50, 'sample_rate': 14, 'FoM': None, 'type': 'cap'} 12 | } 13 | 14 | class DAC(): 15 | def __init__(self, choice=1) -> None: 16 | self.DAC_choice = choice 17 | assert choice == 1 18 | # loaded DAC params 19 | # make it private 20 | self.__DAC_area = 0 21 | self.__DAC_prec = 0 22 | self.__DAC_power = 0 23 | self.__DAC_sample_rate = 0 24 | self.__DAC_type = None 25 | self.__DAC_FoM = 0 26 | 27 | # obtain DAC param 28 | self._obatin_DAC_param() 29 | self.DAC_freq = self.__DAC_sample_rate # set to sample rate by default 30 | self.DAC_prec = self.__DAC_prec # set to sample rate by default 31 | 32 | def _obatin_DAC_param(self): 33 | if self.DAC_choice is not None: 34 | self.__chosen_DAC_list = DAC_list[self.DAC_choice] 35 | self.__DAC_area = self.__chosen_DAC_list['area'] 36 | self.__DAC_prec = self.__chosen_DAC_list['prec'] 37 | self.__DAC_power = self.__chosen_DAC_list['power'] 38 | self.__DAC_sample_rate = self.__chosen_DAC_list['sample_rate'] 39 | self.__DAC_type = self.__chosen_DAC_list['type'] 40 | self.__DAC_FoM = self.__chosen_DAC_list['FoM'] 41 | else: 42 | raise NotImplementedError 43 | 44 | def set_DAC_work_freq(self, work_freq): 45 | if work_freq > self.__DAC_sample_rate: 46 | raise ValueError(f"Got required DAC work frequency {work_freq} exceeds the DAC frequency limit") 47 | self.DAC_freq = work_freq 48 | 49 | def set_DAC_work_prec(self, work_prec): 50 | if work_prec > self.__DAC_prec: 51 | raise 
ValueError(f"Got required DAC work precision {work_prec} exceeds the DAC precision limit") 52 | self.DAC_prec = work_prec 53 | 54 | def cal_DAC_param(self, print_msg=False): 55 | # convert power to desired freq and bit width 56 | if self.__DAC_FoM is not None: 57 | # following 2 * FoM * nb * Fs / Br (assuming Fs=Br) 58 | self.DAC_power = 2 * self.__DAC_FoM * \ 59 | self.DAC_prec * self.DAC_freq * 1e-3 60 | else: 61 | # P \propto 2**N/(N+1) * f_clk 62 | self.DAC_power = self.__DAC_power * (2**self.DAC_prec / (self.DAC_prec)) / ( 63 | 2**self.__DAC_prec / (self.__DAC_prec)) * self.DAC_freq / self.__DAC_sample_rate 64 | 65 | self.DAC_area = self.__DAC_area 66 | 67 | if print_msg: 68 | logging.info('The %s-bit DAC power @%sGHz is %.2f mW', self.DAC_prec, self.DAC_freq, self.DAC_power) 69 | logging.info('The %s-bit DAC area is %.4f um^2', self.DAC_prec, self.DAC_area) 70 | 71 | 72 | if __name__ == "__main__": 73 | logging.basicConfig() 74 | logging.getLogger().setLevel(logging.INFO) 75 | test = DAC(choice=2) 76 | test.set_DAC_work_freq(4) 77 | test.set_DAC_work_prec(5) 78 | test.cal_DAC_param(print_msg=True) -------------------------------------------------------------------------------- /software_model/losses.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved. 3 | """ 4 | Implements the knowledge distillation loss 5 | """ 6 | import torch 7 | from torch.nn import functional as F 8 | 9 | 10 | class DistillationLoss(torch.nn.Module): 11 | """ 12 | This module wraps a standard criterion and adds an extra knowledge distillation loss by 13 | taking a teacher model prediction and using it as additional supervision. 14 | """ 15 | def __init__(self, base_criterion: torch.nn.Module, teacher_model: torch.nn.Module, 16 | distillation_type: str, alpha: float, tau: float): 17 | super().__init__() 18 | self.base_criterion = base_criterion 19 | self.teacher_model = teacher_model 20 | assert distillation_type in ['none', 'soft', 'hard'] 21 | self.distillation_type = distillation_type 22 | self.alpha = alpha 23 | self.tau = tau 24 | 25 | def forward(self, inputs, outputs, labels): 26 | """ 27 | Args: 28 | inputs: The original inputs that are feed to the teacher model 29 | outputs: the outputs of the model to be trained. 
It is expected to be 30 | either a Tensor, or a Tuple[Tensor, Tensor], with the original output 31 | in the first position and the distillation predictions as the second output 32 | labels: the labels for the base criterion 33 | """ 34 | outputs_kd = None 35 | if not isinstance(outputs, torch.Tensor): 36 | # assume that the model outputs a tuple of [outputs, outputs_kd] 37 | outputs, outputs_kd = outputs 38 | base_loss = self.base_criterion(outputs, labels) 39 | if self.distillation_type == 'none': 40 | return base_loss 41 | 42 | if outputs_kd is None: 43 | raise ValueError("When knowledge distillation is enabled, the model is " 44 | "expected to return a Tuple[Tensor, Tensor] with the output of the " 45 | "class_token and the dist_token") 46 | # don't backprop throught the teacher 47 | with torch.no_grad(): 48 | teacher_outputs = self.teacher_model(inputs) 49 | 50 | if self.distillation_type == 'soft': 51 | T = self.tau 52 | # taken from https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100 53 | # with slight modifications 54 | distillation_loss = F.kl_div( 55 | F.log_softmax(outputs_kd / T, dim=1), 56 | #We provide the teacher's targets in log probability because we use log_target=True 57 | #(as recommended in pytorch https://github.com/pytorch/pytorch/blob/9324181d0ac7b4f7949a574dbc3e8be30abe7041/torch/nn/functional.py#L2719) 58 | #but it is possible to give just the probabilities and set log_target=False. In our experiments we tried both. 59 | F.log_softmax(teacher_outputs / T, dim=1), 60 | reduction='sum', 61 | log_target=True 62 | ) * (T * T) / outputs_kd.numel() 63 | #We divide by outputs_kd.numel() to have the legacy PyTorch behavior. 64 | #But we also experiments output_kd.size(0) 65 | #see issue 61(https://github.com/facebookresearch/deit/issues/61) for more details 66 | elif self.distillation_type == 'hard': 67 | distillation_loss = F.cross_entropy(outputs_kd, teacher_outputs.argmax(dim=1)) 68 | 69 | loss = base_loss * (1 - self.alpha) + distillation_loss * self.alpha 70 | return loss 71 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # ignor pretrained model 2 | software_model/pretrained/ 3 | software_model/results/ 4 | software_model/resumed_ckpt/ 5 | 6 | hardware_simulator/results/ 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | *.py,cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | cover/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | .pybuilder/ 83 | target/ 84 | 85 | # Jupyter Notebook 86 | .ipynb_checkpoints 87 | 88 | # IPython 89 | profile_default/ 90 | ipython_config.py 91 | 92 | # pyenv 93 | # For a library or package, you might want to ignore these files since the code is 94 | # intended to run in multiple environments; otherwise, check them in: 95 | # .python-version 96 | 97 | # pipenv 98 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 99 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 100 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 101 | # install all needed dependencies. 102 | #Pipfile.lock 103 | 104 | # poetry 105 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 106 | # This is especially recommended for binary packages to ensure reproducibility, and is more 107 | # commonly ignored for libraries. 108 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 109 | #poetry.lock 110 | 111 | # pdm 112 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 113 | #pdm.lock 114 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 115 | # in version control. 116 | # https://pdm.fming.dev/#use-with-ide 117 | .pdm.toml 118 | 119 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 120 | __pypackages__/ 121 | 122 | # Celery stuff 123 | celerybeat-schedule 124 | celerybeat.pid 125 | 126 | # SageMath parsed files 127 | *.sage.py 128 | 129 | # Environments 130 | .env 131 | .venv 132 | env/ 133 | venv/ 134 | ENV/ 135 | env.bak/ 136 | venv.bak/ 137 | 138 | # Spyder project settings 139 | .spyderproject 140 | .spyproject 141 | 142 | # Rope project settings 143 | .ropeproject 144 | 145 | # mkdocs documentation 146 | /site 147 | 148 | # mypy 149 | .mypy_cache/ 150 | .dmypy.json 151 | dmypy.json 152 | 153 | # Pyre type checker 154 | .pyre/ 155 | 156 | # pytype static type analyzer 157 | .pytype/ 158 | 159 | # Cython debug symbols 160 | cython_debug/ 161 | 162 | # PyCharm 163 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 164 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 165 | # and can be added to the global gitignore or merged into this file. For a more nuclear 166 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
167 | #.idea/ -------------------------------------------------------------------------------- /software_model/augment.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu(hqzhu@utexas.edu) 3 | # @Date: 1969-12-31 18:00:00 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-07 00:27:04 6 | # Copyright (c) Meta Platforms, Inc. and affiliates. 7 | # All rights reserved. 8 | 9 | """ 10 | 3Augment implementation 11 | Data-augmentation (DA) based on dino DA (https://github.com/facebookresearch/dino) 12 | and timm DA (https://github.com/rwightman/pytorch-image-models) 13 | """ 14 | import torch 15 | from torchvision import transforms 16 | 17 | from timm.data.transforms import RandomResizedCropAndInterpolation, ToNumpy, ToTensor 18 | 19 | import numpy as np 20 | from torchvision import datasets, transforms 21 | import random 22 | 23 | 24 | 25 | from PIL import ImageFilter, ImageOps 26 | import torchvision.transforms.functional as TF 27 | 28 | 29 | class GaussianBlur(object): 30 | """ 31 | Apply Gaussian Blur to the PIL image. 32 | """ 33 | def __init__(self, p=0.1, radius_min=0.1, radius_max=2.): 34 | self.prob = p 35 | self.radius_min = radius_min 36 | self.radius_max = radius_max 37 | 38 | def __call__(self, img): 39 | do_it = random.random() <= self.prob 40 | if not do_it: 41 | return img 42 | 43 | img = img.filter( 44 | ImageFilter.GaussianBlur( 45 | radius=random.uniform(self.radius_min, self.radius_max) 46 | ) 47 | ) 48 | return img 49 | 50 | class Solarization(object): 51 | """ 52 | Apply Solarization to the PIL image. 53 | """ 54 | def __init__(self, p=0.2): 55 | self.p = p 56 | 57 | def __call__(self, img): 58 | if random.random() < self.p: 59 | return ImageOps.solarize(img) 60 | else: 61 | return img 62 | 63 | class gray_scale(object): 64 | """ 65 | Apply grayscale conversion (3-channel) to the PIL image. 66 | """ 67 | def __init__(self, p=0.2): 68 | self.p = p 69 | self.transf = transforms.Grayscale(3) 70 | 71 | def __call__(self, img): 72 | if random.random() < self.p: 73 | return self.transf(img) 74 | else: 75 | return img 76 | 77 | 78 | 79 | class horizontal_flip(object): 80 | """ 81 | Apply a horizontal flip to the PIL image.
82 | """ 83 | def __init__(self, p=0.2,activate_pred=False): 84 | self.p = p 85 | self.transf = transforms.RandomHorizontalFlip(p=1.0) 86 | 87 | def __call__(self, img): 88 | if random.random() < self.p: 89 | return self.transf(img) 90 | else: 91 | return img 92 | 93 | 94 | 95 | def new_data_aug_generator(args = None): 96 | img_size = args.input_size 97 | remove_random_resized_crop = args.src 98 | mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225] 99 | primary_tfl = [] 100 | scale=(0.08, 1.0) 101 | interpolation='bicubic' 102 | if remove_random_resized_crop: 103 | primary_tfl = [ 104 | transforms.Resize(img_size, interpolation=3), 105 | transforms.RandomCrop(img_size, padding=4,padding_mode='reflect'), 106 | transforms.RandomHorizontalFlip() 107 | ] 108 | else: 109 | primary_tfl = [ 110 | RandomResizedCropAndInterpolation( 111 | img_size, scale=scale, interpolation=interpolation), 112 | transforms.RandomHorizontalFlip() 113 | ] 114 | 115 | 116 | secondary_tfl = [transforms.RandomChoice([gray_scale(p=1.0), 117 | Solarization(p=1.0), 118 | GaussianBlur(p=1.0)])] 119 | 120 | if args.color_jitter is not None and not args.color_jitter==0: 121 | secondary_tfl.append(transforms.ColorJitter(args.color_jitter, args.color_jitter, args.color_jitter)) 122 | final_tfl = [ 123 | transforms.ToTensor(), 124 | transforms.Normalize( 125 | mean=torch.tensor(mean), 126 | std=torch.tensor(std)) 127 | ] 128 | return transforms.Compose(primary_tfl+secondary_tfl+final_tfl) 129 | -------------------------------------------------------------------------------- /software_model/engine.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu 3 | # @Date: 2022-11-30 21:32:37 4 | # @Last Modified by: Hanqing Zhu 5 | # @Last Modified time: 2023-03-29 01:56:00 6 | # Copyright (c) 2015-present, Facebook, Inc. 7 | # All rights reserved. 
8 | """ 9 | Train and eval functions used in main.py 10 | """ 11 | import math 12 | import sys 13 | from typing import Iterable, Optional 14 | 15 | import torch 16 | 17 | from timm.data import Mixup 18 | from timm.utils import accuracy, ModelEma 19 | 20 | from losses import DistillationLoss 21 | import utils 22 | 23 | 24 | def train_one_epoch(model: torch.nn.Module, criterion: DistillationLoss, 25 | data_loader: Iterable, optimizer: torch.optim.Optimizer, 26 | device: torch.device, epoch: int, loss_scaler, max_norm: float = 0, 27 | model_ema: Optional[ModelEma] = None, mixup_fn: Optional[Mixup] = None, 28 | set_training_mode=True, args = None): 29 | model.train(set_training_mode) 30 | metric_logger = utils.MetricLogger(delimiter=" ") 31 | metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}')) 32 | header = 'Epoch: [{}]'.format(epoch) 33 | print_freq = 10 34 | 35 | for samples, targets in metric_logger.log_every(data_loader, print_freq, header): 36 | samples = samples.to(device, non_blocking=True) 37 | targets = targets.to(device, non_blocking=True) 38 | 39 | if mixup_fn is not None: 40 | samples, targets = mixup_fn(samples, targets) 41 | 42 | if args.bce_loss: 43 | targets = targets.gt(0.0).type(targets.dtype) 44 | 45 | with torch.cuda.amp.autocast(): 46 | outputs = model(samples) 47 | loss = criterion(samples, outputs, targets) 48 | 49 | loss_value = loss.item() 50 | 51 | if not math.isfinite(loss_value): 52 | print("Loss is {}, stopping training".format(loss_value)) 53 | sys.exit(1) 54 | 55 | optimizer.zero_grad() 56 | 57 | # this attribute is added by timm on one optimizer (adahessian) 58 | is_second_order = hasattr(optimizer, 'is_second_order') and optimizer.is_second_order 59 | loss_scaler(loss, optimizer, clip_grad=max_norm, 60 | parameters=model.parameters(), create_graph=is_second_order) 61 | 62 | torch.cuda.synchronize() 63 | if model_ema is not None: 64 | model_ema.update(model) 65 | 66 | metric_logger.update(loss=loss_value) 67 | metric_logger.update(lr=optimizer.param_groups[0]["lr"]) 68 | # gather the stats from all processes 69 | metric_logger.synchronize_between_processes() 70 | print("Averaged stats:", metric_logger) 71 | return {k: meter.global_avg for k, meter in metric_logger.meters.items()} 72 | 73 | 74 | @torch.no_grad() 75 | def evaluate(data_loader, model, device): 76 | criterion = torch.nn.CrossEntropyLoss() 77 | 78 | metric_logger = utils.MetricLogger(delimiter=" ") 79 | header = 'Test:' 80 | 81 | # switch to evaluation mode 82 | model.eval() 83 | 84 | num_images = 0 85 | for images, target in metric_logger.log_every(data_loader, 10, header): 86 | images = images.to(device, non_blocking=True) 87 | target = target.to(device, non_blocking=True) 88 | num_images += images.shape[0] 89 | 90 | # compute output 91 | with torch.cuda.amp.autocast(): 92 | output = model(images) 93 | loss = criterion(output, target) 94 | 95 | acc1, acc5 = accuracy(output, target, topk=(1, 5)) 96 | 97 | batch_size = images.shape[0] 98 | metric_logger.update(loss=loss.item()) 99 | metric_logger.meters['acc1'].update(acc1.item(), n=batch_size) 100 | metric_logger.meters['acc5'].update(acc5.item(), n=batch_size) 101 | 102 | # gather the stats from all processes 103 | metric_logger.synchronize_between_processes() 104 | print('* Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f} loss {losses.global_avg:.3f}' 105 | .format(top1=metric_logger.acc1, top5=metric_logger.acc5, losses=metric_logger.loss)) 106 | 107 | return {k: meter.global_avg for k, meter in 
metric_logger.meters.items()} 108 | -------------------------------------------------------------------------------- /software_model/datasets.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved. 3 | import os 4 | import json 5 | 6 | from torchvision import datasets, transforms 7 | from torchvision.datasets.folder import ImageFolder, default_loader 8 | 9 | from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD 10 | from timm.data import create_transform 11 | 12 | 13 | class INatDataset(ImageFolder): 14 | def __init__(self, root, train=True, year=2018, transform=None, target_transform=None, 15 | category='name', loader=default_loader): 16 | self.transform = transform 17 | self.loader = loader 18 | self.target_transform = target_transform 19 | self.year = year 20 | # assert category in ['kingdom','phylum','class','order','supercategory','family','genus','name'] 21 | path_json = os.path.join(root, f'{"train" if train else "val"}{year}.json') 22 | with open(path_json) as json_file: 23 | data = json.load(json_file) 24 | 25 | with open(os.path.join(root, 'categories.json')) as json_file: 26 | data_catg = json.load(json_file) 27 | 28 | path_json_for_targeter = os.path.join(root, f"train{year}.json") 29 | 30 | with open(path_json_for_targeter) as json_file: 31 | data_for_targeter = json.load(json_file) 32 | 33 | targeter = {} 34 | indexer = 0 35 | for elem in data_for_targeter['annotations']: 36 | king = [] 37 | king.append(data_catg[int(elem['category_id'])][category]) 38 | if king[0] not in targeter.keys(): 39 | targeter[king[0]] = indexer 40 | indexer += 1 41 | self.nb_classes = len(targeter) 42 | 43 | self.samples = [] 44 | for elem in data['images']: 45 | cut = elem['file_name'].split('/') 46 | target_current = int(cut[2]) 47 | path_current = os.path.join(root, cut[0], cut[2], cut[3]) 48 | 49 | categors = data_catg[target_current] 50 | target_current_true = targeter[categors[category]] 51 | self.samples.append((path_current, target_current_true)) 52 | 53 | # __getitem__ and __len__ inherited from ImageFolder 54 | 55 | 56 | def build_dataset(is_train, args): 57 | transform = build_transform(is_train, args) 58 | 59 | if args.data_set == 'CIFAR': 60 | dataset = datasets.CIFAR100(args.data_path, train=is_train, transform=transform) 61 | nb_classes = 100 62 | elif args.data_set == 'IMNET': 63 | root = os.path.join(args.data_path, 'train' if is_train else 'val') 64 | dataset = datasets.ImageFolder(root, transform=transform) 65 | nb_classes = 1000 66 | elif args.data_set == 'INAT': 67 | dataset = INatDataset(args.data_path, train=is_train, year=2018, 68 | category=args.inat_category, transform=transform) 69 | nb_classes = dataset.nb_classes 70 | elif args.data_set == 'INAT19': 71 | dataset = INatDataset(args.data_path, train=is_train, year=2019, 72 | category=args.inat_category, transform=transform) 73 | nb_classes = dataset.nb_classes 74 | 75 | return dataset, nb_classes 76 | 77 | 78 | def build_transform(is_train, args): 79 | resize_im = args.input_size > 32 80 | if is_train: 81 | # this should always dispatch to transforms_imagenet_train 82 | transform = create_transform( 83 | input_size=args.input_size, 84 | is_training=True, 85 | color_jitter=args.color_jitter, 86 | auto_augment=args.aa, 87 | interpolation=args.train_interpolation, 88 | re_prob=args.reprob, 89 | re_mode=args.remode, 90 | re_count=args.recount, 91 | ) 92 | if not resize_im: 93 | # replace 
RandomResizedCropAndInterpolation with 94 | # RandomCrop 95 | transform.transforms[0] = transforms.RandomCrop( 96 | args.input_size, padding=4) 97 | return transform 98 | 99 | t = [] 100 | if resize_im: 101 | size = int(args.input_size / args.eval_crop_ratio) 102 | t.append( 103 | transforms.Resize(size, interpolation=3), # to maintain same ratio w.r.t. 224 images 104 | ) 105 | t.append(transforms.CenterCrop(args.input_size)) 106 | 107 | t.append(transforms.ToTensor()) 108 | t.append(transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD)) 109 | return transforms.Compose(t) 110 | -------------------------------------------------------------------------------- /profile/README.md: -------------------------------------------------------------------------------- 1 | # Profiling Workloads on GPU 2 | 3 | We provide scripts for profiling workloads on a GPU. The reported results are measured on a single A100 GPU with automatic mixed precision. 4 | 5 | 1. `vit_infer.py`, `bert_infer.py` for launching inference on a single device. 6 | 2. `model.py` provides a simplified DeiT and BERT implementation based on [a](https://github.com/zixuanjiang/pre-rmsnorm-transformer) and [b](https://github.com/lucidrains/vit-pytorch). 7 | 8 | 9 | We provide the benchmark logs of our tested results in `benchmark_logs/`. 10 | 11 | --- 12 | ## Latency Measurement 13 | 14 | We use `torch.utils.benchmark` to measure the latency of DeiT models and of BERT models on SST tasks (BERT for sequence classification). 15 | 16 | We set the minimum number of runs to 100 for each measurement. 17 | 18 | ### How to use 19 | 20 | #### DeiT 21 | Launch `python vit_infer.py -m model_name` to obtain the latency for DeiT. 22 | * `-m`: The flag for different models. Set it to `deit-t`, `deit-s`, or `deit-b` to test the latency of DeiT-Tiny, DeiT-Small, or DeiT-Base. 23 | 24 | #### BERT 25 | Launch `python bert_infer.py -m model_name -s seq_length` to obtain the latency for BERT for sequence classification. 26 | * `-m`: The flag for different models. Set it to `bert-b` or `bert-l` to test the latency of BERT-Base or BERT-Large. 27 | * `-s`: The flag for the sequence length. You can try 128, 256, 320, or 384 for BERT. 28 | 29 | ### Expected results 30 | If we set the model to DeiT-B and run `python vit_infer.py -m deit-b`, the reported results should look like 31 | 32 | ``` 33 | y = model(x): batch_size 1 method pre-ln 34 | Base 35 | setup: from __main__ import model 36 | Median: 4.37 ms 37 | IQR: 0.06 ms (4.36 to 4.42) 38 | 226 measurements, 100 runs per measurement, 1 thread 39 | [------------------- ------------------] 40 | | Base 41 | 1 threads: ------------------------------ 42 | batch_size 1 method pre-ln | 4.4 43 | 44 | Times are in milliseconds (ms). 45 | ``` 46 | 47 | --- 48 | 49 | ## Power Tracing 50 | 51 | We use nvidia-smi to monitor the power usage when running the workloads on the GPU. 52 | ``` 53 | nvidia-smi dmon -s puc -d 1 -i 0 > ./power_results/power_usage.csv 54 | ``` 55 | * `-s puc`: The `-s` flag specifies which metric groups to monitor. Here it is set to `puc`, which selects the power/temperature, utilization, and clock readings of the GPU. 56 | * `-d 1`: The `-d` flag specifies the update interval in seconds. Here, it is set to 1 second, which means that the power usage is sampled and recorded every second. 57 | * `-i 0`: The `-i` flag specifies the GPU index to monitor. In this case, it is set to 0, indicating that the monitoring should be done on GPU index 0. You can change this number to monitor a different GPU if you run jobs on different GPUs in your system. By default, we use the GPU with index 0. 58 | * `> ./power_results/power_usage.csv`: Save the monitored power usage into `power_usage.csv`. 59 | 60 | ### How to use 61 | 62 | Launch `power_monitor.sh` to monitor the power usage. You can save the power usage to a CSV file for further processing (a small parsing sketch is given below). 63 | 64 | Please run `power_monitor.sh` before launching the inference scripts `vit_infer.py` and `bert_infer.py`. 65 | 66 | ### Expected results 67 | 68 | Take DeiT-T as an example (see `./benchmark_logs/deit-s-power.csv`). 69 | The monitored power usage shows you the idle power (61 W in our case) and the work power (72 W). 70 | The power during inference is then 72-61=11 W. 71 |
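For convenience, the recorded trace can be post-processed with a few lines of Python. The sketch below is only an illustration and is not part of this repo: the script name is made up, the assumption that the `pwr` column is the second field follows the usual `nvidia-smi dmon` layout, and using the min/max samples as the idle/work power is a simplification of the workflow described below. It reports the idle power, the work power, and the energy of a single inference as `(P_work - P_idle) * latency`.

```python
# parse_power.py -- illustrative sketch only (not shipped with this repo)
# Assumes `nvidia-smi dmon -s puc` output: header lines start with '#',
# data lines are "<gpu idx> <pwr (W)> ..." with power in the second column.
import sys

def load_power_samples(csv_path):
    samples = []
    with open(csv_path) as f:
        for line in f:
            fields = line.split()
            if not fields or fields[0].startswith("#"):
                continue  # skip dmon header lines
            try:
                samples.append(float(fields[1]))  # pwr column, in watts
            except ValueError:
                continue  # skip lines without a numeric power reading
    return samples

if __name__ == "__main__":
    csv_path = sys.argv[1] if len(sys.argv) > 1 else "./power_results/power_usage.csv"
    latency_ms = float(sys.argv[2]) if len(sys.argv) > 2 else 4.37  # median latency reported by vit_infer.py / bert_infer.py
    samples = load_power_samples(csv_path)
    idle_w, work_w = min(samples), max(samples)  # idle vs. work power (simplification)
    print(f"idle {idle_w:.0f} W, work {work_w:.0f} W, inference power {work_w - idle_w:.0f} W")
    print(f"energy per inference ~ {(work_w - idle_w) * latency_ms:.2f} mJ")  # W * ms = mJ
```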
72 | ## Energy estimation 73 | 74 | Multiplying this power by the measured latency of a single inference gives the energy cost of a single inference. 75 | 76 | For example, the DeiT-Base model has a power of 26 W and a latency of 4.37 ms, so the energy cost is 113.62 mJ. 77 | 78 | --- 79 | 80 | ## AE workflow 81 | 82 | Follow these three steps to obtain both the latency and the power usage. 83 | Open two terminals on the same machine. 84 | 85 | * First, run `./power_monitor.sh > ./power_results/power_usage.csv` to monitor the power usage of GPU 0; the redirection saves the monitored power usage into `power_usage.csv`. 86 | * Then launch the latency measurement script, `python vit_infer.py -m model_name` or `python bert_infer.py -m model_name -s seq_length`, in *another terminal*. 87 | * Kill the power monitor script when the latency measurement finishes. 88 | 89 | Obtain the GPU power attributable to the workload by subtracting the idle power from the measured power. 90 | 91 | For example, the DeiT-Base model has a power of (87-61=26 W) and a latency of 4.37 ms, so the energy cost is 113.62 mJ. 92 | 93 | --- 94 | 95 | We refer to the following implementations: 96 | 1. [A simplified ViT implementation in PyTorch](https://github.com/lucidrains/vit-pytorch) 97 | 2. [BERT implementation from Nvidia](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/BERT) 98 | 3.
[Measurement codes from pre-rmsnorm-transformer](https://github.com/zixuanjiang/pre-rmsnorm-transformer) 99 | -------------------------------------------------------------------------------- /hardware_simulator/utils/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu 3 | # @Date: 2023-02-23 22:45:26 4 | # @Last Modified by: Hanqing Zhu 5 | # @Last Modified time: 2023-02-23 22:45:32 6 | """ 7 | Description: Modified based on torchpack 0.3.0 8 | Author: Jiaqi Gu (jqgu@utexas.edu) 9 | Date: 2021-06-06 01:46:57 10 | LastEditors: Jiaqi Gu (jqgu@utexas.edu) 11 | LastEditTime: 2021-06-06 01:46:57 12 | """ 13 | import hashlib 14 | import json 15 | import yaml 16 | import os 17 | from ast import literal_eval 18 | from typing import Any, Dict, List, Tuple, Union 19 | 20 | from multimethod import multimethod 21 | 22 | 23 | __all__ = [ 24 | "Config", 25 | "configs", 26 | ] 27 | 28 | 29 | class Config(dict): 30 | def __getattr__(self, key: str) -> Any: 31 | if key not in self: 32 | d = self 33 | ## try hierarchical access 34 | keys = key.split(".") 35 | for k in keys: 36 | if k not in d: 37 | raise AttributeError(key) 38 | d = d[k] 39 | return d 40 | else: 41 | return self[key] 42 | 43 | def __setattr__(self, key: str, value: Any) -> None: 44 | self[key] = value 45 | 46 | def __delattr__(self, key: str) -> None: 47 | del self[key] 48 | 49 | def load(self, fpath: str, *, recursive: bool = False) -> None: 50 | if not os.path.exists(fpath): 51 | raise FileNotFoundError(fpath) 52 | fpaths = [fpath] 53 | if recursive: 54 | while fpath: 55 | fpath = os.path.dirname(fpath) 56 | for fname in ["default.yaml", "default.yml"]: 57 | fpaths.append(os.path.join(fpath, fname)) 58 | for fpath in reversed(fpaths): 59 | if os.path.exists(fpath): 60 | with open(fpath, "r") as f: 61 | cfg_dict = yaml.safe_load(f) 62 | self.update(cfg_dict) 63 | 64 | def reload(self, fpath: str, *, recursive: bool = False) -> None: 65 | self.clear() 66 | self.load(fpath, recursive=recursive) 67 | 68 | @multimethod 69 | def update(self, other: Dict) -> None: 70 | for key, value in other.items(): 71 | if isinstance(value, dict): 72 | if key not in self or not isinstance(self[key], Config): 73 | self[key] = Config() 74 | self[key].update(value) 75 | else: 76 | self[key] = value 77 | 78 | @multimethod 79 | def update(self, opts: Union[List, Tuple]) -> None: 80 | index = 0 81 | while index < len(opts): 82 | opt = opts[index] 83 | if opt.startswith("--"): 84 | opt = opt[2:] 85 | if "=" in opt: 86 | key, value = opt.split("=", 1) 87 | index += 1 88 | else: 89 | key, value = opt, opts[index + 1] 90 | index += 2 91 | current = self 92 | subkeys = key.split(".") 93 | try: 94 | value = literal_eval(value) 95 | except: 96 | pass 97 | for subkey in subkeys[:-1]: 98 | current = current.setdefault(subkey, Config()) 99 | current[subkeys[-1]] = value 100 | 101 | def dict(self) -> Dict[str, Any]: 102 | configs = dict() 103 | for key, value in self.items(): 104 | if isinstance(value, Config): 105 | value = value.dict() 106 | configs[key] = value 107 | return configs 108 | 109 | def flat_dict(self) -> Dict[str, Any]: 110 | def _flatten_dict(dd, separator: str = "_", prefix: str = ""): 111 | return ( 112 | { 113 | prefix + separator + k if prefix else k: v 114 | for kk, vv in dd.items() 115 | for k, v in _flatten_dict(vv, separator, kk).items() 116 | } 117 | if isinstance(dd, dict) 118 | else {prefix: dd} 119 | ) 120 | 121 | return _flatten_dict(self.dict(), 
separator=".") 122 | 123 | def hash(self) -> str: 124 | buffer = json.dumps(self.dict(), sort_keys=True) 125 | return hashlib.sha256(buffer.encode()).hexdigest() 126 | 127 | def dump_to_yml(self, path: str) -> None: 128 | with open(path, "w") as f: 129 | yaml.safe_dump(self.dict(), f) 130 | 131 | def __str__(self) -> str: 132 | texts = [] 133 | for key, value in self.items(): 134 | if isinstance(value, Config): 135 | seperator = "\n" 136 | else: 137 | seperator = " " 138 | text = key + ":" + seperator + str(value) 139 | lines = text.split("\n") 140 | for k, line in enumerate(lines[1:]): 141 | lines[k + 1] = (" " * 2) + line 142 | texts.extend(lines) 143 | return "\n".join(texts) 144 | 145 | 146 | configs = Config() -------------------------------------------------------------------------------- /hardware_simulator/entry_energy_latency_workload.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu(hqzhu@utexas.edu) 3 | # @Date: 1969-12-31 18:00:00 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-12 21:02:24 6 | 7 | import os 8 | import csv 9 | import argparse 10 | from utils.general import ensure_dir 11 | from utils.config import configs 12 | from utils.model import modelParams 13 | 14 | from simulator_FFN import FFNPrediction 15 | from simulator_attn import attnPrediction 16 | 17 | def main(configs, model_name='deit-s', exp_name='compare_onn', optimize_flag='arch_opt', tokens=197, print_msg=False): 18 | # extraxt model workload charaterstics 19 | model_zoo = modelParams() 20 | ops_list = model_zoo.obtain_ops_list(model_name=model_name, tokens=tokens) 21 | 22 | if model_name == 'bert-l': 23 | factor = 2 24 | else: 25 | factor = 1 26 | 27 | sv_path = f"./results/{exp_name}/{model_name}_{tokens}_{configs.core.precision.in_bit}bit/{configs.core.type}_{optimize_flag}_{configs.arch.num_tiles}t_{configs.arch.num_pe_per_tile}c/" 28 | sv_sub_path = f"./results/{exp_name}/{model_name}_{tokens}_{configs.core.precision.in_bit}bit/{configs.core.type}_{optimize_flag}_{configs.arch.num_tiles}t_{configs.arch.num_pe_per_tile}c/modules/" 29 | 30 | ensure_dir(sv_path) 31 | ensure_dir(sv_sub_path) 32 | 33 | energy_sum = 0 34 | latency_sum = 0 35 | saved_arrays = [] 36 | for item in ops_list: 37 | idx = item["idx"] 38 | name = item["name"] 39 | type = item["type"] 40 | if type == "fc": 41 | predictor = FFNPrediction(item, configs) 42 | predictor.run(print_msg=print_msg) 43 | predictor.save(sv_name=name, sv_path=sv_sub_path) 44 | energy_cost = predictor.energy_dict['linear']['comp']['total'][0] + \ 45 | predictor.energy_dict['linear']['datamovement']['total'][0] 46 | 47 | latency_cost = predictor.latency_dict['linear']['total'][1] 48 | if not 'head' in name and not 'embed' in name: 49 | energy_cost *= 12 * factor 50 | latency_cost *= 12 * factor 51 | saved_arrays.append([name, energy_cost, latency_cost]) 52 | elif type == 'attn': 53 | if configs.core.type != 'mzi': 54 | predictor = attnPrediction(item, configs) 55 | predictor.run(print_msg=print_msg) 56 | predictor.save(sv_name=name, sv_path=sv_sub_path) 57 | energy_cost = predictor.energy_dict['Q*K^T']['comp']['total'][0] + predictor.energy_dict['Q*K^T']['datamovement']['total'][0] + \ 58 | predictor.energy_dict['S*V']['comp']['total'][0] + predictor.energy_dict['S*V']['datamovement']['total'][0] 59 | # print(predictor.energy_dict['linear']['comp']) 60 | latency_cost = predictor.latency_dict['Q*K^T']['total'][1] + 
predictor.latency_dict['S*V']['total'][1] 61 | energy_cost *= 12 * factor 62 | latency_cost *= 12 * factor 63 | saved_arrays.append([name, energy_cost, latency_cost]) 64 | else: 65 | raise NotImplementedError 66 | energy_sum += energy_cost 67 | latency_sum += latency_cost 68 | 69 | energy_others, latency_others = model_zoo.obtain_other_costs(model_name=model_name, tokens=tokens) 70 | saved_arrays.append(["others", energy_others, latency_others]) 71 | energy_sum += energy_others 72 | latency_sum += latency_others 73 | 74 | def __save_csv(sv_name, total, arrays): 75 | with open(sv_name, 'w') as csvfile: 76 | writer = csv.writer(csvfile) 77 | writer.writerow(['', 'energy (mJ)', 'latency (ms)']) 78 | writer.writerow(total) 79 | for each in arrays: 80 | writer.writerow(each) 81 | __save_csv(os.path.join(sv_path, 'total.csv'), [ 82 | 'total', energy_sum, latency_sum], saved_arrays) 83 | 84 | 85 | if __name__ == "__main__": 86 | parser = argparse.ArgumentParser() 87 | parser.add_argument("-c", "--config", default=".params.yaml", 88 | metavar="FILE", help="config file") 89 | parser.add_argument("-m", "--model_name", default="deit-s", 90 | help="model") 91 | parser.add_argument("-t", "--tokens", default=197, 92 | help="tokens or sequence length") 93 | parser.add_argument("-o", "--optimize_flag", default="arch_opt", 94 | help="optimize flag for DOTA") 95 | parser.add_argument("-e", "--exp_name", default="compare_onn", 96 | help="experiments name") 97 | args, opts = parser.parse_known_args() 98 | configs.load(args.config, recursive=True) 99 | configs.update(opts) 100 | 101 | if configs.core.type == "dota": 102 | # three different optimize flag 103 | # broadcast 104 | # crossbar 105 | # arch-opt 106 | assert args.optimize_flag in ["broadcast", "crossbar", "arch_opt"] 107 | configs.arch.disable_crossbar_topology = 1 if args.optimize_flag == "broadcast" else 0 108 | if args.optimize_flag == "arch_opt": 109 | configs.arch.adc_share_flag = 1 110 | configs.arch.time_accum_factor = 3 111 | configs.arch.input_mod_sharing_flag = 1 112 | else: 113 | configs.arch.adc_share_flag = 0 114 | configs.arch.time_accum_factor = 1 115 | configs.arch.input_mod_sharing_flag = 0 116 | elif configs.core.type == 'mrrbank' or configs.core.type == 'mzi': 117 | configs.arch.weight_reuse_factor = -1 # fully weight-stationary flow 118 | args.optimize_flag = 'broadcast' 119 | else: 120 | raise ValueError(f"Got unsupportted core type {configs.core.type}") 121 | print(f"Report energy and latency estimation for {args.model_name}_{args.tokens}_{configs.core.precision.in_bit}bit on {configs.core.type}_{args.optimize_flag}_{configs.arch.num_tiles}t_{configs.arch.num_pe_per_tile}c") 122 | 123 | main(configs=configs, model_name=args.model_name, exp_name=args.exp_name, optimize_flag=args.optimize_flag, tokens=int(args.tokens)) 124 | 125 | sv_path = f"./results/{args.exp_name}/{args.model_name}_{args.tokens}_{configs.core.precision.in_bit}bit/{configs.core.type}_{args.optimize_flag}_{configs.arch.num_tiles}t_{configs.arch.num_pe_per_tile}c" 126 | 127 | print(f'Finish and save report to {sv_path}') 128 | print('-'*20) -------------------------------------------------------------------------------- /hardware_simulator/readme.md: -------------------------------------------------------------------------------- 1 | # Hardware simulator for our photonic Transformer accelerator 2 | 3 | This contains the hardware simulator for our photonic Transformer accelerator, DOTA, in our lightning-transformer work. 
4 | Our simulator is based on behavior-level simulation. 5 | 6 | We support simulating our DOTA-B/L variants in 4-bit/8-bit work modes, and we also support simulating the photonic baselines, MRR bank and MZI. 7 | 8 | --- 9 | 10 | ## Code structures 11 | 12 | * `./hardware/`. This directory contains the modeling of the photonic tensor cores, including our dynamically-operated crossbar-style PTC and two baselines: MRR bank and MZI. 13 | * `./params/`. 14 | * `./params/device_params/` This directory contains the detailed accelerator params as well as all the device parameters. 15 | * DOTA-B: A 4-tile variant of our DOTA photonic Transformer accelerator. 16 | * DOTA-L: An 8-tile variant of our DOTA photonic Transformer accelerator. 17 | * MZI: A 2-tile variant of the MZI mesh. 18 | * MRR bank: A 7-tile variant of the MRR bank. 19 | * *NOTE: we keep DOTA-B, MZI, and MRR bank under the same area budget for a fair comparison.* 20 | 21 | * `entry_area_power_profile.py`. The Python file you can launch to profile the area and power of the accelerator. 22 | * `entry_energy_latency_workload.py`. The Python file you can launch to profile the energy and latency when running one workload on a given accelerator. 23 | * `/results/`. The generated results will be dumped into this directory. 24 | * `/utils/`. Utility functions. 25 | 26 | ## AE exp1: Simulate the area and power of our photonic accelerator. 27 | 28 | ### Single run via `entry_area_power_profile.py` 29 | 30 | To simulate the area and power, run 31 | ``` 32 | exp='area_power_profile_single' # exp name you give 33 | config='./params/device_params/Dota_B_4bit.yaml' # the param file of the given photonic accelerator 34 | 35 | python entry_area_power_profile.py \ 36 | -e ${exp} \ 37 | --config ${config} 38 | ``` 39 | 40 | It will generate the area and power report under `./results/exp_name_you_give/accelerator_name/`. It contains two CSV files for the area and power estimation. 41 | 42 | For example, if you run 43 | ``` 44 | python entry_area_power_profile.py -e area_power_profile_single --config ./params/device_params/Dota_B_4bit.yaml 45 | ``` 46 | You will get the area and power report under `./results/area_power_profile_single/dota_4t_2c_4bit/`. 47 | The area report should look like 48 | 49 | |dota |area (mm^2) |percentage (%)| 50 | |-------------|--------------------|--------------| 51 | |total |60.329395086 |1 | 52 | |laser |0.72 |1.19 | 53 | |DAC |15.84 |26.26 | 54 | |MZM |7.59416832 |12.59 | 55 | |ADC |1.6416 |2.72 | 56 | |TIA |0.0576 |0.1 | 57 | |photonic_core|11.318291999999998 |18.76 | 58 | |adder |0.051199999999999996|0.08 | 59 | |mem |14.695398766000002 |24.36 | 60 | |micro_comb |8.411135999999999 |13.94 | 61 | 62 | 63 | *Note that we only provide the area report (no power report) for the MZI and MRR baselines.* 64 |
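The `percentage (%)` column is each component's share of the total accelerator area, with the `total` row as the reference. For example, the DAC row above corresponds to 15.84 mm^2 / 60.33 mm^2 ≈ 26.26 %, and the laser row to 0.72 mm^2 / 60.33 mm^2 ≈ 1.19 %; the per-component percentages sum to 100 %.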
65 | ### Batch run via `./scripts/area_power_all.sh` 66 | 67 | We provide one script to run the area and power estimation for all photonic accelerator variants: DOTA-B-4/8bit, DOTA-L-4/8bit, MRR-4/8bit, and MZI-4/8bit. It generates results under `./results/area_power_all/`. 68 | 69 | 70 | ## AE exp2: Simulate the energy and latency when running a workload on the photonic system. 71 | 72 | ### Single run via `entry_energy_latency_workload.py` 73 | 74 | To simulate the energy and latency for a given Transformer workload (DeiT-T/S/B, BERT-B/L in our work), run 75 | ``` 76 | exp='energy_latency_single_workload' # exp name 77 | model_name='deit-t' # model name 78 | tokens=197 # number of tokens, 197 for deit; you can define the number of tokens for bert 79 | onn_params='./params/device_params/Dota_B_4bit.yaml' 80 | # choose the onn accelerator params from 81 | # config_dict=( 82 | # ['dota_b_4bit']='./params/device_params/Dota_B_4bit.yaml' 83 | # ['dota_b_8bit']='./params/device_params/Dota_B_8bit.yaml' 84 | # ['dota_l_4bit']='./params/device_params/Dota_L_4bit.yaml' 85 | # ['dota_l_8bit']='./params/device_params/Dota_L_8bit.yaml' 86 | # ['mrr_4bit']='./params/device_params/Bs_mrr_bank_4bit.yaml' 87 | # ['mrr_8bit']='./params/device_params/Bs_mrr_bank_8bit.yaml' 88 | # ['mzi_4bit']='./params/device_params/Bs_mzi_4bit.yaml' 89 | # ['mzi_8bit']='./params/device_params/Bs_mzi_8bit.yaml' 90 | # ) 91 | 92 | python entry_energy_latency_workload.py \ 93 | -e ${exp} \ 94 | --tokens ${tokens} \ 95 | --model_name ${model_name} \ 96 | --config ${onn_params} 97 | ``` 98 | 99 | It will generate the energy and latency report under `./results/exp_name_you_give/accelerator_name/`. 100 | 101 | This directory contains a `total.csv` with the energy and latency estimation, which also has a breakdown over the different layer types, e.g., attention/FFN/QKV/head. 102 | 103 | We also provide a more detailed energy breakdown for the different layer types under `modules/` in this directory. 104 | It provides the energy breakdown across the different components (e.g., laser, DAC, ADC, data movement, etc.). 105 | 106 | You can change the arguments for `model_name` and the corresponding tokens. We support the following arguments for `model_name`: 107 | * deit-t 108 | * deit-s 109 | * deit-b 110 | * bert-b 111 | * bert-l 112 | 113 | The correct token number for DeiT on the ImageNet dataset is 197. For BERT, you can vary the number of tokens. 114 | 115 | You can also enable/disable the architecture-level optimizations for our DOTA by setting the argument 116 | ``` 117 | --optimize_flag arch_opt # set to crossbar to disable arch optimization 118 | ``` 119 | 120 | ### Batch run via `./scripts/energy_latency_all.sh` 121 | 122 | We provide one script to run the estimation for all workloads we used in our paper: 123 | * deit-t with tokens=197 124 | * deit-s with tokens=197 125 | * deit-b with tokens=197 126 | * bert-b with tokens=384 127 | * bert-l with tokens=320 128 | for the photonic accelerator variants DOTA-B-4/8bit, DOTA-L-4/8bit, MRR-4/8bit, and MZI-4/8bit. It generates results under `./results/energy_latency_all/` (a post-processing sketch is given at the end of this readme). 129 | 130 | *Note that we only provide reports on linear layers for MZI, since it cannot support attention efficiently due to the on-the-fly activation decomposition, which is extremely expensive.* 131 |
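As a convenience for post-processing a batch run, the per-workload `total.csv` files can be collected with a few lines of Python. The snippet below is only an illustrative sketch and is not shipped with the repo; it assumes the directory layout produced by `entry_energy_latency_workload.py` (exp name `energy_latency_all`, one sub-directory per model and per accelerator config) and the `total.csv` format written by that script (a header row, then the `total` row, then one row per layer type).

```python
# aggregate_results.py -- illustrative sketch only, not part of the repo
import csv
import glob
import os

# Each total.csv looks like:
#   ['', 'energy (mJ)', 'latency (ms)']
#   ['total', <energy>, <latency>]
#   ['qkv'/'attn'/'FFN1'/..., <energy>, <latency>]
for path in sorted(glob.glob("./results/energy_latency_all/*/*/total.csv")):
    with open(path) as f:
        rows = list(csv.reader(f))
    total_row = rows[1]  # the 'total' row written right after the header
    workload = os.path.relpath(os.path.dirname(path), "./results/energy_latency_all")
    print(f"{workload}: {float(total_row[1]):.4f} mJ, {float(total_row[2]):.6f} ms")
```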
-------------------------------------------------------------------------------- /hardware_simulator/hardware/photonic_core_base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu(hqzhu@utexas.edu) 3 | # @Date: 1969-12-31 18:00:00 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-08 17:23:00 6 | # Basic class for photonic cores 7 | from .ADC import ADC 8 | from .DAC import DAC 9 | 10 | __all__ = ["PhotonicCore"] 11 | 12 | class PhotonicCore(): 13 | def __init__(self, *args, **kwargs): 14 | super().__init__(*args, **kwargs) 15 | self.photonic_core_type = None 16 | self.width = None 17 | self.height = None 18 | 19 | ## obtain params for photonic devices 20 | def _obtain_laser_param(self, config=None): 21 | if config is not None: 22 | self.laser_power = config.power 23 | self.laser_length = config.length 24 | self.laser_width = config.width 25 | self.laser_area = self.laser_length * self.laser_width 26 | self.laser_wall_plug_eff = config.wall_plug_eff 27 | else: 28 | self.laser_power = 0.5 29 | self.laser_length = 400 30 | self.laser_width = 300 31 | self.laser_area = self.laser_length * self.laser_width 32 | self.laser_wall_plug_eff = 0.25 33 | 34 | def _obtain_micro_comb_param(self, config=None): 35 | if config is not None: 36 | self.micro_comb_length = config.length 37 | self.micro_comb_width = config.width 38 | else: 39 | self.micro_comb_length = 1184 40 | self.micro_comb_width = 1184 41 | self.micro_comb_area = self.micro_comb_length * self.micro_comb_width 42 | 43 | # modulator 44 | def _obtain_modulator_param(self, config=None): 45 | if config is not None: 46 | self.modulator_type = config.type 47 | self.modulator_energy_per_bit = config.energy_per_bit 48 | self.modulator_power_static = config.static_power 49 | self.modulator_length = config.length 50 | self.modulator_width = config.width 51 | self.modulator_insertion_loss = config.insertion_loss 52 | else: 53 | self.modulator_energy_per_bit = 400 54 | self.modulator_power_static = 0 55 | self.modulator_length = 300 56 | self.modulator_width = 50 57 | self.modulator_insertion_loss = 0.8 58 | 59 | # basic devices 60 | def _obtain_y_branch_param(self, config=None): 61 | if config is not None: 62 | self.y_branch_length = config.length 63 | self.y_branch_width = config.width 64 | self.y_branch_insertion_loss = config.insertion_loss 65 | else: 66 | self.y_branch_length = 75 67 | self.y_branch_width = 3.9 68 | self.y_branch_insertion_loss = 0.1 69 | 70 | def _obtain_photodetector_param(self, config=None): 71 | if config is not None: 72 | self.photo_detector_power = config.power 73 | self.photo_detector_length = config.length 74 | self.photo_detector_width = config.width 75 | self.photo_detector_sensitivity = config.sensitivity 76 | else: 77 | self.photo_detector_power = 2.8 78 | self.photo_detector_length = 40 79 | self.photo_detector_width = 40 80 | self.photo_detector_sensitivity = -25 81 | 82 | def _obtain_direction_coupler_param(self, config=None): 83 | if config is not None: 84 | self.direction_coupler_length = config.length 85 | self.direction_coupler_width = config.width 86 | self.direction_coupler_insertion_loss = config.insertion_loss 87 | else: 88 |
self.direction_coupler_length = 75 89 | self.direction_coupler_width = 10 90 | self.direction_coupler_insertion_loss = 0.3 91 | 92 | def _obtain_phase_shifter_param(self, config=None): 93 | if config is not None: 94 | self.phase_shifter_power_dynamic = config.dynamic_power 95 | self.phase_shifter_power_static = config.static_power 96 | self.phase_shifter_length = config.length 97 | self.phase_shifter_width = config.width 98 | self.phase_shifter_insertion_loss = config.insertion_loss 99 | # self.phase_shifter_programming_time = config.programming_time 100 | else: 101 | self.phase_shifter_power_dynamic = 0 102 | self.phase_shifter_power_static = 0 103 | self.phase_shifter_length = 200 104 | self.phase_shifter_width = 34 105 | self.phase_shifter_insertion_loss = 0.2 106 | # self.phase_shifter_programming_time = 10 # ns based on NEOM 107 | 108 | 109 | def _obtain_mrr_router_param(self, config=None): 110 | if config is not None: 111 | self.mrr_router_power = config.static_power 112 | self.mrr_router_length = config.length 113 | self.mrr_router_width = config.width 114 | self.mrr_router_insertion_loss = config.insertion_loss 115 | else: 116 | self.mrr_router_power = 2.4 117 | self.mrr_router_length = 20 118 | self.mrr_router_width = 20 119 | self.mrr_router_insertion_loss = 0.25 120 | 121 | def _obtain_TIA_param(self, config=None): 122 | if config is not None: 123 | self.TIA_power = config.power 124 | self.TIA_area = config.area 125 | else: 126 | raise NotImplementedError 127 | self.TIA_power = 3 128 | self.TIA_area = 5200 129 | 130 | def _obtain_ADC_param(self, config=None): 131 | if config is not None: 132 | ADC_choice = config.choice 133 | self.core_ADC_sharing_factor = config.sharing_factor 134 | self.ADC = ADC(ADC_choice) 135 | else: 136 | raise NotImplementedError 137 | 138 | def _obtain_DAC_param(self, config=None): 139 | if config is not None: 140 | DAC_choice = config.choice 141 | self.DAC = DAC(DAC_choice) 142 | else: 143 | raise NotImplementedError 144 | 145 | 146 | ## calculate area, insertion loss and energy cost 147 | def cal_insertion_loss(self): 148 | raise NotImplementedError 149 | 150 | def cal_TX_energy(self): 151 | raise NotImplementedError 152 | 153 | def cal_D2A_energy(self): 154 | raise NotImplementedError 155 | 156 | def cal_RX_energy(self): 157 | raise NotImplementedError 158 | 159 | def cal_A2D_energy(self): 160 | raise NotImplementedError 161 | 162 | def cal_comp_energy(self): 163 | raise NotImplementedError 164 | 165 | def cal_laser_energy(self): 166 | raise NotImplementedError 167 | 168 | def cal_core_area(self): 169 | raise NotImplementedError 170 | 171 | def cal_core_power(self): 172 | raise NotImplementedError 173 | -------------------------------------------------------------------------------- /hardware_simulator/utils/model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu(hqzhu@utexas.edu) 3 | # @Date: 1969-12-31 18:00:00 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-11 02:41:52 6 | from utils.config import configs 7 | from utils.cal_flops_for_transformer import get_infer_ops 8 | 9 | model_zoo ={ 10 | 'deit-t': {'patch': 16, 'depth': 12, 'embed_dim': 192, 'num_heads': 3, 'mlp_ratio': 4, 'tokens': 197}, 11 | 'deit-s': {'patch': 16, 'depth': 12, 'embed_dim': 384, 'num_heads': 6, 'mlp_ratio': 4, 'tokens': 197}, 12 | 'deit-b': {'patch': 16, 'depth': 12, 'embed_dim': 768, 'num_heads': 12, 'mlp_ratio': 4, 'tokens': 197}, 13 | 
'bert-b': {'depth': 12, 'embed_dim': 768, 'num_heads': 12, 'mlp_ratio': 4, 'tokens': 384}, 14 | 'bert-l': {'depth': 24, 'embed_dim': 1024, 'num_heads': 16, 'mlp_ratio': 4, 'tokens': 320}, 15 | } 16 | 17 | 18 | class modelParams(): 19 | # generate op list based on model param 20 | def __init__(self) -> None: 21 | super().__init__() 22 | 23 | def obtain_other_costs(self, model_name='deit-t', tokens=None): 24 | """Function to return estimated energy and latency for non-GEMM ops""" 25 | energy, latency = 0, 0 26 | 27 | tokens = model_zoo[model_name]["tokens"] if tokens is None else tokens 28 | softmax_ops, layer_norm_ops, residual_ops, activation_ops = get_infer_ops( 29 | h_d=model_zoo[model_name]["embed_dim"], 30 | l_s=model_zoo[model_name]["depth"], 31 | seq= tokens, 32 | heads=model_zoo[model_name]["num_heads"], 33 | head_size=model_zoo[model_name]["embed_dim"] //model_zoo[model_name]["num_heads"] 34 | ) 35 | bits = 4 # default is 4 bits 36 | 37 | # softmax estimation use 38 | # "high-speed and low-complexity architecture for softmax function in deep learning,” in 2018 IEEE asia pacific conference on circuits and systems (APCCAS 39 | softmax_energy_byte = 51.6 / 44.8 * 1e-9 # mJ/Byte 40 | # other uses mac * ops 41 | LAYER_NORM_FLOPS = 5 42 | # GELU: 0.5 * x * (1 + tanh(sqrt(2 / np.pi) * (x + 0.044715 * pow(x, 3)))) 43 | ACTIVATION_FLOPS = 8 44 | 45 | comp_energy = (activation_ops*ACTIVATION_FLOPS + layer_norm_ops * LAYER_NORM_FLOPS + residual_ops) * 100 * 1e-12 + softmax_energy_byte * softmax_ops * bits /8 46 | datamovement_energy = (activation_ops + residual_ops + layer_norm_ops + softmax_ops) * 1.655e-9 * bits / 16 * 2 47 | energy = comp_energy + datamovement_energy 48 | 49 | # latency: 50 | # estimated as memory access latency since all activations are stored on-chip 51 | bandwidth_sram = 1 / 0.604347 * 64 * 64* 1024 * 1024 * 1024 * 8 52 | clock_frequency = 500 * 1e6 53 | latency = (softmax_ops + layer_norm_ops + residual_ops + activation_ops ) * bits / bandwidth_sram 54 | 55 | return energy, latency 56 | 57 | def obtain_ops_list(self, model_name='deit-t', tokens=None): 58 | """Function to return the GEMM workloads dict""" 59 | ops_list = [] 60 | if 'deit' in model_name: 61 | model_params = model_zoo[model_name] 62 | patch = model_params['patch'] 63 | depth = model_params['depth'] 64 | embed_dim = model_params['embed_dim'] 65 | num_heads = model_params['num_heads'] 66 | mlp_ratio = model_params['mlp_ratio'] 67 | num_classes = 1000 68 | tokens = tokens if tokens is not None else model_params['tokens'] 69 | idx = 0 70 | # deit family 71 | # first a 3 by 3 conv 72 | ops_list.append( 73 | {"idx": idx, "name": 'embed', "type": "fc", "in_features": 3*patch*patch, "out_features": embed_dim, "bs": 196} 74 | ) 75 | idx += 1 76 | # atten block 77 | ops_list.append( 78 | {"idx": idx, "name": 'qkv', "type": "fc", "in_features": embed_dim, "out_features": embed_dim*3, "bs": tokens} 79 | ) 80 | idx += 1 81 | ops_list.append( 82 | {"idx": idx, "name": 'attn', "type": "attn", "num_heads": num_heads, "embed_dim": embed_dim, "num_tokens": tokens} 83 | ) 84 | idx += 1 85 | ops_list.append( 86 | {"idx": idx, "name": 'proj', "type": "fc", "in_features": embed_dim, "out_features": embed_dim, "bs": tokens} 87 | ) 88 | idx += 1 89 | ops_list.append( 90 | {"idx": idx, "name": 'FFN1', "type": "fc", "in_features": embed_dim, "out_features": embed_dim*mlp_ratio, "bs": tokens} 91 | ) 92 | idx += 1 93 | ops_list.append( 94 | {"idx": idx, "name": 'FFN2', "type": "fc", "in_features": embed_dim*mlp_ratio, 
"out_features": embed_dim, "bs": tokens} 95 | ) 96 | idx += 1 97 | ops_list.append( 98 | {"idx": idx, "name": 'head', "type": "fc", "in_features": embed_dim, "out_features": num_classes, "bs": 1} 99 | ) 100 | elif 'bert' in model_name: 101 | model_params = model_zoo[model_name] 102 | depth = model_params['depth'] 103 | embed_dim = model_params['embed_dim'] 104 | num_heads = model_params['num_heads'] 105 | mlp_ratio = model_params['mlp_ratio'] 106 | num_classes = 2 107 | tokens = tokens if tokens is not None else model_params['tokens'] 108 | idx = 0 109 | ops_list.append( 110 | {"idx": idx, "name": 'qkv', "type": "fc", "in_features": embed_dim, "out_features": embed_dim*3, "bs": tokens} 111 | ) 112 | idx += 1 113 | ops_list.append( 114 | {"idx": idx, "name": 'attn', "type": "attn", "num_heads": num_heads, "embed_dim": embed_dim, "num_tokens": tokens} 115 | ) 116 | idx += 1 117 | ops_list.append( 118 | {"idx": idx, "name": 'proj', "type": "fc", "in_features": embed_dim, "out_features": embed_dim, "bs": tokens} 119 | ) 120 | idx += 1 121 | ops_list.append( 122 | {"idx": idx, "name": 'FFN1', "type": "fc", "in_features": embed_dim, "out_features": embed_dim*mlp_ratio, "bs": tokens} 123 | ) 124 | idx += 1 125 | ops_list.append( 126 | {"idx": idx, "name": 'FFN2', "type": "fc", "in_features": embed_dim*mlp_ratio, "out_features": embed_dim, "bs": tokens} 127 | ) 128 | idx += 1 129 | ops_list.append( 130 | {"idx": idx, "name": 'head', "type": "fc", "in_features": embed_dim, "out_features": num_classes, "bs": 1} 131 | ) 132 | 133 | return ops_list 134 | 135 | if __name__ == "__main__": 136 | test = modelParams() 137 | ops_list = test.obtain_ops_list('bert-l', tokens=384) 138 | print(ops_list) 139 | 140 | test.obtain_other_costs('bert-l', tokens=384) -------------------------------------------------------------------------------- /software_model/resmlp_models.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved. 3 | import torch 4 | import torch.nn as nn 5 | from functools import partial 6 | 7 | from timm.models.vision_transformer import Mlp, PatchEmbed , _cfg 8 | from timm.models.registry import register_model 9 | from timm.models.layers import trunc_normal_, DropPath 10 | 11 | 12 | __all__ = [ 13 | 'resmlp_12', 'resmlp_24', 'resmlp_36', 'resmlpB_24' 14 | ] 15 | 16 | class Affine(nn.Module): 17 | def __init__(self, dim): 18 | super().__init__() 19 | self.alpha = nn.Parameter(torch.ones(dim)) 20 | self.beta = nn.Parameter(torch.zeros(dim)) 21 | 22 | def forward(self, x): 23 | return self.alpha * x + self.beta 24 | 25 | class layers_scale_mlp_blocks(nn.Module): 26 | 27 | def __init__(self, dim, drop=0., drop_path=0., act_layer=nn.GELU,init_values=1e-4,num_patches = 196): 28 | super().__init__() 29 | self.norm1 = Affine(dim) 30 | self.attn = nn.Linear(num_patches, num_patches) 31 | self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() 32 | self.norm2 = Affine(dim) 33 | self.mlp = Mlp(in_features=dim, hidden_features=int(4.0 * dim), act_layer=act_layer, drop=drop) 34 | self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True) 35 | self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True) 36 | 37 | def forward(self, x): 38 | x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x).transpose(1,2)).transpose(1,2)) 39 | x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) 40 | return x 41 | 42 | 43 | class resmlp_models(nn.Module): 44 | 45 | def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,drop_rate=0., 46 | Patch_layer=PatchEmbed,act_layer=nn.GELU, 47 | drop_path_rate=0.0,init_scale=1e-4): 48 | super().__init__() 49 | 50 | 51 | 52 | self.num_classes = num_classes 53 | self.num_features = self.embed_dim = embed_dim 54 | 55 | self.patch_embed = Patch_layer( 56 | img_size=img_size, patch_size=patch_size, in_chans=int(in_chans), embed_dim=embed_dim) 57 | num_patches = self.patch_embed.num_patches 58 | dpr = [drop_path_rate for i in range(depth)] 59 | 60 | self.blocks = nn.ModuleList([ 61 | layers_scale_mlp_blocks( 62 | dim=embed_dim,drop=drop_rate,drop_path=dpr[i], 63 | act_layer=act_layer,init_values=init_scale, 64 | num_patches=num_patches) 65 | for i in range(depth)]) 66 | 67 | 68 | self.norm = Affine(embed_dim) 69 | 70 | 71 | 72 | self.feature_info = [dict(num_chs=embed_dim, reduction=0, module='head')] 73 | self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() 74 | self.apply(self._init_weights) 75 | 76 | def _init_weights(self, m): 77 | if isinstance(m, nn.Linear): 78 | trunc_normal_(m.weight, std=0.02) 79 | if m.bias is not None: 80 | nn.init.constant_(m.bias, 0) 81 | elif isinstance(m, nn.LayerNorm): 82 | nn.init.constant_(m.bias, 0) 83 | nn.init.constant_(m.weight, 1.0) 84 | 85 | 86 | 87 | def get_classifier(self): 88 | return self.head 89 | 90 | def reset_classifier(self, num_classes, global_pool=''): 91 | self.num_classes = num_classes 92 | self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() 93 | 94 | def forward_features(self, x): 95 | B = x.shape[0] 96 | 97 | x = self.patch_embed(x) 98 | 99 | for i , blk in enumerate(self.blocks): 100 | x = blk(x) 101 | 102 | x = self.norm(x) 103 | x = x.mean(dim=1).reshape(B,1,-1) 104 | 105 | return x[:, 0] 106 | 107 | def forward(self, x): 108 | x = self.forward_features(x) 109 | x = self.head(x) 110 | return x 111 | 112 | @register_model 113 | def resmlp_12(pretrained=False,dist=False, **kwargs): 114 | model = resmlp_models( 115 | patch_size=16, embed_dim=384, depth=12, 116 | Patch_layer=PatchEmbed, 117 | init_scale=0.1,**kwargs) 118 | 119 | model.default_cfg = _cfg() 120 | if pretrained: 121 | if dist: 122 | url_path = "https://dl.fbaipublicfiles.com/deit/resmlp_12_dist.pth" 123 | else: 124 | url_path = "https://dl.fbaipublicfiles.com/deit/resmlp_12_no_dist.pth" 125 | checkpoint = torch.hub.load_state_dict_from_url( 126 | url=url_path, 127 | map_location="cpu", check_hash=True 128 | ) 129 | 130 | model.load_state_dict(checkpoint) 131 | return model 132 | 133 | @register_model 134 | def resmlp_24(pretrained=False,dist=False,dino=False, **kwargs): 135 | model = resmlp_models( 136 | patch_size=16, embed_dim=384, depth=24, 137 | Patch_layer=PatchEmbed, 138 | init_scale=1e-5,**kwargs) 139 | model.default_cfg = _cfg() 140 | if pretrained: 141 | if dist: 142 | url_path = 
"https://dl.fbaipublicfiles.com/deit/resmlp_24_dist.pth" 143 | elif dino: 144 | url_path = "https://dl.fbaipublicfiles.com/deit/resmlp_24_dino.pth" 145 | else: 146 | url_path = "https://dl.fbaipublicfiles.com/deit/resmlp_24_no_dist.pth" 147 | checkpoint = torch.hub.load_state_dict_from_url( 148 | url=url_path, 149 | map_location="cpu", check_hash=True 150 | ) 151 | 152 | model.load_state_dict(checkpoint) 153 | return model 154 | 155 | @register_model 156 | def resmlp_36(pretrained=False,dist=False, **kwargs): 157 | model = resmlp_models( 158 | patch_size=16, embed_dim=384, depth=36, 159 | Patch_layer=PatchEmbed, 160 | init_scale=1e-6,**kwargs) 161 | model.default_cfg = _cfg() 162 | if pretrained: 163 | if dist: 164 | url_path = "https://dl.fbaipublicfiles.com/deit/resmlp_36_dist.pth" 165 | else: 166 | url_path = "https://dl.fbaipublicfiles.com/deit/resmlp_36_no_dist.pth" 167 | checkpoint = torch.hub.load_state_dict_from_url( 168 | url=url_path, 169 | map_location="cpu", check_hash=True 170 | ) 171 | 172 | model.load_state_dict(checkpoint) 173 | return model 174 | 175 | @register_model 176 | def resmlpB_24(pretrained=False,dist=False, in_22k = False, **kwargs): 177 | model = resmlp_models( 178 | patch_size=8, embed_dim=768, depth=24, 179 | Patch_layer=PatchEmbed, 180 | init_scale=1e-6,**kwargs) 181 | model.default_cfg = _cfg() 182 | if pretrained: 183 | if dist: 184 | url_path = "https://dl.fbaipublicfiles.com/deit/resmlpB_24_dist.pth" 185 | elif in_22k: 186 | url_path = "https://dl.fbaipublicfiles.com/deit/resmlpB_24_22k.pth" 187 | else: 188 | url_path = "https://dl.fbaipublicfiles.com/deit/resmlpB_24_no_dist.pth" 189 | 190 | checkpoint = torch.hub.load_state_dict_from_url( 191 | url=url_path, 192 | map_location="cpu", check_hash=True 193 | ) 194 | 195 | model.load_state_dict(checkpoint) 196 | 197 | return model 198 | -------------------------------------------------------------------------------- /software_model/models.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved. 
3 | import torch 4 | import torch.nn as nn 5 | from functools import partial 6 | 7 | from timm.models.vision_transformer import VisionTransformer, _cfg 8 | from timm.models.registry import register_model 9 | from timm.models.layers import trunc_normal_ 10 | 11 | 12 | __all__ = [ 13 | 'deit_tiny_patch16_224', 'deit_small_patch16_224', 'deit_base_patch16_224', 14 | 'deit_tiny_distilled_patch16_224', 'deit_small_distilled_patch16_224', 15 | 'deit_base_distilled_patch16_224', 'deit_base_patch16_384', 16 | 'deit_base_distilled_patch16_384', 17 | ] 18 | 19 | 20 | class DistilledVisionTransformer(VisionTransformer): 21 | def __init__(self, *args, **kwargs): 22 | super().__init__(*args, **kwargs) 23 | self.dist_token = nn.Parameter(torch.zeros(1, 1, self.embed_dim)) 24 | num_patches = self.patch_embed.num_patches 25 | self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 2, self.embed_dim)) 26 | self.head_dist = nn.Linear(self.embed_dim, self.num_classes) if self.num_classes > 0 else nn.Identity() 27 | 28 | trunc_normal_(self.dist_token, std=.02) 29 | trunc_normal_(self.pos_embed, std=.02) 30 | self.head_dist.apply(self._init_weights) 31 | 32 | def forward_features(self, x): 33 | # taken from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py 34 | # with slight modifications to add the dist_token 35 | B = x.shape[0] 36 | x = self.patch_embed(x) 37 | 38 | cls_tokens = self.cls_token.expand(B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks 39 | dist_token = self.dist_token.expand(B, -1, -1) 40 | x = torch.cat((cls_tokens, dist_token, x), dim=1) 41 | 42 | x = x + self.pos_embed 43 | x = self.pos_drop(x) 44 | 45 | for blk in self.blocks: 46 | x = blk(x) 47 | 48 | x = self.norm(x) 49 | return x[:, 0], x[:, 1] 50 | 51 | def forward(self, x): 52 | x, x_dist = self.forward_features(x) 53 | x = self.head(x) 54 | x_dist = self.head_dist(x_dist) 55 | if self.training: 56 | return x, x_dist 57 | else: 58 | # during inference, return the average of both classifier predictions 59 | return (x + x_dist) / 2 60 | 61 | 62 | @register_model 63 | def deit_tiny_patch16_224(pretrained=False, **kwargs): 64 | model = VisionTransformer( 65 | patch_size=16, embed_dim=192, depth=12, num_heads=3, mlp_ratio=4, qkv_bias=True, 66 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 67 | model.default_cfg = _cfg() 68 | if pretrained: 69 | checkpoint = torch.hub.load_state_dict_from_url( 70 | url="https://dl.fbaipublicfiles.com/deit/deit_tiny_patch16_224-a1311bcf.pth", 71 | map_location="cpu", check_hash=True 72 | ) 73 | model.load_state_dict(checkpoint["model"]) 74 | return model 75 | 76 | 77 | @register_model 78 | def deit_small_patch16_224(pretrained=False, **kwargs): 79 | model = VisionTransformer( 80 | patch_size=16, embed_dim=384, depth=12, num_heads=6, mlp_ratio=4, qkv_bias=True, 81 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 82 | model.default_cfg = _cfg() 83 | if pretrained: 84 | checkpoint = torch.hub.load_state_dict_from_url( 85 | url="https://dl.fbaipublicfiles.com/deit/deit_small_patch16_224-cd65a155.pth", 86 | map_location="cpu", check_hash=True 87 | ) 88 | model.load_state_dict(checkpoint["model"]) 89 | return model 90 | 91 | 92 | @register_model 93 | def deit_base_patch16_224(pretrained=False, **kwargs): 94 | model = VisionTransformer( 95 | patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, 96 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 97 | model.default_cfg = _cfg() 98 | if pretrained: 99 | 
checkpoint = torch.hub.load_state_dict_from_url( 100 | url="https://dl.fbaipublicfiles.com/deit/deit_base_patch16_224-b5f2ef4d.pth", 101 | map_location="cpu", check_hash=True 102 | ) 103 | model.load_state_dict(checkpoint["model"]) 104 | return model 105 | 106 | 107 | @register_model 108 | def deit_tiny_distilled_patch16_224(pretrained=False, **kwargs): 109 | model = DistilledVisionTransformer( 110 | patch_size=16, embed_dim=192, depth=12, num_heads=3, mlp_ratio=4, qkv_bias=True, 111 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 112 | model.default_cfg = _cfg() 113 | if pretrained: 114 | checkpoint = torch.hub.load_state_dict_from_url( 115 | url="https://dl.fbaipublicfiles.com/deit/deit_tiny_distilled_patch16_224-b40b3cf7.pth", 116 | map_location="cpu", check_hash=True 117 | ) 118 | model.load_state_dict(checkpoint["model"]) 119 | return model 120 | 121 | 122 | @register_model 123 | def deit_small_distilled_patch16_224(pretrained=False, **kwargs): 124 | model = DistilledVisionTransformer( 125 | patch_size=16, embed_dim=384, depth=12, num_heads=6, mlp_ratio=4, qkv_bias=True, 126 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 127 | model.default_cfg = _cfg() 128 | if pretrained: 129 | checkpoint = torch.hub.load_state_dict_from_url( 130 | url="https://dl.fbaipublicfiles.com/deit/deit_small_distilled_patch16_224-649709d9.pth", 131 | map_location="cpu", check_hash=True 132 | ) 133 | model.load_state_dict(checkpoint["model"]) 134 | return model 135 | 136 | 137 | @register_model 138 | def deit_base_distilled_patch16_224(pretrained=False, **kwargs): 139 | model = DistilledVisionTransformer( 140 | patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, 141 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 142 | model.default_cfg = _cfg() 143 | if pretrained: 144 | checkpoint = torch.hub.load_state_dict_from_url( 145 | url="https://dl.fbaipublicfiles.com/deit/deit_base_distilled_patch16_224-df68dfff.pth", 146 | map_location="cpu", check_hash=True 147 | ) 148 | model.load_state_dict(checkpoint["model"]) 149 | return model 150 | 151 | 152 | # tested on this model 153 | @register_model 154 | def deit_base_patch16_384(pretrained=False, **kwargs): 155 | model = VisionTransformer( 156 | img_size=384, patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, 157 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 158 | model.default_cfg = _cfg() 159 | if pretrained: 160 | checkpoint = torch.hub.load_state_dict_from_url( 161 | url="https://dl.fbaipublicfiles.com/deit/deit_base_patch16_384-8de9b5d1.pth", 162 | map_location="cpu", check_hash=True 163 | ) 164 | model.load_state_dict(checkpoint["model"]) 165 | return model 166 | 167 | 168 | @register_model 169 | def deit_base_distilled_patch16_384(pretrained=False, **kwargs): 170 | model = DistilledVisionTransformer( 171 | img_size=384, patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, 172 | norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 173 | model.default_cfg = _cfg() 174 | if pretrained: 175 | checkpoint = torch.hub.load_state_dict_from_url( 176 | url="https://dl.fbaipublicfiles.com/deit/deit_base_distilled_patch16_384-d0272ac0.pth", 177 | map_location="cpu", check_hash=True 178 | ) 179 | model.load_state_dict(checkpoint["model"]) 180 | return model 181 | -------------------------------------------------------------------------------- /software_model/ops/_quant_base.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu 3 | # @Date: 2023-01-03 21:20:31 4 | # @Last Modified by: Hanqing Zhu 5 | # @Last Modified time: 2023-03-30 03:50:36 6 | """ 7 | Quantized modules: the base class 8 | """ 9 | import torch 10 | import torch.nn as nn 11 | from torch.nn.parameter import Parameter 12 | 13 | import math 14 | from enum import Enum 15 | 16 | __all__ = ['Qmodes', '_Conv2dQ', '_LinearQ', '_ActQ', 17 | 'truncation', 'get_sparsity_mask', 'FunStopGradient', 'round_pass', 'grad_scale'] 18 | 19 | 20 | class Qmodes(Enum): 21 | layer_wise = 1 22 | kernel_wise = 2 23 | 24 | 25 | def grad_scale(x, scale): 26 | y = x 27 | y_grad = x * scale 28 | return y.detach() - y_grad.detach() + y_grad 29 | 30 | 31 | def get_sparsity_mask(param, sparsity): 32 | bottomk, _ = torch.topk(param.abs().view(-1), int(sparsity * param.numel()), largest=False, sorted=True) 33 | threshold = bottomk.data[-1] # This is the largest element from the group of elements that we prune away 34 | return torch.gt(torch.abs(param), threshold).type(param.type()) 35 | 36 | 37 | def round_pass(x): 38 | y = x.round() 39 | y_grad = x 40 | return y.detach() - y_grad.detach() + y_grad 41 | 42 | 43 | class FunStopGradient(torch.autograd.Function): 44 | 45 | @staticmethod 46 | def forward(ctx, weight, stopGradientMask): 47 | ctx.save_for_backward(stopGradientMask) 48 | return weight 49 | 50 | @staticmethod 51 | def backward(ctx, grad_outputs): 52 | stopGradientMask, = ctx.saved_tensors 53 | grad_inputs = grad_outputs * stopGradientMask 54 | return grad_inputs, None 55 | 56 | 57 | def log_shift(value_fp): 58 | value_shift = 2 ** (torch.log2(value_fp).ceil()) 59 | return value_shift 60 | 61 | 62 | def clamp(input, min, max, inplace=False): 63 | if inplace: 64 | input.clamp_(min, max) 65 | return input 66 | return torch.clamp(input, min, max) 67 | 68 | 69 | def get_quantized_range(num_bits, signed=True): 70 | if signed: 71 | n = 2 ** (num_bits - 1) 72 | return -n, n - 1 73 | return 0, 2 ** num_bits - 1 74 | 75 | 76 | def linear_quantize(input, scale_factor, inplace=False): 77 | if inplace: 78 | input.mul_(scale_factor).round_() 79 | return input 80 | return torch.round(scale_factor * input) 81 | 82 | 83 | def linear_quantize_clamp(input, scale_factor, clamp_min, clamp_max, inplace=False): 84 | output = linear_quantize(input, scale_factor, inplace) 85 | return clamp(output, clamp_min, clamp_max, inplace) 86 | 87 | 88 | def linear_dequantize(input, scale_factor, inplace=False): 89 | if inplace: 90 | input.div_(scale_factor) 91 | return input 92 | return input / scale_factor 93 | 94 | 95 | def truncation(fp_data, nbits=8): 96 | il = torch.log2(torch.max(fp_data.max(), fp_data.min().abs())) + 1 97 | il = math.ceil(il - 1e-5) 98 | qcode = nbits - il 99 | scale_factor = 2 ** qcode 100 | clamp_min, clamp_max = get_quantized_range(nbits, signed=True) 101 | q_data = linear_quantize_clamp(fp_data, scale_factor, clamp_min, clamp_max) 102 | q_data = linear_dequantize(q_data, scale_factor) 103 | return q_data, qcode 104 | 105 | 106 | def get_default_kwargs_q(kwargs_q, layer_type): 107 | default = { 108 | 'nbits': 4 109 | } 110 | if isinstance(layer_type, _Conv2dQ): 111 | default.update({ 112 | 'mode': Qmodes.layer_wise}) 113 | elif isinstance(layer_type, _LinearQ): 114 | pass 115 | elif isinstance(layer_type, _ActQ): 116 | pass 117 | # default.update({ 118 | # 'signed': 'Auto'}) 119 | else: 120 | assert NotImplementedError 121 | return 122 | for k, v 
in default.items(): 123 | if k not in kwargs_q: 124 | kwargs_q[k] = v 125 | return kwargs_q 126 | 127 | 128 | class _Conv2dQ(nn.Conv2d): 129 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, 130 | padding=0, dilation=1, groups=1, bias=True, **kwargs_q): 131 | super(_Conv2dQ, self).__init__(in_channels, out_channels, kernel_size, stride=stride, 132 | padding=padding, dilation=dilation, groups=groups, bias=bias) 133 | self.kwargs_q = get_default_kwargs_q(kwargs_q, layer_type=self) 134 | self.nbits = kwargs_q['nbits'] 135 | if self.nbits < 0: 136 | self.register_parameter('alpha', None) 137 | return 138 | self.q_mode = kwargs_q['mode'] 139 | if self.q_mode == Qmodes.kernel_wise: 140 | self.alpha = Parameter(torch.Tensor(out_channels)) 141 | else: # layer-wise quantization 142 | self.alpha = Parameter(torch.Tensor(1)) 143 | self.register_buffer('init_state', torch.zeros(1)) 144 | 145 | def add_param(self, param_k, param_v): 146 | self.kwargs_q[param_k] = param_v 147 | 148 | def set_bit(self, nbits): 149 | self.kwargs_q['nbits'] = nbits 150 | 151 | def extra_repr(self): 152 | s_prefix = super(_Conv2dQ, self).extra_repr() 153 | if self.alpha is None: 154 | return '{}, fake'.format(s_prefix) 155 | return '{}, {}'.format(s_prefix, self.kwargs_q) 156 | 157 | 158 | class _LinearQ(nn.Linear): 159 | def __init__(self, in_features, out_features, bias=True, **kwargs_q): 160 | super(_LinearQ, self).__init__(in_features=in_features, out_features=out_features, bias=bias) 161 | self.kwargs_q = get_default_kwargs_q(kwargs_q, layer_type=self) 162 | self.nbits = kwargs_q['nbits'] 163 | if self.nbits < 0: 164 | self.register_parameter('alpha', None) 165 | return 166 | self.q_mode = kwargs_q['mode'] 167 | self.alpha = Parameter(torch.Tensor(1)) 168 | if self.q_mode == Qmodes.kernel_wise: 169 | self.alpha = Parameter(torch.Tensor(out_features)) 170 | self.register_buffer('init_state', torch.zeros(1)) 171 | 172 | def add_param(self, param_k, param_v): 173 | self.kwargs_q[param_k] = param_v 174 | 175 | def extra_repr(self): 176 | s_prefix = super(_LinearQ, self).extra_repr() 177 | if self.alpha is None: 178 | return '{}, fake'.format(s_prefix) 179 | return '{}, {}'.format(s_prefix, self.kwargs_q) 180 | 181 | 182 | class _ActQ(nn.Module): 183 | def __init__(self, in_features, **kwargs_q): 184 | super(_ActQ, self).__init__() 185 | self.kwargs_q = get_default_kwargs_q(kwargs_q, layer_type=self) 186 | self.nbits = kwargs_q['nbits'] 187 | if self.nbits < 0: 188 | self.register_parameter('alpha', None) 189 | # self.register_parameter('zero_point', None) 190 | return 191 | # self.signed = kwargs_q['signed'] 192 | self.q_mode = kwargs_q['mode'] 193 | # print(kwargs_q) 194 | self.offset = kwargs_q['offset'] 195 | self.zero_point = None 196 | if self.q_mode == Qmodes.kernel_wise: 197 | self.alpha = Parameter(torch.Tensor(in_features)) 198 | if self.offset: 199 | self.zero_point = Parameter(torch.Tensor(in_features)) 200 | torch.nn.init.zeros_(self.zero_point) 201 | else: 202 | self.alpha = Parameter(torch.Tensor(1)) 203 | if self.offset: 204 | self.zero_point = Parameter(torch.Tensor([0])) 205 | # self.zero_point = Parameter(torch.Tensor([0])) 206 | self.register_buffer('init_state', torch.zeros(1)) 207 | self.register_buffer('signed', torch.zeros(1)) 208 | 209 | def add_param(self, param_k, param_v): 210 | self.kwargs_q[param_k] = param_v 211 | 212 | def set_bit(self, nbits): 213 | self.kwargs_q['nbits'] = nbits 214 | 215 | def extra_repr(self): 216 | # s_prefix = super(_ActQ, self).extra_repr() 217 | 
if self.alpha is None: 218 | return 'fake' 219 | return '{}'.format(self.kwargs_q) -------------------------------------------------------------------------------- /software_model/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved. 3 | """ 4 | Misc functions, including distributed helpers. 5 | 6 | Mostly copy-paste from torchvision references. 7 | """ 8 | import io 9 | import os 10 | import time 11 | from collections import defaultdict, deque 12 | import datetime 13 | 14 | import torch 15 | import torch.distributed as dist 16 | 17 | 18 | class SmoothedValue(object): 19 | """Track a series of values and provide access to smoothed values over a 20 | window or the global series average. 21 | """ 22 | 23 | def __init__(self, window_size=20, fmt=None): 24 | if fmt is None: 25 | fmt = "{median:.4f} ({global_avg:.4f})" 26 | self.deque = deque(maxlen=window_size) 27 | self.total = 0.0 28 | self.count = 0 29 | self.fmt = fmt 30 | 31 | def update(self, value, n=1): 32 | self.deque.append(value) 33 | self.count += n 34 | self.total += value * n 35 | 36 | def synchronize_between_processes(self): 37 | """ 38 | Warning: does not synchronize the deque! 39 | """ 40 | if not is_dist_avail_and_initialized(): 41 | return 42 | t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda') 43 | dist.barrier() 44 | dist.all_reduce(t) 45 | t = t.tolist() 46 | self.count = int(t[0]) 47 | self.total = t[1] 48 | 49 | @property 50 | def median(self): 51 | d = torch.tensor(list(self.deque)) 52 | return d.median().item() 53 | 54 | @property 55 | def avg(self): 56 | d = torch.tensor(list(self.deque), dtype=torch.float32) 57 | return d.mean().item() 58 | 59 | @property 60 | def global_avg(self): 61 | return self.total / self.count 62 | 63 | @property 64 | def max(self): 65 | return max(self.deque) 66 | 67 | @property 68 | def value(self): 69 | return self.deque[-1] 70 | 71 | def __str__(self): 72 | return self.fmt.format( 73 | median=self.median, 74 | avg=self.avg, 75 | global_avg=self.global_avg, 76 | max=self.max, 77 | value=self.value) 78 | 79 | 80 | class MetricLogger(object): 81 | def __init__(self, delimiter="\t"): 82 | self.meters = defaultdict(SmoothedValue) 83 | self.delimiter = delimiter 84 | 85 | def update(self, **kwargs): 86 | for k, v in kwargs.items(): 87 | if isinstance(v, torch.Tensor): 88 | v = v.item() 89 | assert isinstance(v, (float, int)) 90 | self.meters[k].update(v) 91 | 92 | def __getattr__(self, attr): 93 | if attr in self.meters: 94 | return self.meters[attr] 95 | if attr in self.__dict__: 96 | return self.__dict__[attr] 97 | raise AttributeError("'{}' object has no attribute '{}'".format( 98 | type(self).__name__, attr)) 99 | 100 | def __str__(self): 101 | loss_str = [] 102 | for name, meter in self.meters.items(): 103 | loss_str.append( 104 | "{}: {}".format(name, str(meter)) 105 | ) 106 | return self.delimiter.join(loss_str) 107 | 108 | def synchronize_between_processes(self): 109 | for meter in self.meters.values(): 110 | meter.synchronize_between_processes() 111 | 112 | def add_meter(self, name, meter): 113 | self.meters[name] = meter 114 | 115 | def log_every(self, iterable, print_freq, header=None): 116 | i = 0 117 | if not header: 118 | header = '' 119 | start_time = time.time() 120 | end = time.time() 121 | iter_time = SmoothedValue(fmt='{avg:.4f}') 122 | data_time = SmoothedValue(fmt='{avg:.4f}') 123 | space_fmt = ':' + str(len(str(len(iterable)))) + 'd' 
124 | log_msg = [ 125 | header, 126 | '[{0' + space_fmt + '}/{1}]', 127 | 'eta: {eta}', 128 | '{meters}', 129 | 'time: {time}', 130 | 'data: {data}' 131 | ] 132 | if torch.cuda.is_available(): 133 | log_msg.append('max mem: {memory:.0f}') 134 | log_msg = self.delimiter.join(log_msg) 135 | MB = 1024.0 * 1024.0 136 | for obj in iterable: 137 | data_time.update(time.time() - end) 138 | yield obj 139 | iter_time.update(time.time() - end) 140 | if i % print_freq == 0 or i == len(iterable) - 1: 141 | eta_seconds = iter_time.global_avg * (len(iterable) - i) 142 | eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) 143 | if torch.cuda.is_available(): 144 | print(log_msg.format( 145 | i, len(iterable), eta=eta_string, 146 | meters=str(self), 147 | time=str(iter_time), data=str(data_time), 148 | memory=torch.cuda.max_memory_allocated() / MB)) 149 | else: 150 | print(log_msg.format( 151 | i, len(iterable), eta=eta_string, 152 | meters=str(self), 153 | time=str(iter_time), data=str(data_time))) 154 | i += 1 155 | end = time.time() 156 | total_time = time.time() - start_time 157 | total_time_str = str(datetime.timedelta(seconds=int(total_time))) 158 | print('{} Total time: {} ({:.4f} s / it)'.format( 159 | header, total_time_str, total_time / len(iterable))) 160 | 161 | 162 | def _load_checkpoint_for_ema(model_ema, checkpoint): 163 | """ 164 | Workaround for ModelEma._load_checkpoint to accept an already-loaded object 165 | """ 166 | mem_file = io.BytesIO() 167 | torch.save({'state_dict_ema':checkpoint}, mem_file) 168 | mem_file.seek(0) 169 | model_ema._load_checkpoint(mem_file) 170 | 171 | 172 | def setup_for_distributed(is_master): 173 | """ 174 | This function disables printing when not in master process 175 | """ 176 | import builtins as __builtin__ 177 | builtin_print = __builtin__.print 178 | 179 | def print(*args, **kwargs): 180 | force = kwargs.pop('force', False) 181 | if is_master or force: 182 | builtin_print(*args, **kwargs) 183 | 184 | __builtin__.print = print 185 | 186 | 187 | def is_dist_avail_and_initialized(): 188 | if not dist.is_available(): 189 | return False 190 | if not dist.is_initialized(): 191 | return False 192 | return True 193 | 194 | 195 | def get_world_size(): 196 | if not is_dist_avail_and_initialized(): 197 | return 1 198 | return dist.get_world_size() 199 | 200 | 201 | def get_rank(): 202 | if not is_dist_avail_and_initialized(): 203 | return 0 204 | return dist.get_rank() 205 | 206 | 207 | def is_main_process(): 208 | return get_rank() == 0 209 | 210 | 211 | def save_on_master(*args, **kwargs): 212 | if is_main_process(): 213 | torch.save(*args, **kwargs) 214 | 215 | 216 | def init_distributed_mode(args): 217 | if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: 218 | args.rank = int(os.environ["RANK"]) 219 | args.world_size = int(os.environ['WORLD_SIZE']) 220 | args.gpu = int(os.environ['LOCAL_RANK']) 221 | elif 'SLURM_PROCID' in os.environ: 222 | args.rank = int(os.environ['SLURM_PROCID']) 223 | args.gpu = args.rank % torch.cuda.device_count() 224 | else: 225 | print('Not using distributed mode') 226 | args.distributed = False 227 | return 228 | 229 | args.distributed = True 230 | 231 | torch.cuda.set_device(args.gpu) 232 | args.dist_backend = 'nccl' 233 | print('| distributed init (rank {}): {}'.format( 234 | args.rank, args.dist_url), flush=True) 235 | torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, 236 | world_size=args.world_size, rank=args.rank) 237 | torch.distributed.barrier() 238 | 
setup_for_distributed(args.rank == 0) 239 | -------------------------------------------------------------------------------- /hardware_simulator/utils/cal_flops_for_transformer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Hanqing Zhu(hqzhu@utexas.edu) 3 | # @Date: 1969-12-31 18:00:00 4 | # @Last Modified by: Hanqing Zhu(hqzhu@utexas.edu) 5 | # @Last Modified time: 2023-11-09 01:18:58 6 | """Computes the flops needed for training/running transformer networks. 7 | https://github.com/google-research/electra/blob/master/flops_computation.py 8 | """ 9 | 10 | # We checked this code with TensorFlow"s FLOPs counting, although we had to 11 | # correct for this issue: https://github.com/tensorflow/tensorflow/issues/22071 12 | # Assumptions going into the FLOPs counting 13 | # - An "operation" is a mathematical operation, not a machine instruction. So 14 | # an "exp" takes one opp like and add, even though in practice an exp 15 | # might be slower. This is not too bad an assumption because 16 | # matrix-multiplies dominate the compute for most models, so minor details 17 | # about activation functions don"t matter too much. Similarly, we count 18 | # matrix-multiplies as 2*m*n flops instead of m*n, as one might if 19 | # if considering fused multiply-add ops. 20 | # - Backward pass takes the same number of FLOPs as forward pass. No exactly 21 | # right (e.g., for softmax cross entropy loss the backward pass is faster). 22 | # Importantly, it really is the same for matrix-multiplies, which is most of 23 | # the compute anyway. 24 | # - We assume "dense" embedding lookups (i.e., multiplication by a one-hot 25 | # vector). On some hardware accelerators, these dense operations are 26 | # actually faster than sparse lookups. 27 | # Please open a github issue if you spot a problem with this code! 28 | 29 | # I am not sure if the below constants are 100% right, but they are only applied 30 | # to O(hidden_size) activations, which is generally a lot less compute than the 31 | # matrix-multiplies, which are O(hidden_size^2), so they don't affect the total 32 | # number of FLOPs much. 
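# Worked example of the 2*m*n matrix-multiply convention described above:
# projecting s=128 tokens of hidden size h=768 through an h x h weight matrix
# counts as 2 * s * h * h = 2 * 128 * 768 * 768 ~= 1.51e8 FLOPs (~0.15 GFLOPs).
# The per-token block costs assembled in get_block_flops() below are likewise
# multiplied by the sequence length s.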
33 | 34 | # random number, >=, multiply activations by dropout mask, multiply activations 35 | # by correction (1 / (1 - dropout_rate)) 36 | DROPOUT_FLOPS = 4 37 | 38 | # compute mean activation (sum), computate variance of activation 39 | # (square and sum), bias (add), scale (multiply) 40 | LAYER_NORM_FLOPS = 5 41 | 42 | # GELU: 0.5 * x * (1 + tanh(sqrt(2 / np.pi) * (x + 0.044715 * pow(x, 3)))) 43 | ACTIVATION_FLOPS = 8 44 | 45 | # max/substract (for stability), exp, sum, divide 46 | SOFTMAX_FLOPS = 5 47 | 48 | __all__ = [ 49 | "get_infer_ops" 50 | ] 51 | 52 | class TransformerHparams(object): 53 | """Computes the train/inference FLOPs for transformers.""" 54 | 55 | def __init__(self, h, l, s=512, v=30522, e=None, i=None, heads=None, 56 | head_size=None, output_frac=0.15625, sparse_embed_lookup=False, 57 | decoder=False): 58 | self.h = h # hidden size 59 | self.l = l # number of layers 60 | self.s = s # sequence length 61 | self.v = v # vocab size 62 | self.e = h if e is None else e # embedding size 63 | self.i = h * 4 if i is None else i # intermediate size 64 | self.kqv = h if head_size is None else head_size * heads # attn proj sizes 65 | # attention heads 66 | self.heads = max(h // 64, 1) if heads is None else heads 67 | self.output_frac = output_frac # percent of tokens using an output softmax 68 | self.sparse_embed_lookup = sparse_embed_lookup # sparse embedding lookups 69 | self.decoder = decoder # decoder has extra attn to encoder states 70 | 71 | self.residual_flops = 0 72 | self.activation_flops = 0 73 | self.layer_norm_flops = 0 74 | self.softmax_flops = 0 75 | 76 | def get_block_flops(self): 77 | """Get the forward-pass FLOPs for a single transformer block.""" 78 | attn_mul = 2 if self.decoder else 1 79 | block_flops = dict( 80 | kqv=3 * 2 * self.h * self.kqv * attn_mul, 81 | kqv_bias=3 * self.kqv * attn_mul, 82 | attention_scores=2 * self.kqv * self.s * attn_mul, 83 | attn_softmax=SOFTMAX_FLOPS * self.s * self.heads * attn_mul, 84 | attention_dropout=DROPOUT_FLOPS * self.s * self.heads * attn_mul, 85 | attention_scale=self.s * self.heads * attn_mul, 86 | attention_weighted_avg_values=2 * self.h * self.s * attn_mul, 87 | attn_output=2 * self.h * self.h * attn_mul, 88 | attn_output_bias=self.h * attn_mul, 89 | attn_output_dropout=DROPOUT_FLOPS * self.h * attn_mul, 90 | attn_output_residual=self.h * attn_mul, 91 | attn_output_layer_norm=LAYER_NORM_FLOPS * attn_mul, 92 | intermediate=2 * self.h * self.i, 93 | intermediate_act=ACTIVATION_FLOPS * self.i, 94 | intermediate_bias=self.i, 95 | output=2 * self.h * self.i, 96 | output_bias=self.h, 97 | output_dropout=DROPOUT_FLOPS * self.h, 98 | output_residual=self.h, 99 | output_layer_norm=LAYER_NORM_FLOPS * self.h, 100 | ) 101 | 102 | self.softmax_flops += self.s * self.s * self.heads * attn_mul # tokens * tokens * head 103 | self.residual_flops += self.s * (self.h + self.h) # tokens * hidden size 104 | self.layer_norm_flops += self.s * (self.h + 1) # tokens * hidden_size 105 | self.activation_flops += self.s * self.i # GELU tokens * hidden_size * 4 106 | 107 | return sum(block_flops.values()) * self.s 108 | 109 | def get_embedding_flops(self, output=False): 110 | """Get the forward-pass FLOPs the transformer inputs or output softmax.""" 111 | embedding_flops = {} 112 | if output or (not self.sparse_embed_lookup): 113 | embedding_flops["main_multiply"] = 2 * self.e * self.v 114 | # input embedding post-processing 115 | if not output: 116 | embedding_flops.update(dict( 117 | tok_type_and_position=2 * self.e * (self.s + 2), 118 | 
add_tok_type_and_position=2 * self.e, 119 | emb_layer_norm=LAYER_NORM_FLOPS * self.e, 120 | emb_dropout=DROPOUT_FLOPS * self.e 121 | )) 122 | # projection layer if e != h 123 | if self.e != self.h or output: 124 | embedding_flops.update(dict( 125 | hidden_kernel=2 * self.h * self.e, 126 | hidden_bias=self.e if output else self.h 127 | )) 128 | # extra hidden layer and output softmax 129 | if output: 130 | embedding_flops.update(dict( 131 | hidden_activation=ACTIVATION_FLOPS * self.e, 132 | hidden_layernorm=LAYER_NORM_FLOPS * self.e, 133 | output_softmax=SOFTMAX_FLOPS * self.v, 134 | output_target_word=2 * self.v 135 | )) 136 | return self.output_frac * sum(embedding_flops.values()) * self.s 137 | return sum(embedding_flops.values()) * self.s 138 | 139 | def get_binary_classification_flops(self): 140 | classification_flops = dict( 141 | hidden=2 * self.h * self.h, 142 | hidden_bias=self.h, 143 | hidden_act=ACTIVATION_FLOPS * self.h, 144 | logits=2 * self.h 145 | ) 146 | return sum(classification_flops.values()) * self.s 147 | 148 | def get_train_flops(self, batch_size, train_steps, discriminator=False): 149 | """Get the FLOPs for pre-training the transformer.""" 150 | # 2* for forward/backward pass 151 | return 2 * batch_size * train_steps * ( 152 | (self.l * self.get_block_flops()) + 153 | self.get_embedding_flops(output=False) + 154 | (self.get_binary_classification_flops() if discriminator else 155 | self.get_embedding_flops(output=True)) 156 | ) 157 | 158 | def get_infer_flops(self): 159 | """Get the FLOPs for running inference with the transformer on a 160 | classification task.""" 161 | (self.l * self.get_block_flops()) + self.get_embedding_flops(output=False) + self.get_binary_classification_flops() 162 | 163 | 164 | def get_electra_train_flops( 165 | h_d, l_d, h_g, l_g, batch_size, train_steps, tied_embeddings, 166 | e=None, s=512, output_frac=0.15625): 167 | """Get the FLOPs needed for pre-training ELECTRA.""" 168 | if e is None: 169 | e = h_d 170 | disc = TransformerHparams( 171 | h_d, l_d, s=s, e=e, 172 | output_frac=output_frac).get_train_flops(batch_size, train_steps, True) 173 | gen = TransformerHparams( 174 | h_g, l_g, s=s, e=e if tied_embeddings else None, 175 | output_frac=output_frac).get_train_flops(batch_size, train_steps) 176 | return disc + gen 177 | 178 | def get_infer_ops( 179 | h_d, l_s, seq, heads, head_size=64 180 | ): 181 | """Get the ops needed for Transformer inference. 
Softmax, layernorm, residual add, activation""" 182 | estimator = TransformerHparams(h=h_d, l=l_s, s=seq, heads=heads, head_size=head_size) 183 | estimator.get_infer_flops() 184 | 185 | return estimator.softmax_flops, estimator.layer_norm_flops, estimator.residual_flops, estimator.activation_flops 186 | -------------------------------------------------------------------------------- /profile/benckmark_logs/deit-s_power.csv: -------------------------------------------------------------------------------- 1 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 2 | # Idx W C C % % % % % % MHz MHz 3 | 0 61 36 52 0 0 0 0 0 0 1593 210 4 | 0 61 36 52 0 0 0 0 0 0 1593 210 5 | 0 61 36 52 0 0 0 0 0 0 1593 210 6 | 0 61 36 52 0 0 0 0 0 0 1593 210 7 | 0 61 36 52 0 0 0 0 0 0 1593 210 8 | 0 61 36 52 0 0 0 0 0 0 1593 210 9 | 0 72 37 52 36 1 0 0 0 0 1593 855 10 | 0 72 37 52 40 1 0 0 0 0 1593 855 11 | 0 72 37 52 39 1 0 0 0 0 1593 855 12 | 0 72 37 53 40 1 0 0 0 0 1593 855 13 | 0 72 37 52 40 1 0 0 0 0 1593 855 14 | 0 72 37 52 39 1 0 0 0 0 1593 855 15 | 0 72 37 53 39 1 0 0 0 0 1593 855 16 | 0 73 37 53 40 1 0 0 0 0 1593 855 17 | 0 72 37 53 40 1 0 0 0 0 1593 855 18 | 0 72 37 53 39 1 0 0 0 0 1593 855 19 | 0 72 37 53 37 1 0 0 0 0 1593 840 20 | 0 72 37 53 39 1 0 0 0 0 1593 870 21 | 0 72 37 53 39 1 0 0 0 0 1593 870 22 | 0 72 37 53 39 1 0 0 0 0 1593 855 23 | 0 72 37 53 38 1 0 0 0 0 1593 840 24 | 0 72 37 54 38 1 0 0 0 0 1593 825 25 | 0 72 37 53 40 1 0 0 0 0 1593 855 26 | 0 72 37 54 40 1 0 0 0 0 1593 855 27 | 0 72 37 54 41 1 0 0 0 0 1593 855 28 | 0 72 37 53 40 1 0 0 0 0 1593 855 29 | 0 72 37 54 40 1 0 0 0 0 1593 855 30 | 0 72 37 54 39 1 0 0 0 0 1593 855 31 | 0 72 37 53 39 1 0 0 0 0 1593 840 32 | 0 72 37 54 39 1 0 0 0 0 1593 840 33 | 0 72 37 53 40 1 0 0 0 0 1593 840 34 | 0 72 37 54 39 1 0 0 0 0 1593 870 35 | 0 72 37 53 39 1 0 0 0 0 1593 870 36 | 0 73 37 54 39 1 0 0 0 0 1593 870 37 | 0 72 37 52 39 1 0 0 0 0 1593 870 38 | 0 73 37 53 39 1 0 0 0 0 1593 870 39 | 0 72 37 54 38 1 0 0 0 0 1593 855 40 | 0 72 37 53 39 1 0 0 0 0 1593 840 41 | 0 72 37 53 39 1 0 0 0 0 1593 870 42 | 0 72 37 53 39 1 0 0 0 0 1593 870 43 | 0 72 37 53 39 1 0 0 0 0 1593 870 44 | 0 72 37 53 40 1 0 0 0 0 1593 855 45 | 0 73 37 54 40 1 0 0 0 0 1593 855 46 | 0 72 37 53 40 1 0 0 0 0 1593 855 47 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 48 | # Idx W C C % % % % % % MHz MHz 49 | 0 72 37 54 39 1 0 0 0 0 1593 840 50 | 0 72 37 54 39 1 0 0 0 0 1593 840 51 | 0 72 37 54 39 1 0 0 0 0 1593 840 52 | 0 72 37 53 40 1 0 0 0 0 1593 840 53 | 0 72 37 54 39 1 0 0 0 0 1593 870 54 | 0 73 37 54 39 1 0 0 0 0 1593 870 55 | 0 73 37 54 39 1 0 0 0 0 1593 870 56 | 0 72 37 53 39 1 0 0 0 0 1593 870 57 | 0 72 37 53 39 1 0 0 0 0 1593 870 58 | 0 72 37 53 39 1 0 0 0 0 1593 855 59 | 0 72 37 53 39 1 0 0 0 0 1593 855 60 | 0 72 37 53 38 1 0 0 0 0 1593 855 61 | 0 72 37 54 39 1 0 0 0 0 1593 840 62 | 0 72 37 53 39 1 0 0 0 0 1593 840 63 | 0 72 37 54 39 1 0 0 0 0 1593 840 64 | 0 72 37 53 39 1 0 0 0 0 1593 855 65 | 0 73 37 53 40 1 0 0 0 0 1593 855 66 | 0 72 37 53 40 1 0 0 0 0 1593 855 67 | 0 73 37 53 40 1 0 0 0 0 1593 855 68 | 0 72 37 54 37 1 0 0 0 0 1593 855 69 | 0 72 37 53 39 1 0 0 0 0 1593 840 70 | 0 72 37 53 39 1 0 0 0 0 1593 840 71 | 0 72 37 53 39 1 0 0 0 0 1593 870 72 | 0 72 37 53 37 1 0 0 0 0 1593 870 73 | 0 72 37 53 39 1 0 0 0 0 1593 840 74 | 0 72 37 54 39 1 0 0 0 0 1593 840 75 | 0 72 37 53 39 1 0 0 0 0 1593 840 76 | 0 72 37 53 40 1 0 0 0 0 1593 855 77 | 0 72 37 54 40 1 0 0 0 0 1593 855 78 | 0 73 37 53 40 1 0 0 0 0 1593 855 79 | 0 72 37 53 40 1 0 0 0 0 1593 855 80 | 0 72 37 53 
38 1 0 0 0 0 1593 840 81 | 0 72 37 53 39 1 0 0 0 0 1593 840 82 | 0 72 37 53 39 1 0 0 0 0 1593 840 83 | 0 72 37 53 39 1 0 0 0 0 1593 840 84 | 0 72 37 53 40 1 0 0 0 0 1593 825 85 | 0 72 37 53 39 1 0 0 0 0 1593 825 86 | 0 72 37 53 39 1 0 0 0 0 1593 825 87 | 0 72 37 53 41 1 0 0 0 0 1593 855 88 | 0 72 37 53 40 1 0 0 0 0 1593 855 89 | 0 73 37 53 40 1 0 0 0 0 1593 855 90 | 0 72 37 53 39 1 0 0 0 0 1593 855 91 | 0 73 37 53 39 1 0 0 0 0 1593 855 92 | 0 72 37 53 38 1 0 0 0 0 1593 840 93 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 94 | # Idx W C C % % % % % % MHz MHz 95 | 0 72 37 53 38 1 0 0 0 0 1593 840 96 | 0 72 37 53 40 1 0 0 0 0 1593 825 97 | 0 72 37 54 39 1 0 0 0 0 1593 825 98 | 0 72 37 53 39 1 0 0 0 0 1593 825 99 | 0 72 37 53 40 1 0 0 0 0 1593 825 100 | 0 72 37 53 41 1 0 0 0 0 1593 855 101 | 0 72 37 53 39 1 0 0 0 0 1593 855 102 | 0 73 37 53 40 1 0 0 0 0 1593 855 103 | 0 72 37 53 40 1 0 0 0 0 1593 855 104 | 0 72 37 53 40 1 0 0 0 0 1593 855 105 | 0 72 37 53 40 1 0 0 0 0 1593 855 106 | 0 72 37 53 39 1 0 0 0 0 1593 840 107 | 0 72 37 53 39 1 0 0 0 0 1593 840 108 | 0 73 37 53 40 1 0 0 0 0 1593 855 109 | 0 73 37 53 40 1 0 0 0 0 1593 855 110 | 0 72 37 53 39 1 0 0 0 0 1593 840 111 | 0 72 37 53 39 1 0 0 0 0 1593 870 112 | 0 64 37 53 15 0 0 0 0 0 1593 870 113 | 0 64 37 54 0 0 0 0 0 0 1593 825 114 | 0 61 37 54 0 0 0 0 0 0 1593 240 115 | 0 61 37 53 0 0 0 0 0 0 1593 210 116 | -------------------------------------------------------------------------------- /profile/benckmark_logs/bert-b-128_power.csv: -------------------------------------------------------------------------------- 1 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 2 | # Idx W C C % % % % % % MHz MHz 3 | 0 62 38 55 0 0 0 0 0 0 1593 210 4 | 0 62 38 55 0 0 0 0 0 0 1593 210 5 | 0 62 38 55 0 0 0 0 0 0 1593 210 6 | 0 62 38 55 0 0 0 0 0 0 1593 210 7 | 0 62 38 55 0 0 0 0 0 0 1593 210 8 | 0 62 38 55 0 0 0 0 0 0 1593 825 9 | 0 81 39 54 40 3 0 0 0 0 1593 960 10 | 0 81 39 54 40 3 0 0 0 0 1593 975 11 | 0 81 39 55 40 3 0 0 0 0 1593 975 12 | 0 81 39 55 40 3 0 0 0 0 1593 975 13 | 0 81 39 54 40 3 0 0 0 0 1593 975 14 | 0 81 39 54 40 3 0 0 0 0 1593 975 15 | 0 80 39 55 40 3 0 0 0 0 1593 975 16 | 0 81 39 55 39 2 0 0 0 0 1593 975 17 | 0 80 39 55 39 2 0 0 0 0 1593 975 18 | 0 81 39 55 40 2 0 0 0 0 1593 960 19 | 0 81 39 55 40 3 0 0 0 0 1593 975 20 | 0 81 39 54 40 3 0 0 0 0 1593 975 21 | 0 81 39 55 40 3 0 0 0 0 1593 975 22 | 0 81 39 55 40 3 0 0 0 0 1593 975 23 | 0 81 39 55 40 3 0 0 0 0 1593 975 24 | 0 81 39 55 40 3 0 0 0 0 1593 975 25 | 0 81 39 55 40 3 0 0 0 0 1593 975 26 | 0 81 39 54 40 3 0 0 0 0 1593 975 27 | 0 81 39 54 40 3 0 0 0 0 1593 975 28 | 0 81 39 54 40 3 0 0 0 0 1593 975 29 | 0 81 39 55 40 3 0 0 0 0 1593 975 30 | 0 81 39 55 40 3 0 0 0 0 1593 975 31 | 0 81 39 55 40 3 0 0 0 0 1593 975 32 | 0 81 39 55 40 3 0 0 0 0 1593 975 33 | 0 81 39 54 40 3 0 0 0 0 1593 975 34 | 0 81 39 55 40 3 0 0 0 0 1593 975 35 | 0 81 39 55 40 3 0 0 0 0 1593 975 36 | 0 81 39 54 40 3 0 0 0 0 1593 975 37 | 0 78 39 55 27 2 0 0 0 0 1593 945 38 | 0 81 39 55 40 3 0 0 0 0 1593 975 39 | 0 81 39 55 40 3 0 0 0 0 1593 975 40 | 0 81 39 55 40 3 0 0 0 0 1593 975 41 | 0 81 39 55 40 3 0 0 0 0 1593 975 42 | 0 81 39 55 40 3 0 0 0 0 1593 975 43 | 0 81 39 55 40 3 0 0 0 0 1593 975 44 | 0 81 39 54 40 3 0 0 0 0 1593 975 45 | 0 81 39 55 40 3 0 0 0 0 1593 975 46 | 0 81 39 55 39 3 0 0 0 0 1593 975 47 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 48 | # Idx W C C % % % % % % MHz MHz 49 | 0 81 39 54 39 3 0 0 0 0 1593 975 50 | 0 81 39 55 39 3 0 0 0 0 1593 975 51 | 0 81 39 55 39 3 0 0 0 
0 1593 975 52 | 0 81 39 55 39 2 0 0 0 0 1593 975 53 | 0 81 39 55 39 2 0 0 0 0 1593 960 54 | 0 81 39 55 39 2 0 0 0 0 1593 960 55 | 0 81 39 55 40 3 0 0 0 0 1593 960 56 | 0 81 39 54 40 3 0 0 0 0 1593 975 57 | 0 81 39 55 40 3 0 0 0 0 1593 975 58 | 0 81 39 55 40 3 0 0 0 0 1593 975 59 | 0 81 39 54 40 3 0 0 0 0 1593 975 60 | 0 81 39 55 39 3 0 0 0 0 1593 975 61 | 0 81 39 55 40 3 0 0 0 0 1593 975 62 | 0 81 39 55 40 3 0 0 0 0 1593 975 63 | 0 81 39 54 39 3 0 0 0 0 1593 975 64 | 0 81 39 55 40 3 0 0 0 0 1593 975 65 | 0 81 39 55 40 3 0 0 0 0 1593 975 66 | 0 81 39 55 40 3 0 0 0 0 1593 975 67 | 0 81 39 54 40 3 0 0 0 0 1593 975 68 | 0 81 39 54 39 2 0 0 0 0 1593 975 69 | 0 71 39 54 39 3 0 0 0 0 1593 945 70 | 0 81 39 55 40 3 0 0 0 0 1593 975 71 | 0 81 39 55 39 3 0 0 0 0 1593 975 72 | 0 81 39 55 40 3 0 0 0 0 1593 975 73 | 0 81 39 55 40 3 0 0 0 0 1593 975 74 | 0 81 39 55 40 3 0 0 0 0 1593 975 75 | 0 81 39 55 40 3 0 0 0 0 1593 975 76 | 0 81 39 56 40 3 0 0 0 0 1593 975 77 | 0 81 39 55 40 3 0 0 0 0 1593 975 78 | 0 81 39 54 39 3 0 0 0 0 1593 975 79 | 0 81 39 55 40 3 0 0 0 0 1593 975 80 | 0 81 39 54 40 3 0 0 0 0 1593 975 81 | 0 81 39 55 40 3 0 0 0 0 1593 975 82 | 0 81 39 55 40 3 0 0 0 0 1593 975 83 | 0 81 39 54 40 3 0 0 0 0 1593 975 84 | 0 81 39 55 40 3 0 0 0 0 1593 975 85 | 0 81 39 54 39 3 0 0 0 0 1593 975 86 | 0 81 39 56 39 2 0 0 0 0 1593 975 87 | 0 81 39 55 40 2 0 0 0 0 1593 960 88 | 0 81 39 55 39 2 0 0 0 0 1593 960 89 | 0 81 39 55 40 3 0 0 0 0 1593 960 90 | 0 81 39 55 40 3 0 0 0 0 1593 975 91 | 0 81 39 55 40 3 0 0 0 0 1593 975 92 | 0 80 39 55 38 2 0 0 0 0 1593 975 93 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 94 | # Idx W C C % % % % % % MHz MHz 95 | 0 80 39 55 38 2 0 0 0 0 1593 960 96 | 0 80 39 56 39 2 0 0 0 0 1593 945 97 | 0 80 39 56 39 2 0 0 0 0 1593 945 98 | 0 81 39 55 40 3 0 0 0 0 1593 975 99 | 0 81 40 55 40 3 0 0 0 0 1593 975 100 | 0 81 40 56 40 3 0 0 0 0 1593 975 101 | 0 81 39 55 39 3 0 0 0 0 1593 975 102 | 0 80 39 55 40 3 0 0 0 0 1593 975 103 | 0 80 39 55 38 2 0 0 0 0 1593 975 104 | 0 80 39 56 39 2 0 0 0 0 1593 945 105 | 0 81 39 55 40 2 0 0 0 0 1593 945 106 | 0 81 40 55 40 3 0 0 0 0 1593 975 107 | 0 81 40 55 39 2 0 0 0 0 1593 975 108 | 0 81 40 55 40 3 0 0 0 0 1593 975 109 | 0 81 40 55 40 3 0 0 0 0 1593 975 110 | 0 81 40 54 40 3 0 0 0 0 1593 975 111 | 0 81 40 55 40 3 0 0 0 0 1593 975 112 | 0 65 39 55 7 0 0 0 0 0 1593 825 113 | 0 65 39 55 0 0 0 0 0 0 1593 825 114 | 0 62 39 55 0 0 0 0 0 0 1593 330 115 | 0 62 39 55 0 0 0 0 0 0 1593 210 116 | 0 62 38 55 0 0 0 0 0 0 1593 210 117 | -------------------------------------------------------------------------------- /profile/benckmark_logs/bert-b-384_power.csv: -------------------------------------------------------------------------------- 1 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 2 | # Idx W C C % % % % % % MHz MHz 3 | 0 62 38 55 0 0 0 0 0 0 1593 210 4 | 0 62 38 55 0 0 0 0 0 0 1593 210 5 | 0 62 38 55 0 0 0 0 0 0 1593 210 6 | 0 62 38 55 0 0 0 0 0 0 1593 210 7 | 0 62 38 55 0 0 0 0 0 0 1593 210 8 | 0 65 38 55 0 0 0 0 0 0 1593 825 9 | 0 113 41 55 39 3 0 0 0 0 1593 1290 10 | 0 113 41 55 38 3 0 0 0 0 1593 1290 11 | 0 111 41 56 40 3 0 0 0 0 1593 1245 12 | 0 115 41 56 40 4 0 0 0 0 1593 1290 13 | 0 115 41 56 40 4 0 0 0 0 1593 1290 14 | 0 112 41 56 40 4 0 0 0 0 1593 1275 15 | 0 113 41 56 40 3 0 0 0 0 1593 1275 16 | 0 114 41 56 40 4 0 0 0 0 1593 1290 17 | 0 116 41 55 40 4 0 0 0 0 1593 1290 18 | 0 116 41 56 40 4 0 0 0 0 1593 1290 19 | 0 116 41 55 40 4 0 0 0 0 1593 1290 20 | 0 115 41 55 40 4 0 0 0 0 1593 1290 21 | 0 116 41 56 40 4 0 0 0 0 1593 
1290 22 | 0 116 42 55 40 4 0 0 0 0 1593 1290 23 | 0 116 42 55 40 4 0 0 0 0 1593 1290 24 | 0 116 42 56 40 4 0 0 0 0 1593 1290 25 | 0 114 42 56 38 3 0 0 0 0 1593 1290 26 | 0 116 42 55 40 4 0 0 0 0 1593 1290 27 | 0 116 42 56 40 4 0 0 0 0 1593 1290 28 | 0 115 42 56 40 4 0 0 0 0 1593 1290 29 | 0 114 42 56 38 3 0 0 0 0 1593 1290 30 | 0 111 41 56 39 3 0 0 0 0 1593 1260 31 | 0 116 42 56 40 4 0 0 0 0 1593 1290 32 | 0 116 42 56 40 4 0 0 0 0 1593 1290 33 | 0 116 42 56 40 4 0 0 0 0 1593 1290 34 | 0 116 42 56 40 4 0 0 0 0 1593 1290 35 | 0 116 42 56 40 4 0 0 0 0 1593 1290 36 | 0 115 42 56 40 4 0 0 0 0 1593 1290 37 | 0 116 42 56 39 3 0 0 0 0 1593 1290 38 | 0 116 42 56 40 4 0 0 0 0 1593 1290 39 | 0 116 42 56 40 4 0 0 0 0 1593 1290 40 | 0 114 42 56 38 3 0 0 0 0 1593 1290 41 | 0 116 42 56 39 3 0 0 0 0 1593 1290 42 | 0 116 42 56 40 4 0 0 0 0 1593 1290 43 | 0 116 42 56 40 4 0 0 0 0 1593 1290 44 | 0 116 42 56 40 4 0 0 0 0 1593 1290 45 | 0 115 42 56 39 3 0 0 0 0 1593 1290 46 | 0 115 42 56 35 3 0 0 0 0 1593 1290 47 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 48 | # Idx W C C % % % % % % MHz MHz 49 | 0 113 42 56 40 4 0 0 0 0 1593 1275 50 | 0 113 42 56 40 4 0 0 0 0 1593 1275 51 | 0 113 42 56 40 3 0 0 0 0 1593 1275 52 | 0 116 42 56 39 4 0 0 0 0 1593 1290 53 | 0 116 42 56 40 4 0 0 0 0 1593 1290 54 | 0 114 42 56 38 3 0 0 0 0 1593 1290 55 | 0 114 42 56 38 3 0 0 0 0 1593 1290 56 | 0 109 42 56 40 3 0 0 0 0 1593 1245 57 | 0 111 42 56 40 3 0 0 0 0 1593 1245 58 | 0 116 42 56 40 4 0 0 0 0 1593 1290 59 | 0 116 42 56 40 4 0 0 0 0 1593 1290 60 | 0 115 42 56 40 4 0 0 0 0 1593 1290 61 | 0 116 42 56 40 4 0 0 0 0 1593 1290 62 | 0 112 42 56 38 3 0 0 0 0 1593 1275 63 | 0 111 42 56 39 3 0 0 0 0 1593 1260 64 | 0 116 42 56 40 4 0 0 0 0 1593 1290 65 | 0 116 42 56 40 4 0 0 0 0 1593 1290 66 | 0 115 42 55 40 4 0 0 0 0 1593 1290 67 | 0 116 42 55 40 4 0 0 0 0 1593 1290 68 | 0 116 42 56 40 4 0 0 0 0 1593 1290 69 | 0 116 42 55 40 4 0 0 0 0 1593 1290 70 | 0 113 42 55 40 4 0 0 0 0 1593 1290 71 | 0 116 42 55 40 4 0 0 0 0 1593 1290 72 | 0 116 42 55 40 4 0 0 0 0 1593 1290 73 | 0 116 42 55 40 4 0 0 0 0 1593 1290 74 | 0 112 42 55 39 3 0 0 0 0 1593 1275 75 | 0 111 42 56 39 3 0 0 0 0 1593 1260 76 | 0 109 42 55 39 3 0 0 0 0 1593 1245 77 | 0 109 42 55 40 3 0 0 0 0 1593 1245 78 | 0 84 41 55 39 3 0 0 0 0 1593 1245 79 | 0 116 42 55 40 4 0 0 0 0 1593 1290 80 | 0 116 42 56 40 4 0 0 0 0 1593 1290 81 | 0 116 42 56 40 4 0 0 0 0 1593 1290 82 | 0 116 42 55 40 4 0 0 0 0 1593 1290 83 | 0 116 42 56 40 4 0 0 0 0 1593 1290 84 | 0 115 42 56 40 4 0 0 0 0 1593 1290 85 | 0 116 42 56 40 4 0 0 0 0 1593 1290 86 | 0 110 42 56 39 3 0 0 0 0 1593 1260 87 | 0 116 42 56 40 4 0 0 0 0 1593 1290 88 | 0 116 42 56 40 4 0 0 0 0 1593 1290 89 | 0 116 42 56 40 4 0 0 0 0 1593 1290 90 | 0 112 42 56 39 3 0 0 0 0 1593 1275 91 | 0 111 42 56 39 3 0 0 0 0 1593 1260 92 | 0 109 42 56 40 3 0 0 0 0 1593 1245 93 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 94 | # Idx W C C % % % % % % MHz MHz 95 | 0 109 42 56 40 3 0 0 0 0 1593 1245 96 | 0 111 42 56 41 4 0 0 0 0 1593 1290 97 | 0 116 42 56 40 4 0 0 0 0 1593 1290 98 | 0 116 42 56 40 4 0 0 0 0 1593 1290 99 | 0 115 42 56 40 4 0 0 0 0 1593 1290 100 | 0 116 42 56 39 4 0 0 0 0 1593 1290 101 | 0 116 42 56 40 4 0 0 0 0 1593 1290 102 | 0 116 42 56 40 4 0 0 0 0 1593 1290 103 | 0 114 42 55 40 4 0 0 0 0 1593 1290 104 | 0 112 42 56 39 3 0 0 0 0 1593 1275 105 | 0 111 42 57 39 3 0 0 0 0 1593 1260 106 | 0 110 42 56 40 3 0 0 0 0 1593 1245 107 | 0 110 42 56 40 3 0 0 0 0 1593 1245 108 | 0 109 42 56 40 3 0 0 0 0 1593 1245 109 | 0 109 42 56 40 3 0 0 
0 0 1593 1245 110 | 0 103 42 56 40 3 0 0 0 0 1593 1230 111 | 0 116 42 56 40 4 0 0 0 0 1593 1290 112 | 0 74 40 56 16 0 0 0 0 0 1593 1290 113 | 0 63 40 56 0 0 0 0 0 0 1593 555 114 | 0 63 39 56 0 0 0 0 0 0 1593 345 115 | 0 62 39 56 0 0 0 0 0 0 1593 225 116 | 0 62 39 56 0 0 0 0 0 0 1593 210 117 | -------------------------------------------------------------------------------- /profile/benckmark_logs/deit-t_power.csv: -------------------------------------------------------------------------------- 1 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 2 | # Idx W C C % % % % % % MHz MHz 3 | 0 61 36 52 0 0 0 0 0 0 1593 210 4 | 0 61 36 52 0 0 0 0 0 0 1593 210 5 | 0 61 36 53 0 0 0 0 0 0 1593 210 6 | 0 61 36 52 0 0 0 0 0 0 1593 210 7 | 0 61 36 53 0 0 0 0 0 0 1593 210 8 | 0 61 36 52 0 0 0 0 0 0 1593 210 9 | 0 61 36 52 0 0 0 0 0 0 1593 210 10 | 0 68 37 52 32 0 0 0 0 0 1593 825 11 | 0 68 37 54 37 0 0 0 0 0 1593 825 12 | 0 68 37 52 38 0 0 0 0 0 1593 825 13 | 0 68 37 53 38 0 0 0 0 0 1593 825 14 | 0 68 37 53 37 0 0 0 0 0 1593 825 15 | 0 68 37 54 37 0 0 0 0 0 1593 825 16 | 0 68 37 54 38 0 0 0 0 0 1593 825 17 | 0 68 37 54 38 0 0 0 0 0 1593 825 18 | 0 68 37 53 38 0 0 0 0 0 1593 825 19 | 0 68 37 52 38 0 0 0 0 0 1593 825 20 | 0 68 37 52 38 0 0 0 0 0 1593 825 21 | 0 68 37 53 38 0 0 0 0 0 1593 825 22 | 0 68 37 53 37 0 0 0 0 0 1593 825 23 | 0 68 37 53 38 0 0 0 0 0 1593 825 24 | 0 68 37 54 38 0 0 0 0 0 1593 825 25 | 0 68 37 53 38 0 0 0 0 0 1593 825 26 | 0 68 37 53 38 0 0 0 0 0 1593 825 27 | 0 68 37 53 38 0 0 0 0 0 1593 825 28 | 0 68 37 53 37 0 0 0 0 0 1593 825 29 | 0 68 37 53 37 0 0 0 0 0 1593 825 30 | 0 68 37 53 37 0 0 0 0 0 1593 825 31 | 0 68 37 53 37 0 0 0 0 0 1593 825 32 | 0 68 37 53 38 0 0 0 0 0 1593 825 33 | 0 68 37 53 25 0 0 0 0 0 1593 825 34 | 0 68 37 53 38 0 0 0 0 0 1593 825 35 | 0 68 37 54 38 0 0 0 0 0 1593 825 36 | 0 68 37 53 38 0 0 0 0 0 1593 825 37 | 0 68 37 53 38 0 0 0 0 0 1593 825 38 | 0 68 37 53 38 0 0 0 0 0 1593 825 39 | 0 68 37 53 38 0 0 0 0 0 1593 825 40 | 0 68 37 53 38 0 0 0 0 0 1593 825 41 | 0 68 37 53 38 0 0 0 0 0 1593 825 42 | 0 68 37 53 37 0 0 0 0 0 1593 825 43 | 0 68 37 53 37 0 0 0 0 0 1593 825 44 | 0 68 37 53 37 0 0 0 0 0 1593 825 45 | 0 68 37 54 37 0 0 0 0 0 1593 825 46 | 0 68 37 53 37 0 0 0 0 0 1593 825 47 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 48 | # Idx W C C % % % % % % MHz MHz 49 | 0 68 37 53 37 0 0 0 0 0 1593 825 50 | 0 68 37 53 37 0 0 0 0 0 1593 825 51 | 0 68 37 53 38 0 0 0 0 0 1593 825 52 | 0 68 37 53 38 0 0 0 0 0 1593 825 53 | 0 68 37 53 37 0 0 0 0 0 1593 825 54 | 0 68 37 53 37 0 0 0 0 0 1593 825 55 | 0 68 37 53 37 0 0 0 0 0 1593 825 56 | 0 68 37 53 38 0 0 0 0 0 1593 825 57 | 0 68 37 53 37 0 0 0 0 0 1593 825 58 | 0 68 37 53 37 0 0 0 0 0 1593 825 59 | 0 68 37 53 38 0 0 0 0 0 1593 825 60 | 0 68 37 53 38 0 0 0 0 0 1593 825 61 | 0 68 37 53 37 0 0 0 0 0 1593 825 62 | 0 68 37 53 37 0 0 0 0 0 1593 825 63 | 0 68 37 53 37 0 0 0 0 0 1593 825 64 | 0 68 37 53 37 0 0 0 0 0 1593 825 65 | 0 68 37 53 22 0 0 0 0 0 1593 825 66 | 0 68 37 53 37 0 0 0 0 0 1593 825 67 | 0 68 37 53 37 0 0 0 0 0 1593 825 68 | 0 68 37 53 37 0 0 0 0 0 1593 825 69 | 0 68 37 53 37 0 0 0 0 0 1593 825 70 | 0 68 37 53 37 0 0 0 0 0 1593 825 71 | 0 68 37 53 38 0 0 0 0 0 1593 825 72 | 0 68 37 53 38 0 0 0 0 0 1593 825 73 | 0 68 37 53 38 0 0 0 0 0 1593 825 74 | 0 68 37 53 38 0 0 0 0 0 1593 825 75 | 0 68 37 53 38 0 0 0 0 0 1593 825 76 | 0 68 37 53 38 0 0 0 0 0 1593 825 77 | 0 68 37 53 37 0 0 0 0 0 1593 825 78 | 0 68 37 53 37 0 0 0 0 0 1593 825 79 | 0 68 37 53 37 0 0 0 0 0 1593 825 80 | 0 68 37 53 37 0 0 0 0 0 
1593 825 81 | 0 68 37 53 37 0 0 0 0 0 1593 825 82 | 0 68 37 53 38 0 0 0 0 0 1593 825 83 | 0 68 37 53 37 0 0 0 0 0 1593 825 84 | 0 68 37 53 37 0 0 0 0 0 1593 825 85 | 0 68 37 53 38 0 0 0 0 0 1593 825 86 | 0 68 37 53 37 0 0 0 0 0 1593 825 87 | 0 68 37 53 37 0 0 0 0 0 1593 825 88 | 0 68 37 53 37 0 0 0 0 0 1593 825 89 | 0 68 37 53 38 0 0 0 0 0 1593 825 90 | 0 68 37 53 37 0 0 0 0 0 1593 825 91 | 0 68 37 53 37 0 0 0 0 0 1593 825 92 | 0 68 37 53 37 0 0 0 0 0 1593 825 93 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 94 | # Idx W C C % % % % % % MHz MHz 95 | 0 68 37 53 37 0 0 0 0 0 1593 825 96 | 0 68 37 53 37 0 0 0 0 0 1593 825 97 | 0 68 37 53 37 0 0 0 0 0 1593 825 98 | 0 68 37 53 37 0 0 0 0 0 1593 825 99 | 0 68 37 53 37 0 0 0 0 0 1593 825 100 | 0 68 37 53 37 0 0 0 0 0 1593 825 101 | 0 68 37 53 37 0 0 0 0 0 1593 825 102 | 0 68 37 53 37 0 0 0 0 0 1593 825 103 | 0 68 37 53 37 0 0 0 0 0 1593 825 104 | 0 68 37 53 38 0 0 0 0 0 1593 825 105 | 0 68 37 53 37 0 0 0 0 0 1593 825 106 | 0 68 37 53 37 0 0 0 0 0 1593 825 107 | 0 68 37 53 37 0 0 0 0 0 1593 825 108 | 0 68 37 53 37 0 0 0 0 0 1593 825 109 | 0 68 37 53 38 0 0 0 0 0 1593 825 110 | 0 68 37 53 37 0 0 0 0 0 1593 825 111 | 0 68 37 53 37 0 0 0 0 0 1593 825 112 | 0 68 37 53 38 0 0 0 0 0 1593 825 113 | 0 64 37 53 37 0 0 0 0 0 1593 825 114 | 0 64 37 53 0 0 0 0 0 0 1593 825 115 | 0 64 37 53 0 0 0 0 0 0 1593 825 116 | 0 61 37 53 0 0 0 0 0 0 1593 240 117 | 0 61 37 53 0 0 0 0 0 0 1593 210 118 | -------------------------------------------------------------------------------- /profile/benckmark_logs/bert-l-320_power.csv: -------------------------------------------------------------------------------- 1 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 2 | # Idx W C C % % % % % % MHz MHz 3 | 0 61 36 53 0 0 0 0 0 0 1593 210 4 | 0 61 36 53 0 0 0 0 0 0 1593 210 5 | 0 61 36 52 0 0 0 0 0 0 1593 210 6 | 0 61 36 53 0 0 0 0 0 0 1593 210 7 | 0 61 36 52 0 0 0 0 0 0 1593 210 8 | 0 61 36 53 0 0 0 0 0 0 1593 210 9 | 0 64 38 54 33 0 0 0 0 0 1593 825 10 | 0 153 41 53 41 7 0 0 0 0 1593 1410 11 | 0 150 41 54 40 6 0 0 0 0 1593 1410 12 | 0 153 41 54 41 7 0 0 0 0 1593 1410 13 | 0 153 42 54 41 7 0 0 0 0 1593 1410 14 | 0 154 42 54 41 7 0 0 0 0 1593 1410 15 | 0 152 42 55 41 6 0 0 0 0 1593 1410 16 | 0 154 42 54 41 7 0 0 0 0 1593 1410 17 | 0 154 42 54 42 7 0 0 0 0 1593 1410 18 | 0 153 42 54 41 7 0 0 0 0 1593 1410 19 | 0 154 42 55 42 7 0 0 0 0 1593 1410 20 | 0 151 42 55 41 7 0 0 0 0 1593 1410 21 | 0 154 42 55 42 7 0 0 0 0 1593 1410 22 | 0 155 42 55 42 7 0 0 0 0 1593 1410 23 | 0 153 42 55 42 7 0 0 0 0 1593 1410 24 | 0 153 42 55 42 7 0 0 0 0 1593 1410 25 | 0 154 42 55 42 7 0 0 0 0 1593 1410 26 | 0 154 42 55 41 7 0 0 0 0 1593 1410 27 | 0 153 42 55 42 7 0 0 0 0 1593 1410 28 | 0 154 42 55 42 7 0 0 0 0 1593 1410 29 | 0 154 42 56 42 7 0 0 0 0 1593 1410 30 | 0 153 42 55 41 7 0 0 0 0 1593 1410 31 | 0 153 42 55 42 7 0 0 0 0 1593 1410 32 | 0 152 42 55 42 7 0 0 0 0 1593 1410 33 | 0 155 42 56 42 7 0 0 0 0 1593 1410 34 | 0 152 42 55 41 7 0 0 0 0 1593 1410 35 | 0 153 42 56 41 7 0 0 0 0 1593 1410 36 | 0 153 42 56 41 7 0 0 0 0 1593 1410 37 | 0 154 42 55 41 7 0 0 0 0 1593 1410 38 | 0 153 42 56 41 7 0 0 0 0 1593 1410 39 | 0 153 42 56 41 7 0 0 0 0 1593 1410 40 | 0 154 42 56 41 6 0 0 0 0 1593 1410 41 | 0 153 43 55 41 7 0 0 0 0 1593 1410 42 | 0 153 42 56 41 6 0 0 0 0 1593 1410 43 | 0 154 43 55 42 7 0 0 0 0 1593 1410 44 | 0 154 43 55 41 7 0 0 0 0 1593 1410 45 | 0 154 43 56 42 7 0 0 0 0 1593 1410 46 | 0 154 43 56 41 7 0 0 0 0 1593 1410 47 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 48 
| # Idx W C C % % % % % % MHz MHz 49 | 0 155 43 55 41 7 0 0 0 0 1593 1410 50 | 0 154 43 56 42 7 0 0 0 0 1593 1410 51 | 0 155 43 55 41 7 0 0 0 0 1593 1410 52 | 0 154 43 56 41 7 0 0 0 0 1593 1410 53 | 0 154 43 56 42 7 0 0 0 0 1593 1410 54 | 0 154 43 56 41 7 0 0 0 0 1593 1410 55 | 0 154 43 56 42 7 0 0 0 0 1593 1410 56 | 0 154 43 56 42 7 0 0 0 0 1593 1410 57 | 0 155 43 55 42 7 0 0 0 0 1593 1410 58 | 0 154 43 55 41 7 0 0 0 0 1593 1410 59 | 0 153 43 56 42 7 0 0 0 0 1593 1410 60 | 0 154 43 55 41 7 0 0 0 0 1593 1410 61 | 0 154 43 55 41 7 0 0 0 0 1593 1410 62 | 0 154 43 56 42 7 0 0 0 0 1593 1410 63 | 0 154 43 55 42 7 0 0 0 0 1593 1410 64 | 0 155 43 56 42 7 0 0 0 0 1593 1410 65 | 0 154 43 56 42 7 0 0 0 0 1593 1410 66 | 0 154 43 56 42 7 0 0 0 0 1593 1410 67 | 0 154 43 55 42 7 0 0 0 0 1593 1410 68 | 0 154 43 56 41 7 0 0 0 0 1593 1410 69 | 0 154 43 56 42 7 0 0 0 0 1593 1410 70 | 0 155 43 55 42 7 0 0 0 0 1593 1410 71 | 0 155 43 56 41 7 0 0 0 0 1593 1410 72 | 0 154 43 55 42 7 0 0 0 0 1593 1410 73 | 0 153 43 56 41 7 0 0 0 0 1593 1410 74 | 0 153 43 56 41 6 0 0 0 0 1593 1410 75 | 0 153 43 56 41 6 0 0 0 0 1593 1410 76 | 0 153 43 56 41 6 0 0 0 0 1593 1410 77 | 0 155 43 56 41 7 0 0 0 0 1593 1410 78 | 0 155 43 56 42 7 0 0 0 0 1593 1410 79 | 0 154 43 55 41 7 0 0 0 0 1593 1410 80 | 0 155 43 56 42 7 0 0 0 0 1593 1410 81 | 0 154 43 56 42 7 0 0 0 0 1593 1410 82 | 0 155 43 56 41 7 0 0 0 0 1593 1410 83 | 0 154 43 55 42 7 0 0 0 0 1593 1410 84 | 0 155 43 55 42 7 0 0 0 0 1593 1410 85 | 0 154 43 56 42 7 0 0 0 0 1593 1410 86 | 0 155 43 55 42 7 0 0 0 0 1593 1410 87 | 0 154 43 56 41 7 0 0 0 0 1593 1410 88 | 0 155 43 55 41 6 0 0 0 0 1593 1410 89 | 0 155 43 56 42 7 0 0 0 0 1593 1410 90 | 0 155 43 55 42 7 0 0 0 0 1593 1410 91 | 0 153 43 56 41 7 0 0 0 0 1593 1410 92 | 0 155 43 56 41 7 0 0 0 0 1593 1410 93 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 94 | # Idx W C C % % % % % % MHz MHz 95 | 0 155 43 56 42 7 0 0 0 0 1593 1410 96 | 0 155 43 56 42 7 0 0 0 0 1593 1410 97 | 0 154 43 56 42 7 0 0 0 0 1593 1410 98 | 0 155 43 56 42 7 0 0 0 0 1593 1410 99 | 0 155 43 56 42 7 0 0 0 0 1593 1410 100 | 0 155 43 56 42 7 0 0 0 0 1593 1410 101 | 0 154 43 57 41 7 0 0 0 0 1593 1410 102 | 0 155 43 56 41 7 0 0 0 0 1593 1410 103 | 0 155 43 57 41 7 0 0 0 0 1593 1410 104 | 0 155 43 56 41 7 0 0 0 0 1593 1410 105 | 0 154 43 56 41 7 0 0 0 0 1593 1410 106 | 0 155 43 56 41 7 0 0 0 0 1593 1410 107 | 0 154 43 56 41 7 0 0 0 0 1593 1410 108 | 0 153 43 56 41 7 0 0 0 0 1593 1410 109 | 0 153 43 56 41 6 0 0 0 0 1593 1410 110 | 0 153 43 55 41 6 0 0 0 0 1593 1410 111 | 0 154 43 56 41 6 0 0 0 0 1593 1410 112 | 0 153 44 57 41 6 0 0 0 0 1593 1410 113 | 0 85 42 56 41 7 0 0 0 0 1593 1410 114 | 0 82 40 55 0 0 0 0 0 0 1593 1410 115 | 0 64 39 55 0 0 0 0 0 0 1593 825 116 | 0 62 38 55 0 0 0 0 0 0 1593 345 117 | 0 62 38 55 0 0 0 0 0 0 1593 225 118 | 0 61 38 55 0 0 0 0 0 0 1593 210 119 | 0 62 38 55 0 0 0 0 0 0 1593 210 120 | 0 62 38 55 0 0 0 0 0 0 1593 210 121 | 0 62 38 55 0 0 0 0 0 0 1593 210 122 | -------------------------------------------------------------------------------- /profile/benckmark_logs/deit-b-power.csv: -------------------------------------------------------------------------------- 1 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 2 | # Idx W C C % % % % % % MHz MHz 3 | 0 61 37 53 0 0 0 0 0 0 1593 210 4 | 0 61 37 53 0 0 0 0 0 0 1593 210 5 | 0 61 37 53 0 0 0 0 0 0 1593 210 6 | 0 61 37 54 0 0 0 0 0 0 1593 210 7 | 0 61 37 54 0 0 0 0 0 0 1593 210 8 | 0 61 37 54 0 0 0 0 0 0 1593 210 9 | 0 61 37 53 0 0 0 0 0 0 1593 825 10 | 0 86 38 54 41 3 0 0 0 0 
1593 1080 11 | 0 86 38 54 39 3 0 0 0 0 1593 1080 12 | 0 87 38 53 40 3 0 0 0 0 1593 1065 13 | 0 87 38 53 40 3 0 0 0 0 1593 1065 14 | 0 86 38 53 40 3 0 0 0 0 1593 1065 15 | 0 86 38 53 39 3 0 0 0 0 1593 1065 16 | 0 87 38 53 39 3 0 0 0 0 1593 1065 17 | 0 86 38 53 40 3 0 0 0 0 1593 1065 18 | 0 86 38 53 39 3 0 0 0 0 1593 1065 19 | 0 87 38 53 40 3 0 0 0 0 1593 1065 20 | 0 87 38 53 39 3 0 0 0 0 1593 1065 21 | 0 86 38 53 40 3 0 0 0 0 1593 1065 22 | 0 87 38 53 40 3 0 0 0 0 1593 1065 23 | 0 86 38 53 40 3 0 0 0 0 1593 1065 24 | 0 87 38 53 39 3 0 0 0 0 1593 1065 25 | 0 86 38 53 40 3 0 0 0 0 1593 1050 26 | 0 86 38 53 40 3 0 0 0 0 1593 1065 27 | 0 86 38 53 39 3 0 0 0 0 1593 1065 28 | 0 87 38 53 39 3 0 0 0 0 1593 1065 29 | 0 86 38 53 39 3 0 0 0 0 1593 1065 30 | 0 86 38 53 40 3 0 0 0 0 1593 1065 31 | 0 86 38 54 39 3 0 0 0 0 1593 1050 32 | 0 87 38 53 40 3 0 0 0 0 1593 1065 33 | 0 86 38 53 40 3 0 0 0 0 1593 1065 34 | 0 87 38 54 40 3 0 0 0 0 1593 1065 35 | 0 86 38 53 39 3 0 0 0 0 1593 1065 36 | 0 86 38 54 40 3 0 0 0 0 1593 1065 37 | 0 87 38 54 40 3 0 0 0 0 1593 1065 38 | 0 87 38 53 40 3 0 0 0 0 1593 1065 39 | 0 87 38 53 39 3 0 0 0 0 1593 1065 40 | 0 86 38 53 39 3 0 0 0 0 1593 1065 41 | 0 87 38 53 40 3 0 0 0 0 1593 1065 42 | 0 87 38 54 40 3 0 0 0 0 1593 1065 43 | 0 87 38 54 39 3 0 0 0 0 1593 1065 44 | 0 87 38 53 40 3 0 0 0 0 1593 1065 45 | 0 87 38 53 39 3 0 0 0 0 1593 1065 46 | 0 87 38 53 40 3 0 0 0 0 1593 1065 47 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 48 | # Idx W C C % % % % % % MHz MHz 49 | 0 84 38 53 39 3 0 0 0 0 1593 1065 50 | 0 87 38 53 40 3 0 0 0 0 1593 1065 51 | 0 86 38 54 39 3 0 0 0 0 1593 1065 52 | 0 86 38 53 39 3 0 0 0 0 1593 1065 53 | 0 87 38 54 40 3 0 0 0 0 1593 1065 54 | 0 86 38 54 39 3 0 0 0 0 1593 1065 55 | 0 87 38 54 39 3 0 0 0 0 1593 1065 56 | 0 87 38 53 40 3 0 0 0 0 1593 1065 57 | 0 87 38 54 40 3 0 0 0 0 1593 1065 58 | 0 87 38 54 39 3 0 0 0 0 1593 1065 59 | 0 86 38 54 39 3 0 0 0 0 1593 1050 60 | 0 86 38 54 39 3 0 0 0 0 1593 1065 61 | 0 86 38 54 39 3 0 0 0 0 1593 1065 62 | 0 86 38 53 40 3 0 0 0 0 1593 1050 63 | 0 86 38 54 40 3 0 0 0 0 1593 1050 64 | 0 86 38 54 39 3 0 0 0 0 1593 1065 65 | 0 87 38 54 39 3 0 0 0 0 1593 1065 66 | 0 86 38 54 41 3 0 0 0 0 1593 1050 67 | 0 87 38 54 39 3 0 0 0 0 1593 1065 68 | 0 87 38 53 39 3 0 0 0 0 1593 1065 69 | 0 87 38 53 39 3 0 0 0 0 1593 1065 70 | 0 86 38 54 39 3 0 0 0 0 1593 1065 71 | 0 86 38 54 39 3 0 0 0 0 1593 1065 72 | 0 87 38 53 39 3 0 0 0 0 1593 1065 73 | 0 87 38 54 39 3 0 0 0 0 1593 1065 74 | 0 87 38 53 39 3 0 0 0 0 1593 1065 75 | 0 87 38 54 39 3 0 0 0 0 1593 1065 76 | 0 87 38 54 39 3 0 0 0 0 1593 1065 77 | 0 87 38 54 39 3 0 0 0 0 1593 1065 78 | 0 87 38 53 39 3 0 0 0 0 1593 1065 79 | 0 87 38 54 39 3 0 0 0 0 1593 1065 80 | 0 87 38 54 39 3 0 0 0 0 1593 1065 81 | 0 87 38 54 39 3 0 0 0 0 1593 1065 82 | 0 86 38 54 39 3 0 0 0 0 1593 1065 83 | 0 87 38 54 39 3 0 0 0 0 1593 1065 84 | 0 87 38 54 39 3 0 0 0 0 1593 1065 85 | 0 87 38 54 40 3 0 0 0 0 1593 1065 86 | 0 87 38 53 39 3 0 0 0 0 1593 1065 87 | 0 87 38 54 39 3 0 0 0 0 1593 1065 88 | 0 87 38 53 39 3 0 0 0 0 1593 1065 89 | 0 86 38 54 40 3 0 0 0 0 1593 1065 90 | 0 87 38 54 39 3 0 0 0 0 1593 1065 91 | 0 86 38 55 40 3 0 0 0 0 1593 1050 92 | 0 87 38 53 40 3 0 0 0 0 1593 1050 93 | # gpu pwr gtemp mtemp sm mem enc dec jpg ofa mclk pclk 94 | # Idx W C C % % % % % % MHz MHz 95 | 0 86 38 54 40 3 0 0 0 0 1593 1050 96 | 0 86 38 54 40 3 0 0 0 0 1593 1050 97 | 0 86 38 54 39 3 0 0 0 0 1593 1050 98 | 0 86 38 54 41 3 0 0 0 0 1593 1050 99 | 0 87 38 54 39 3 0 0 0 0 1593 1065 100 | 0 86 38 54 39 3 0 0 0 0 1593 
1065 101 | 0 87 38 54 39 3 0 0 0 0 1593 1065 102 | 0 87 38 55 39 3 0 0 0 0 1593 1065 103 | 0 86 38 54 39 3 0 0 0 0 1593 1065 104 | 0 87 38 55 39 3 0 0 0 0 1593 1065 105 | 0 87 38 55 39 3 0 0 0 0 1593 1065 106 | 0 87 38 54 40 3 0 0 0 0 1593 1065 107 | 0 87 38 54 39 3 0 0 0 0 1593 1065 108 | 0 86 38 54 39 3 0 0 0 0 1593 1065 109 | 0 87 38 54 40 3 0 0 0 0 1593 1065 110 | 0 87 38 54 39 3 0 0 0 0 1593 1065 111 | 0 87 38 54 39 3 0 0 0 0 1593 1065 112 | 0 87 38 55 39 3 0 0 0 0 1593 1065 113 | 0 64 37 53 23 0 0 0 0 0 1593 840 114 | 0 64 37 53 0 0 0 0 0 0 1593 825 115 | 0 62 37 53 0 0 0 0 0 0 1593 375 116 | 0 61 37 53 0 0 0 0 0 0 1593 240 117 | 0 61 37 53 0 0 0 0 0 0 1593 210 118 | 0 61 37 53 0 0 0 0 0 0 1593 210 119 | 0 61 37 53 0 0 0 0 0 0 1593 210 120 | 0 61 37 53 0 0 0 0 0 0 1593 210 121 | 0 61 37 53 0 0 0 0 0 0 1593 210 122 | 0 61 37 53 0 0 0 0 0 0 1593 210 123 | 0 61 37 53 0 0 0 0 0 0 1593 210 124 | 0 61 37 53 0 0 0 0 0 0 1593 210 125 | 0 61 37 53 0 0 0 0 0 0 1593 210 126 | 0 61 37 53 0 0 0 0 0 0 1593 210 127 | 0 61 37 53 0 0 0 0 0 0 1593 210 128 | 0 61 37 53 0 0 0 0 0 0 1593 210 129 | 0 61 37 53 0 0 0 0 0 0 1593 210 130 | 0 61 37 53 0 0 0 0 0 0 1593 210 131 | 0 61 37 53 0 0 0 0 0 0 1593 210 132 | 0 61 37 53 0 0 0 0 0 0 1593 210 133 | -------------------------------------------------------------------------------- /software_model/readme.md: -------------------------------------------------------------------------------- 1 | # Train and Inference of DeiT on our lightining-transformer 2 | 3 | --- 4 | 5 | For deit, we built on the official implementation (https://github.com/facebookresearch/deit). 6 | 7 | To model the inference accuracy on our photonic accelerator, we explictly inject the analytic transformation of our photonic tensor core during computation. We consider several nonidealties and inject them during inference, including **input encoding magnitude varation**, **input encoding phase variaion**, **output computation variation**, and **WDM dispersion introduced by multiple wavelength**. 8 | 9 | Please ensure that you have install the required dependencies following instructions in `../readme.md`, before you run jobs. 10 | 11 | --- 12 | 13 | ## Structures 14 | Our code is built upon the offical [DeiT](https://github.com/facebookresearch/deit). 15 | * `./models/quant_vit.py`. The ViT model definition with quantization and analytic transformation of our PTC computation considering different noise resources. 16 | * `./ops/`. Useful utils functions, including the implemented learned-step-size quantization [LSQ](https://github.com/hustzxd/LSQuantization) for transformer quantization. 17 | * `main.py`. The main python file. 18 | * `/scripts/`. This folder contains the scripts for implementing noise-aware training of low-bit DeiT models and testing inference accuracy. 19 | 20 | 21 | 22 | ## Data preparation 23 | 24 | ### Dataset 25 | Download and extract ImageNet train and val images from http://image-net.org/. 26 | The directory structure is the standard layout for the torchvision [`datasets.ImageFolder`](https://pytorch.org/docs/stable/torchvision/datasets.html#imagefolder), and the training and validation data is expected to be in the `train/` folder and `val` folder respectively: 27 | 28 | ``` 29 | /path/to/imagenet/ 30 | train/ 31 | class1/ 32 | img1.jpeg 33 | class2/ 34 | img2.jpeg 35 | val/ 36 | class1/ 37 | img3.jpeg 38 | class2/ 39 | img4.jpeg 40 | ``` 41 | ### Pretrained checkpoints 42 | Download baseline DeiT models pretrained on ImageNet 2012 and put in the `pretrained` directory. 
### Pretrained checkpoints
Download the baseline DeiT models pretrained on ImageNet 2012 and put them in the `pretrained/` directory.

| name | acc@1 | acc@5 | #params | url |
| --- | --- | --- | --- | --- |
| DeiT-tiny | 72.2 | 91.1 | 5M | [model](https://dl.fbaipublicfiles.com/deit/deit_tiny_patch16_224-a1311bcf.pth) |
| DeiT-small | 79.9 | 95.0 | 22M | [model](https://dl.fbaipublicfiles.com/deit/deit_small_patch16_224-cd65a155.pth) |
| DeiT-base | 81.8 | 95.6 | 86M | [model](https://dl.fbaipublicfiles.com/deit/deit_base_patch16_224-b5f2ef4d.pth) |

```
mkdir pretrained
curl -o ./pretrained/deit_tiny_patch16_224-a1311bcf.pth https://dl.fbaipublicfiles.com/deit/deit_tiny_patch16_224-a1311bcf.pth
curl -o ./pretrained/deit_small_patch16_224-cd65a155.pth https://dl.fbaipublicfiles.com/deit/deit_small_patch16_224-cd65a155.pth
curl -o ./pretrained/deit_base_patch16_224-b5f2ef4d.pth https://dl.fbaipublicfiles.com/deit/deit_base_patch16_224-b5f2ef4d.pth
```

### Provided checkpoint for 4-bit DeiT-T
Training DeiT may take days given the quantization and the dedicated photonic-tensor-core computation model.

We provide our 4-bit DeiT-T checkpoint so you can quickly perform evaluation and reproduce our results.
The model is available at this [Google Drive link](https://drive.google.com/uc?id=1EZjEnkqyBaBU8pUrYqNLTYMq4Mn0cbKV).

```
mkdir resumed_ckpt
gdown https://drive.google.com/uc?id=1EZjEnkqyBaBU8pUrYqNLTYMq4Mn0cbKV -O resumed_ckpt/
```

## How to use

### Noise-aware training with a pretrained checkpoint

Train a quantized DeiT model using `./scripts/train_quant_transformer_with_noise.sh`, setting the bit precision, input noise std, output noise std, and other training settings.

Replace the path in `--data-path /path/to/imagenet/data` with the path where you put ImageNet.

`--finetune pretrained/deit_tiny_patch16_224-a1311bcf.pth` should point to the downloaded pretrained model.

```
wbits=4
abits=4
id=4bit
lr=5e-4
weight_decay=1e-8
batch_size=512
epochs=300
port=47771
headwise=1
input_noise_std=0.03
output_noise_std=0.05

torchrun \
    --master_port ${port} \
    --nproc_per_node=4 main.py \
    --model deit_tiny_patch16_224_quant \
    --drop-path 0 \
    --batch-size ${batch_size} \
    --lr ${lr} \
    --min-lr 0 \
    --epochs ${epochs} \
    --warmup-epochs 0 \
    --weight-decay ${weight_decay} \
    --wbits ${wbits} \
    --abits ${abits} \
    --dist-eval \
    --output_dir test/deit_tiny_${id}/${wbits}w${abits}a_bs${batch_size}_baselr${lr}_weightdecay${weight_decay}_ft${epochs}_headwise${headwise}_noise_i_${input_noise_std}_o_${output_noise_std}_linear_noise \
    --finetune pretrained/deit_tiny_patch16_224-a1311bcf.pth \
    --data-path /path/to/imagenet/data \
    --headwise \
    --input_noise_std ${input_noise_std} \
    --output_noise_std ${output_noise_std} \
    --enable_linear_noise
```

### Evaluation of a trained model with noise injection

Test the inference accuracy of a trained DeiT model using `./scripts/evaluate_quant_transformer.sh`, setting the corresponding noise levels.

* input_noise_std: noise std of the input magnitude encoding. Default is 0.03.
* phase_noise_std: noise std of the input phase encoding. Default is $2^{\circ}$.
* output_noise_std: noise std of the computed outputs. Default is 0.05.
* num_wavelength: number of wavelengths used in the system; the wavelength-induced dispersion error is computed from it. Default is 12.
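The exact analytic PTC transformation is implemented in `./models/quant_vit.py`. As a rough mental model only (this sketch is ours, not the repo's code), the first three knobs can be viewed as Gaussian perturbations around an ideal linear operation, while num_wavelength and channel_spacing determine a deterministic dispersion error that is omitted here:

```
import math
import torch
import torch.nn.functional as F

def noisy_linear(x, weight, input_noise_std=0.03, phase_noise_std_deg=2.0, output_noise_std=0.05):
    # Illustrative sketch only -- NOT the implementation in quant_vit.py.
    # Input magnitude encoding noise: relative Gaussian error on the encoded operands.
    x = x * (1.0 + torch.randn_like(x) * input_noise_std)
    # Input phase encoding noise: a small random phase error attenuates the encoded value.
    phase_err = torch.randn_like(x) * (phase_noise_std_deg * math.pi / 180.0)
    x = x * torch.cos(phase_err)
    # Ideal computation y = x W^T, as a standard linear layer would produce.
    y = F.linear(x, weight)
    # Output (readout) noise: relative Gaussian error on the computed result.
    # WDM dispersion (num_wavelength, channel_spacing) would add a further
    # wavelength-dependent error term, which is omitted in this sketch.
    return y * (1.0 + torch.randn_like(y) * output_noise_std)
```

Because these perturbations are random, the AE scripts later in this readme repeat each measurement with several seeds and report the mean and std.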
Set `resumed_ckpt_path='./your/path/to/best_checkpoint.pth'` in the script.

```
exp='eval_accuracy'
wbits=4
abits=4
id=4bit
headwise=1

# noise settings
input_noise_std=0.03
output_noise_std=0.05
# the following settings are added for inference only
phase_noise_std=2
num_wavelength=12
channel_spacing=0.4
seed=0

resumed_ckpt_path='./your/path/to/best_checkpoint.pth'

for i in {1..1}
do
for input_noise_std in 0.03
do
CUDA_VISIBLE_DEVICES=0 python main.py --eval \
    --resume ${resumed_ckpt_path} \
    --model deit_tiny_patch16_224_quant \
    --drop-path 0 \
    --wbits ${wbits} \
    --abits ${abits} \
    --data-path /path/to/imagenet/data \
    --headwise \
    --input_noise_std ${input_noise_std} \
    --output_noise_std ${output_noise_std} \
    --phase_noise_std ${phase_noise_std} \
    --num_wavelength ${num_wavelength} \
    --channel_spacing ${channel_spacing} \
    --seed $((seed + i)) \
    --enable_wdm_noise \
    --enable_linear_noise
done
done
```
It will produce output like the following:
```
Test: Total time: 0:29:16 (3.3723 s / it)
* Acc@1 71.052 Acc@5 90.432 loss 1.287
Accuracy of the network on the 50000 test images: 71.1%
```

---

## AE experiments: Reproduce the reported results of the accuracy and robustness analysis

We test the robustness of models running on our photonic accelerator by sweeping various on-chip noise sources.
* input_noise_std: noise std of the input magnitude encoding. Default is 0.03.
* phase_noise_std: noise std of the input phase encoding. Default is $2^{\circ}$.
* output_noise_std: noise std of the computed outputs. Default is 0.05.
* num_wavelength: number of wavelengths used in the system; the wavelength-induced dispersion error is computed from it. Default is 12.

### Download our checkpoint

One trained 4-bit DeiT-T model is provided for quickly reproducing the results.

Download it as follows:
```
mkdir resumed_ckpt
gdown https://drive.google.com/uc?id=1EZjEnkqyBaBU8pUrYqNLTYMq4Mn0cbKV -O resumed_ckpt/
```
The checkpoint will be placed in `./resumed_ckpt/`.

### Launch jobs with noise sweeping

Run `./scripts/evaluate_quant_transformer_scan_noise.sh` to measure the accuracy at varying noise levels.
Each noise setting is evaluated three times with different seeds.

By uncommenting the corresponding line in the script, you can reproduce the experiments for sweeping the input noise std, the phase noise std, and the number of wavelengths.
```
for input_noise_std in 0.03 0.04 0.05 0.06 0.07 0.08 ## uncomment this line when scanning input noise
# for phase_noise_std in 2 3 4 5 6 7 ## uncomment this line when scanning phase noise
# for num_wavelength in 8 12 16 20 24 ## uncomment this line when scanning # wavelength
```

Redirect the output of `./scripts/evaluate_quant_transformer_scan_noise.sh` to a log file, then use the provided script `./scripts/process_output_logs.sh` to process the logs.
You will get the parsed accuracy as well as the mean and std in a CSV file.

```
./scripts/evaluate_quant_transformer_scan_noise.sh &> results.log # redirect results to a log file
./scripts/process_output_logs.sh # set the log file path, the number of iterations, and the number of noise values you sweep in the script
```
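If you want to double-check the aggregation performed by `process_output_logs.sh`, the same statistics can be recomputed directly from the redirected log with a short Python snippet (a sketch that assumes one `* Acc@1 ...` line per evaluation, repetitions looped outermost as in the scan script, and a configurable number of noise values; the shell script remains the reference):

```
import csv
import re
import statistics

log_text = open("results.log").read()      # the redirected log from the scan script
num_noise_levels = 6                       # e.g. input_noise_std in 0.03 ... 0.08

# Collect the Acc@1 values in the order they appear in the log.
accs = [float(m.group(1)) for m in re.finditer(r"\* Acc@1 (\d+\.\d+)", log_text)]
num_runs = len(accs) // num_noise_levels   # number of repetitions (3 by default)

with open("parsed_accuracy.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow([f"test{r + 1}" for r in range(num_runs)] + ["mean", "std"])
    for level in range(num_noise_levels):
        runs = [accs[r * num_noise_levels + level] for r in range(num_runs)]
        writer.writerow(runs + [statistics.mean(runs), statistics.stdev(runs)])
```

Either way, each output row corresponds to one noise value, in the format shown next.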
The expected results will look like the following:

```
test1,test2,test3,mean,std
71.174,71.014,70.99,71.05933333333333,0.10002666311206546
71.052,71.1,70.972,71.04133333333333,0.06466323014923916
71.034,70.924,70.924,70.96066666666667,0.06350852961085851
70.99,70.952,71.144,71.02866666666667,0.10167267741794891
71.206,70.82,71.184,71.07,0.21678560837842034
```
The first three columns are the accuracies of three runs with different seeds, followed by two columns with the mean and std of those three values.

Each row corresponds to a different noise value, swept as configured in `./scripts/evaluate_quant_transformer_scan_noise.sh`.

### Script: `./scripts/evaluate_quant_transformer_scan_noise.sh`
```
exp='eval_accuracy_scan_noise'
wbits=4
abits=4
id=4bit
headwise=1

# noise settings
input_noise_std=0.03
output_noise_std=0.05
# the following settings are added for inference only
phase_noise_std=2
num_wavelength=12
channel_spacing=0.4
seed=0

resumed_ckpt_path='./resumed_ckpt/best_checkpoint.pth'

for i in {1..3}
do
for input_noise_std in 0.03 0.04 0.05 0.06 0.07 0.08 ## uncomment this line when scanning input noise
# for phase_noise_std in 2 3 4 5 6 7 ## uncomment this line when scanning phase noise
# for num_wavelength in 8 12 16 20 24 ## uncomment this line when scanning # wavelength
do
CUDA_VISIBLE_DEVICES=0 python main.py --eval \
    --resume ${resumed_ckpt_path} \
    --model deit_tiny_patch16_224_quant \
    --drop-path 0 \
    --wbits ${wbits} \
    --abits ${abits} \
    --data-path /path/to/imagenet/data \
    --headwise \
    --input_noise_std ${input_noise_std} \
    --output_noise_std ${output_noise_std} \
    --phase_noise_std ${phase_noise_std} \
    --num_wavelength ${num_wavelength} \
    --channel_spacing ${channel_spacing} \
    --seed $((seed + i)) \
    --enable_wdm_noise \
    --enable_linear_noise
done
done
```
--------------------------------------------------------------------------------