├── .gitignore
├── README.md
├── __init__.py
├── cli_benchmark.py
├── cli_perf_visual.py
├── cli_structure_analyzer.py
├── figures
    ├── Qwen3-32B_a100-sxm-80gb_flops_vs_seq_len.png
    ├── Qwen3-32B_a100-sxm-80gb_interactive.html
    ├── Qwen3-32B_a100-sxm-80gb_latency_vs_seq_len.png
    ├── Qwen3-32B_a100-sxm-80gb_memory_vs_seq_len.png
    ├── Qwen3-32B_a100-sxm-80gb_overview.png
    ├── Qwen3-32B_a100-sxm-80gb_throughput_vs_seq_len.png
    ├── grpah_decode_llama2-70B_tp4_bs16_seqlen1024_genlen128.png
    ├── grpah_prefill_llama2-70B_tp4_bs16_seqlen1024_genlen128.png
    └── roofline_analysis_optimized.png
├── images
    ├── flops_decode_llama2-70b_tp8_bs32_seqlen1024_genlen128.png
    ├── flops_prefill_llama2-70b_tp8_bs32_seqlen1024_genlen128.png
    ├── grpah_decode_llama2-70b_tp8_bs32_seqlen1024_genlen128.png
    ├── grpah_prefill_llama2-70b_tp8_bs32_seqlen1024_genlen128.png
    ├── latency_decode_llama2-70b_tp8_bs32_seqlen1024_genlen128.png
    ├── latency_prefill_llama2-70b_tp8_bs32_seqlen1024_genlen128.png
    └── params_llama2-70b_tp8_bs32_seqlen1024_genlen128.png
├── llm_counts
    ├── benchmark_analyzer.py
    ├── configs
    │   ├── gpu_configs.json
    │   ├── gpu_perf.ini
    │   └── model_configs.json
    ├── count_flops.py
    ├── count_latency.py
    ├── count_memory.py
    ├── count_params.py
    ├── layer_graph_visualizer.py
    ├── roofline_model.py
    └── utils
    │   ├── __pycache__
    │       ├── config.cpython-310.pyc
    │       ├── config.cpython-311.pyc
    │       ├── config.cpython-312.pyc
    │       ├── constants.cpython-310.pyc
    │       ├── constants.cpython-311.pyc
    │       ├── constants.cpython-312.pyc
    │       ├── utils.cpython-310.pyc
    │       └── utils.cpython-311.pyc
    │   ├── config.py
    │   ├── constants.py
    │   ├── utils.py
    │   └── visualizer.py
└── test_torch_info.py


/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | tmp
3 | .ruff_cache
4 | **/__pycache__/


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # llm_profiler
  2 | 
  3 | llm theoretical performance analysis tools and support params, flops, memory and latency analysis.
  4 | 
  5 | ## 主要功能
  6 | 
  7 | - 支持 qwen2.5、qwen3 dense 系列模型。
  8 | - 支持张量并行推理模式。
  9 | - 支持 `A100`、`V100`、`T4` 等硬件以及主流 decoder-only 的自回归模型，可自行在配置文件中增加。
 10 | - 支持分析性能瓶颈，不同 `layer` 是 `memory bound` 还是 `compute bound`，以及 `kv_cache` 的性能瓶颈。
 11 | - 支持输出每层和整个模型的参数量、计算量，内存和 `latency`。
 12 | - 推理时支持预填充和解码阶段分别计算内存和 latency、以及理论支持的最大 `bs` 等等。
 13 | - 支持设置计算效率、内存读取效率（不同推理框架可能不一样，这个设置好后，可推测输出实际值）。
 14 | - 推理性能理论分析结果的格式化输出。
 15 | 
 16 | ## 如何使用
 17 | 
 18 | 使用方法，直接调用 `llm_profiler/llm_profiler.py` 文件中函数 `llm_profile()` 函数并输入相关参数即可。
 19 | 
 20 | ```python
 21 | def llm_profile(model_name="llama-13b",
 22 |                 gpu_name: str = "v100-sxm-32gb",
 23 |                 bytes_per_param: int = BYTES_FP16,
 24 |                 bs: int = 1,
 25 |                 seq_len: int = 522,
 26 |                 generate_len=1526,
 27 |                 ds_zero: int = 0,
 28 |                 dp_size: int = 1,
 29 |                 tp_size: int = 1,
 30 |                 pp_size: int = 1,
 31 |                 sp_size: int = 1,
 32 |                 layernorm_dtype_bytes: int = BYTES_FP16,
 33 |                 kv_cache_bytes: int = BYTES_FP16,
 34 |                 flops_efficiency: float = FLOPS_EFFICIENCY,
 35 |                 hbm_memory_efficiency: float = HBM_MEMORY_EFFICIENCY,
 36 |                 intra_node_memory_efficiency=INTRA_NODE_MEMORY_EFFICIENCY,
 37 |                 inter_node_memory_efficiency=INTER_NODE_MEMORY_EFFICIENCY,
 38 |                 mode: str = "inference",
 39 |             ) -> dict:
 40 | 
 41 |     """format print dicts of the total floating-point operations, MACs, parameters and latency of a llm.
 42 | 
 43 |     Args:
 44 |         model_name (str, optional): model name to query the pre-defined `model_configs.json`. Defaults to "llama-13b".
 45 |         gpu_name (str, optional): gpu name to query the pre-defined `model_configs.json`. Defaults to "v100-sxm2-32gb".
 46 |         bs (int, optional): _description_. Defaults to 1.
 47 |         seq_len (int, optional): batch size per GPU.. Defaults to 522.
 48 |         generate_len (int, optional): The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt. Defaults to 1526.
 49 |         dp_size (int, optional): data parallelism size. Defaults to 1.
 50 |         tp_size (int, optional): tensor parallelism size. Defaults to 1.
 51 |         pp_size (int, optional): pipeline parallelism size. Defaults to 1.
 52 |         sp_size (int, optional): sequence parallelism size. Defaults to 1.
 53 |             speed up decoding. Defaults to True.
 54 |         layernorm_dtype_bytes (int, optional): number of bytes in the data type for the layernorm activations.. Defaults to BYTES_FP16.
 55 |         kv_cache_bytes (int, optional): number of bytes in the data type for the kv_cache. Defaults to None.
 56 |         flops_efficiency (float, optional): flops efficiency, ranging from 0 to 1. Defaults to None.
 57 |         hbm_memory_efficiency (float, optional): GPU HBM memory efficiency, ranging from 0 to 1. Defaults to HBM_MEMORY_EFFICIENCY.
 58 |         intra_node_memory_efficiency (_type_, optional): intra-node memory efficiency, ranging from 0 to 1.. Defaults to INTRA_NODE_MEMORY_EFFICIENCY.
 59 |         inter_node_memory_efficiency (_type_, optional): inter-node memory efficiency, ranging from 0 to 1.. Defaults to INTER_NODE_MEMORY_EFFICIENCY.
 60 | 
 61 |     Returns:
 62 |         None: format print some summary dictionary of the inference analysis
 63 |     """
 64 | ```
 65 | 
 66 | `llama2-70` 模型，tp_size = 8 和 bs = 20，输出示例信息如下所示：
 67 | 
 68 | ```bash
 69 | -------------------------- LLM main infer config --------------------------
 70 | {   'inference_config': {   'model_name': 'llama2-70b',
 71 |                             'num_attention_heads': 64,
 72 |                             'num_kv_heads': 8,
 73 |                             'head_dim': 128,
 74 |                             'hidden_size': 8192,
 75 |                             'intermediate_size': 28672,
 76 |                             'vocab_size': 32000,
 77 |                             'max_seq_len': 4096,
 78 |                             'bs': 32,
 79 |                             'seq_len': 1024,
 80 |                             'tp_size': 8,
 81 |                             'pp_size': 1,
 82 |                             'generate_len': 128},
 83 |     'gpu_config': {   'name': 'a100-sxm-40gb',
 84 |                       'memory_GPU_in_GB': '40 GB',
 85 |                       'gpu_hbm_bandwidth': '1555 GB/s',
 86 |                       'gpu_intra_node_bandwidth': '600 GB/s',
 87 |                       'gpu_fp16_TFLOPS': '312 TFLOPS'}}
 88 | 
 89 | -------------------------- LLM infer performance analysis --------------------------
 90 | {   'weight_memory_per_gpu': '17.18 GB',
 91 |     'consume_memory_per_gpu': '20.57 GB',
 92 |     'prefill_flops': '4574.25 T',
 93 |     'decode_flops_per_step': '4.38 T',
 94 |     'TTFT': 2.7060724961666294,
 95 |     'TTOT': 0.040541745771914876,
 96 |     'kv_cache_latency': '959.04 us',
 97 |     'total_infer_latency': '7.9 s',
 98 |     'support_max_batch_total_tokens': 240249}
 99 | 
100 | ---------------------------- LLM Params per_layer analysis ----------------------------
101 | {   'qkvo_proj': '150.99 M',
102 |     'mlp': '704.64 M',
103 |     'rmsnorm': '16.38 K',
104 |     'input_embedding': '262.14 M',
105 |     'output_embedding': '262.14 M'}
106 | {'params_model': '68.71 G'}
107 | 
108 | ---------------------------- LLM Prefill Flops per_layer analysis ----------------------------
109 | {   'attention_kernel': '1.1 T',
110 |     'qkvo_proj': '9.9 T',
111 |     'mlp': '46.18 T',
112 |     'rmsnorm': '4.29 G',
113 |     'positional_embedding': '536.87 M',
114 |     'input_embedding': '0'}
115 | {'prefill flops_model': '4574.25 T'}
116 | 
117 | ---------------------------- LLM Memory analysis (Prefill) ----------------------------
118 | {   'weight_memory_per_gpu': '17.18 GB',
119 |     'prefill_max_bs': '388B',
120 |     'prefill_act_per_gpu': '1.88 GB'}
121 | 
122 | ---------------------------- LLM Memory analysis (Decode) ----------------------------
123 | {   'decode_act_per_gpu': '1.88 GB',
124 |     'kv_cache_memory_per_gpu': '1.51 GB',
125 |     'consume_memory_per_gpu': '20.57 GB',
126 |     'decode_max_bs': '215.0B',
127 |     'max_batch_total_tokens': '240.25 KB'}
128 | 
129 | ---------------------------- LLM Latency analysis (Prefill) ----------------------------
130 | {   'prefill_qkvo_proj': '352.41 ms',
131 |     'prefill_attn_kernel': '131.39 ms',
132 |     'prefill_mlp': '1.64 s',
133 |     'prefill_rmsnorm': '61.38 ms',
134 |     'prefill_tp_comm': '501.08 ms',
135 |     'prefill_kv_cache_rw': '959.04 us',
136 |     'prefill_latency': '2.71 s'}
137 | 
138 | ---------------------------- LLM Latency analysis (Decode) ----------------------------
139 | {   'decode_qkvo_proj': '6.5 ms',
140 |     'decode_attn_kernel': '2.56 ms',
141 |     'decode_mlp': '30.26 ms',
142 |     'decode_rmsnorm': '64.62 us',
143 |     'decode_tp_comm': '640.0 us',
144 |     'decode_kv_cache_rw': '121.75 us',
145 |     'kv_cache_latency': '959.04 us',
146 |     'decode_latency': '40.54 ms'}
147 | ```
148 | 
149 | ## 模型结构可视化
150 | 
151 | llama2-70b 模型，A100-SXM40GB，tp_size = 8 和 bs = 20，prefill 阶段:
152 | 
153 | <div align="center">
154 | <img src="images/grpah_prefill_llama2-70b_tp8_bs32_seqlen1024_genlen128.png" width="50%" alt="prefill 阶段">
155 | </div>
156 | 
157 | llama2-70b 模型，A100-SXM40GB，tp_size = 8 和 bs = 20， decode 阶段:
158 | 
159 | <div align="center">
160 | <img src="images/grpah_decode_llama2-70b_tp8_bs32_seqlen1024_genlen128.png" width="50%" alt="decode 阶段">
161 | </div>
162 | 
163 | ## 模型参数量、计算量、latency 分布
164 | 
165 | llama2-70b 模型，A100-SXM40GB，tp_size = 8 和 bs = 20，参数量统计分布:
166 | 
167 | <div align="center">
168 | <img src="images/params_llama2-70b_tp8_bs32_seqlen1024_genlen128.png" width="50%" alt="prefill 阶段">
169 | </div>
170 | 
171 | llama2-70b 模型，A100-SXM40GB，tp_size = 8 和 bs = 20，prefill 阶段计算量统计分布:
172 | 
173 | <div align="center">
174 | <img src="images/flops_prefill_llama2-70b_tp8_bs32_seqlen1024_genlen128.png" width="50%" alt="prefill 阶段计算量统计分布">
175 | </div>
176 | 
177 | llama2-70b 模型，A100-SXM40GB，tp_size = 8 和 bs = 20，generate_len = 128, decode 阶段计算量统计分布:
178 | 
179 | <div align="center">
180 | <img src="images/flops_decode_llama2-70b_tp8_bs32_seqlen1024_genlen128.png" width="50%" alt="decode 阶段计算量统计分布">
181 | </div>
182 | 
183 | llama2-70b 模型，A100-SXM40GB，tp_size = 8 和 bs = 20，prefill 阶段 latency 统计分布:
184 | 
185 | <div align="center">
186 | <img src="images/latency_prefill_llama2-70b_tp8_bs32_seqlen1024_genlen128.png" width="50%" alt="prefill 阶段 latency 统计分布">
187 | </div>
188 | 
189 | llama2-70b 模型，A100-SXM40GB，tp_size = 8 和 bs = 20，decode 阶段 latency 统计分布:
190 | 
191 | <div align="center">
192 | <img src="images/latency_decode_llama2-70b_tp8_bs32_seqlen1024_genlen128.png" width="50%" alt="decode 阶段 latency 统计分布">
193 | </div>
194 | 
195 | ## 参考链接
196 | - [Transformer 性能分析理论基础](https://github.com/HarleysZhang/dl_note/blob/main/6-llm_note/transformer_basic/Transformer%E6%80%A7%E8%83%BD%E5%88%86%E6%9E%90%E7%90%86%E8%AE%BA%E5%9F%BA%E7%A1%80.md)
197 | - [llm_analysis](https://github.com/cli99/llm-analysis)
198 | - [Transformer Inference Arithmetic](https://kipp.ly/blog/transformer-inference-arithmetic/)
199 | - [LLM-Viewer](https://github.com/hahnyuan/LLM-Viewer.git)


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harleyszhang/llm_counts/2cd9bc0dd0aa757000b8bd9d5867e986d42828aa/__init__.py


--------------------------------------------------------------------------------
/cli_benchmark.py:
--------------------------------------------------------------------------------
 1 | # usage: python cli_benchmark.py --model_name Qwen3-32B --gpu_name a100-sxm-80gb --batch_size 16 --generate_len 1024 --tp_size 4
 2 | import pandas as pd
 3 | import argparse
 4 | from llm_counts.utils.config import *
 5 | from llm_counts.benchmark_analyzer import llm_profile
 6 | from llm_counts.utils.visualizer import SeqLenVisualizer
 7 | 
 8 | 
 9 | def sweep_seq_len(model_name, gpu_name="h100-sxm-80gb", batch_size=16, generate_len=1024, tp_size=2, seq_len_list=None, **kwargs):
10 |     """Profile a model over several sequence lengths and print / plot a table.
11 | 
12 |     Args:
13 |         model_name (str): name of the LLM
14 |         gpu_name (str): target GPU
15 |         seq_len_list (List[int]): list of sequence lengths to test
16 |         **kwargs: forwarded to llm_profile
17 |     Returns:
18 |         pandas.DataFrame: one row per sequence length with metrics
19 |     """
20 |     if seq_len_list is None:
21 |         seq_len_list = [128, 256, 512, 1024, 1334, 1567, 1678, 2567, 3072, 
22 |                         4096, 5120, 6144, 8192, 10240, 12288, 16384,
23 |                         21472, 24576, 30346, 32768, 33792, 34980, 36790]
24 | 
25 |     records1 = []
26 |     records2 = []
27 |     for seq in seq_len_list:
28 |         res1, res2 = llm_profile(
29 |             model_name=model_name,
30 |             gpu_name=gpu_name,
31 |             batch_size=batch_size,
32 |             seq_len=seq,
33 |             generate_len=generate_len,
34 |             tp_size=tp_size,
35 |             print_flag=False,
36 |             visual_flag=False,
37 |         )
38 |         print("=" * 80)
39 |         print(f"model_name: {model_name}, gpu_name: {gpu_name}, tp_size: {tp_size}, batch_size: {batch_size}, seq_len: {seq}, generate_len: {generate_len}")
40 | 
41 |         records1.append(res1)
42 |         records2.append(res2)
43 | 
44 |     df1 = pd.DataFrame(records1)
45 |     print("=" * 80)
46 |     print(df1.to_string(index=False))
47 |     print("=" * 80)
48 | 
49 |     df2 = pd.DataFrame(records2)
50 |     # Derive throughput in tokens / second for visualisation
51 |     if "TTFT" in df2.columns:
52 |         df2["throughput_tok_per_second"] = df2["seq_len"] * batch_size / df2["TTFT"].replace(0, float("nan"))
53 |     # Visualise the results using *plot_seq_len_sweep*
54 |     if kwargs.get("visual_flag", True):
55 |         viz = SeqLenVisualizer(df2, model_name, gpu_name, show=True)
56 |         viz.visualize()
57 | 
58 |     return df1
59 | 
60 | 
61 | def _parse_args():
62 |     parser = argparse.ArgumentParser(
63 |         description="Sweep sequence lengths, profile an LLM, and generate visualisations."
64 |     )
65 |     parser.add_argument("--model_name", required=True, help="LLM model name, e.g. Qwen3-32B")
66 |     parser.add_argument("--gpu_name", default="h100-sxm-80gb", help="Target GPU name")
67 |     parser.add_argument("--batch_size", type=int, default=16, help="Batch size")
68 |     parser.add_argument("--generate_len", type=int, default=1024, help="Generation length")
69 |     parser.add_argument("--tp_size", type=int, default=2, help="Tensor‑parallel size")
70 |     parser.add_argument(
71 |         "--seq_lens",
72 |         type=int,
73 |         nargs="*",
74 |         default=None,
75 |         help="Space‑separated list of sequence lengths (tokens) to sweep",
76 |     )
77 |     parser.add_argument(
78 |         "--no_visual",
79 |         action="store_true",
80 |         help="Disable visualisation (figures will not be generated)",
81 |     )
82 |     return parser.parse_args()
83 | 
84 | 
85 | if __name__ == "__main__":
86 |     args = _parse_args()
87 |     sweep_seq_len(
88 |         model_name=args.model_name,
89 |         gpu_name=args.gpu_name,
90 |         batch_size=args.batch_size,
91 |         generate_len=args.generate_len,
92 |         tp_size=args.tp_size,
93 |         seq_len_list=args.seq_lens,
94 |         visual_flag=not args.no_visual,
95 |     )
96 | 


--------------------------------------------------------------------------------
/cli_perf_visual.py:
--------------------------------------------------------------------------------
 1 | from llm_counts.utils.config import *
 2 | from framework_tools.LLMCounts.llm_counts.benchmark_analyzer import llm_profile
 3 | import math
 4 | 
 5 | 
 6 | ####################################################################################################################
 7 | def print_list(list):
 8 |     """print one-dimensional list
 9 | 
10 |     :param list: List[int]
11 |     :return: None
12 |     """
13 |     for _, x in enumerate(list):
14 |         print(x, end="\n")
15 | 
16 | ####################################################################################################################
17 | def print_all_llm_analyzer():
18 |     model_name_list = [
19 |         "llama-7b",
20 |         "llama-13b",
21 |         "llama-65b",
22 |         "llama2-70b",
23 |         "internlm-20b",
24 |     ]
25 |     gpu_name_list = [
26 |         "a30-sxm-24gb",
27 |         "a40-pcie-48gb",
28 |         "a100-sxm-40gb",
29 |         "a100-sxm-80gb",
30 |         "910b-64gb",
31 |         "v100-sxm-32gb",
32 |         "t4-pcie-15gb",
33 |     ]
34 |     tp_nums_list = [1, 2, 4, 8]
35 |     tgi_service_dict_list = []
36 |     seq_len, generate_len = 1024, 1024
37 | 
38 |     for model_name in model_name_list:
39 |         if model_name in ["llama2-70b", "internlm-20b"]:
40 |             seq_len, generate_len = 1024, 1024
41 | 
42 |         for gpu_name in gpu_name_list:
43 |             for tp_size in tp_nums_list:
44 |                 try:
45 |                     res1, _ = llm_profile(
46 |                         model_name=model_name,
47 |                         gpu_name=gpu_name,
48 |                         tp_size=tp_size,
49 |                         seq_len=seq_len,
50 |                         generate_len=generate_len,
51 |                         print_flag=False,
52 |                         visual_flag=False,
53 |                     )
54 |                     max_batch_total_tokens = int(res1["max_batch_total_tokens"])
55 |                 except Exception as e:
56 |                     print(
57 |                         f"model_name: {model_name}, gpu_name: {gpu_name}, tp_size: {tp_size}, error: {e}"
58 |                     )
59 |                     continue
60 | 
61 |                 tgi_service_dict = {
62 |                     "model_name": model_name,
63 |                     "gpu_name": gpu_name,
64 |                     "tp_size": tp_size,
65 |                     "max_batch_total_tokens": max_batch_total_tokens,
66 |                     "max_bs": math.floor(
67 |                         max_batch_total_tokens / (seq_len + generate_len)
68 |                     ),
69 |                 }
70 |                 tgi_service_dict_list.append(tgi_service_dict)
71 | 
72 |     print(
73 |         "============================ TGI+LightLLM service max_batch_total_tokens params list ======================"
74 |     )
75 |     print_list(tgi_service_dict_list)
76 | 
77 | if __name__ == "__main__":
78 |     # llm_profile(model_name="llama-7b", tp_size=1, print_flag=True, visual_flag=True)
79 |     llm_profile(model_name="llama2-70b", gpu_name = "a100-sxm-40gb", tp_size=8, 
80 |                 batch_size = 32, seq_len = 1024, generate_len=128,
81 |                 print_flag=True, visual_flag=True)
82 | 


--------------------------------------------------------------------------------
/cli_structure_analyzer.py:
--------------------------------------------------------------------------------
 1 | from llm_counts.layer_graph_visualizer import LayerAnalyzer, LayerGraphVisualizer
 2 | from llm_counts.utils.utils import *
 3 | from llm_counts.utils.config import get_model_and_gpu_config_by_name
 4 | import pprint
 5 | import argparse
 6 | 
 7 | 
 8 | def print_format_summary_dict(summary_dict: dict, depth: int) -> str:
 9 |     """打印时对 params / flops / latency / memory 等进行统一转换显示。"""
10 |     for key, value in summary_dict.items():
11 |         if "params" in key or "flops" in key:
12 |             if not isinstance(value, dict):
13 |                 summary_dict.update({key: num_to_string(value)})
14 |             else:
15 |                 print_format_summary_dict(
16 |                     value, get_dict_depth(value) - 1
17 |                 )  # 递归
18 |         if "latency" in key:
19 |             if not isinstance(value, dict):
20 |                 summary_dict.update({key: latency_to_string(value)})
21 |             else:
22 |                 print_format_summary_dict(value, get_dict_depth(value) - 1)
23 |         if "memory" in key:
24 |             if not isinstance(value, dict):
25 |                 summary_dict.update({key: f"{num_to_string(value)}B"})
26 |             else:
27 |                 print_format_summary_dict(value, get_dict_depth(value) - 1)
28 |     if depth >= 1:
29 |         pprint.pprint(summary_dict, indent=4, sort_dicts=False)
30 | 
31 | def test_llm_analyzer(
32 |         model_name: str = "Qwen/Qwen3-8B",
33 |         gpu_name="a100-sxm-80gb",
34 |         bs: int = 1,
35 |         seq_len: int = 522,
36 |         generate_len: int = 1526,
37 |         tp_size: int = 1,
38 |     ):
39 |     model_config, gpu_config = get_model_and_gpu_config_by_name(model_name, gpu_name)
40 |     model_type = model_config.model_type
41 |     llm_analyzer = LayerAnalyzer(model_config, gpu_config, tp_size=tp_size)
42 |     results = llm_analyzer.analyze_model(bs=bs, seq_len=seq_len, generate_len=generate_len)
43 | 
44 |     # -------------------------- 绘图：模型 graph 图示例 --------------------------
45 |     base_filename = f"{model_name.replace('/', '_')}_tp{tp_size}_bs{bs}_seqlen{seq_len}_genlen{generate_len}"
46 |     print("base_filename", base_filename)
47 |     LayerGraphVisualizer(model_type, results).render(base_filename)
48 |     depth = get_dict_depth(results)
49 |     # print_format_summary_dict(results, depth)
50 | 
51 | 
52 | if __name__ == "__main__":
53 |     parser = argparse.ArgumentParser(
54 |         description="Run LayerAnalyzer, print a formatted summary, "
55 |                     "and generate per‑stage layer‑graph PNGs."
56 |     )
57 |     parser.add_argument("--model-name", default="Qwen3-32B")
58 |     parser.add_argument("--gpu-name", default="a100-sxm-80gb")
59 |     parser.add_argument("--bs",          type=int, default=16)
60 |     parser.add_argument("--seq-len",     type=int, default=1024)
61 |     parser.add_argument("--generate-len",type=int, default=128)
62 |     parser.add_argument("--tp-size",     type=int, default=4)
63 |     args = parser.parse_args()
64 | 
65 |     test_llm_analyzer(
66 |         model_name=args.model_name,
67 |         gpu_name=args.gpu_name,
68 |         bs=args.bs,
69 |         seq_len=args.seq_len,
70 |         generate_len=args.generate_len,
71 |         tp_size=args.tp_size,
72 |     )
73 | 
74 | """"
75 | python cli_structure_analyzer.py \
76 |   --model-name llama2-70B \
77 |   --gpu-name  a100-sxm-80gb \
78 |   --bs 16 \
79 |   --seq-len 1024 \
80 |   --generate-len 128 \
81 |   --tp-size 4
82 | """


--------------------------------------------------------------------------------
/figures/Qwen3-32B_a100-sxm-80gb_flops_vs_seq_len.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harleyszhang/llm_counts/2cd9bc0dd0aa757000b8bd9d5867e986d42828aa/figures/Qwen3-32B_a100-sxm-80gb_flops_vs_seq_len.png


--------------------------------------------------------------------------------
/figures/Qwen3-32B_a100-sxm-80gb_interactive.html:
--------------------------------------------------------------------------------
1 | <h2>TTFT (s)</h2><div>                        <script type="text/javascript">window.PlotlyConfig = {MathJaxConfig: 'local'};</script>
2 |         <script charset="utf-8" src="https://cdn.plot.ly/plotly-2.35.2.min.js"></script>                <div id="3df7d5eb-0d74-479a-8d5a-56a5b63dcdcf" class="plotly-graph-div" style="height:100%; width:100%;"></div>            <script type="text/javascript">                                    window.PLOTLYENV=window.PLOTLYENV || {};                                    if (document.getElementById("3df7d5eb-0d74-479a-8d5a-56a5b63dcdcf")) {                    Plotly.newPlot(                        "3df7d5eb-0d74-479a-8d5a-56a5b63dcdcf",                        [{"mode":"lines+markers","x":[128,256,512,1024,1334,1567,1678,2567,3072,4096,5120,6144,8192,10240,12288,16384,21472,24576,30346,32768,33792,34980,36790],"y":[0.1344989260218134,0.27048866569301916,0.5494895072979777,1.135576639558083,1.5086345989886627,1.7980663933162564,1.9386775013339392,3.128329948736558,3.85439782260101,5.438492006523977,7.172375252047948,9.05604755917292,13.272759358225873,18.088627403682842,23.503651695543812,36.1311690184778,55.15447386104742,68.57607862119387,97.18256137439744,110.60748816637417,116.5354651367262,123.60053925847626,134.75224647934635],"type":"scatter"}],                        {"template":{"data":{"barpolar":[{"marker":{"line":{"color":"rgb(234,234,242)","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"bar":[{"error_x":{"color":"rgb(36,36,36)"},"error_y":{"color":"rgb(36,36,36)"},"marker":{"line":{"color":"rgb(234,234,242)","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"carpet":[{"aaxis":{"endlinecolor":"rgb(36,36,36)","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"rgb(36,36,36)"},"baxis":{"endlinecolor":"rgb(36,36,36)","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"rgb(36,36,36)"},"type":"carpet"}],"choropleth":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"type":"choropleth"}],"contourcarpet":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"type":"contourcarpet"}],"contour":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"contour"}],"heatmapgl":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"heatmapgl"}],"heatmap":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"heatmap"}],"histogram2dcontour":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"histogram2dcontour"}],"histogram2d":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"histogram2d"}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"mesh3d":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"type":"mesh3d"}],"parcoords":[{"line":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"parcoords"}],"pie":[{"automargin":true,"type":"pie"}],"scatter3d":[{"line":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scatter3d"}],"scattercarpet":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scattercarpet"}],"scattergeo":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scattergeo"}],"scattergl":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scattergl"}],"scattermapbox":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scattermapbox"}],"scatterpolargl":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scatterpolargl"}],"scatterpolar":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scatterpolar"}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"scatterternary":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scatterternary"}],"surface":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"surface"}],"table":[{"cells":{"fill":{"color":"rgb(231,231,240)"},"line":{"color":"white"}},"header":{"fill":{"color":"rgb(183,183,191)"},"line":{"color":"white"}},"type":"table"}]},"layout":{"annotationdefaults":{"arrowcolor":"rgb(67,103,167)"},"autotypenumbers":"strict","coloraxis":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"colorscale":{"sequential":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"sequentialminus":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]]},"colorway":["rgb(76,114,176)","rgb(221,132,82)","rgb(85,168,104)","rgb(196,78,82)","rgb(129,114,179)","rgb(147,120,96)","rgb(218,139,195)","rgb(140,140,140)","rgb(204,185,116)","rgb(100,181,205)"],"font":{"color":"rgb(36,36,36)"},"geo":{"bgcolor":"white","lakecolor":"white","landcolor":"rgb(234,234,242)","showlakes":true,"showland":true,"subunitcolor":"white"},"hoverlabel":{"align":"left"},"hovermode":"closest","paper_bgcolor":"white","plot_bgcolor":"rgb(234,234,242)","polar":{"angularaxis":{"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":""},"bgcolor":"rgb(234,234,242)","radialaxis":{"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":""}},"scene":{"xaxis":{"backgroundcolor":"rgb(234,234,242)","gridcolor":"white","gridwidth":2,"linecolor":"white","showbackground":true,"showgrid":true,"ticks":"","zerolinecolor":"white"},"yaxis":{"backgroundcolor":"rgb(234,234,242)","gridcolor":"white","gridwidth":2,"linecolor":"white","showbackground":true,"showgrid":true,"ticks":"","zerolinecolor":"white"},"zaxis":{"backgroundcolor":"rgb(234,234,242)","gridcolor":"white","gridwidth":2,"linecolor":"white","showbackground":true,"showgrid":true,"ticks":"","zerolinecolor":"white"}},"shapedefaults":{"fillcolor":"rgb(67,103,167)","line":{"width":0},"opacity":0.5},"ternary":{"aaxis":{"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":""},"bgcolor":"rgb(234,234,242)","caxis":{"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":""}},"xaxis":{"automargin":true,"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":"","title":{"standoff":15},"zerolinecolor":"white"},"yaxis":{"automargin":true,"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":"","title":{"standoff":15},"zerolinecolor":"white"}}},"title":{"text":"TTFT (s) vs seq_len"}},                        {"responsive": true}                    )                };                            </script>        </div><h2>TTOT (ms)</h2><div>                            <div id="2ea8a0af-6213-4edb-8eb5-7f0a654043eb" class="plotly-graph-div" style="height:100%; width:100%;"></div>            <script type="text/javascript">                                    window.PLOTLYENV=window.PLOTLYENV || {};                                    if (document.getElementById("2ea8a0af-6213-4edb-8eb5-7f0a654043eb")) {                    Plotly.newPlot(                        "2ea8a0af-6213-4edb-8eb5-7f0a654043eb",                        [{"mode":"lines+markers","x":[128,256,512,1024,1334,1567,1678,2567,3072,4096,5120,6144,8192,10240,12288,16384,21472,24576,30346,32768,33792,34980,36790],"y":[105.57822254689508,105.65136173712993,105.79764011759967,106.09019687853912,106.26733085488918,106.40046703711359,106.46389242864538,106.97186696082346,107.26042392229695,107.84553744417587,108.43065096605478,109.0157644879337,110.18599153169154,111.35621857544936,112.52644561920721,114.86689970672288,117.77418251855873,119.54780788175421,122.8447854415602,124.22871605678554,124.81382957866445,125.49265268803178,126.52688654994668],"type":"scatter"}],                        {"template":{"data":{"barpolar":[{"marker":{"line":{"color":"rgb(234,234,242)","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"bar":[{"error_x":{"color":"rgb(36,36,36)"},"error_y":{"color":"rgb(36,36,36)"},"marker":{"line":{"color":"rgb(234,234,242)","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"carpet":[{"aaxis":{"endlinecolor":"rgb(36,36,36)","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"rgb(36,36,36)"},"baxis":{"endlinecolor":"rgb(36,36,36)","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"rgb(36,36,36)"},"type":"carpet"}],"choropleth":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"type":"choropleth"}],"contourcarpet":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"type":"contourcarpet"}],"contour":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"contour"}],"heatmapgl":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"heatmapgl"}],"heatmap":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"heatmap"}],"histogram2dcontour":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"histogram2dcontour"}],"histogram2d":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"histogram2d"}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"mesh3d":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"type":"mesh3d"}],"parcoords":[{"line":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"parcoords"}],"pie":[{"automargin":true,"type":"pie"}],"scatter3d":[{"line":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scatter3d"}],"scattercarpet":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scattercarpet"}],"scattergeo":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scattergeo"}],"scattergl":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scattergl"}],"scattermapbox":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scattermapbox"}],"scatterpolargl":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scatterpolargl"}],"scatterpolar":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scatterpolar"}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"scatterternary":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scatterternary"}],"surface":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"surface"}],"table":[{"cells":{"fill":{"color":"rgb(231,231,240)"},"line":{"color":"white"}},"header":{"fill":{"color":"rgb(183,183,191)"},"line":{"color":"white"}},"type":"table"}]},"layout":{"annotationdefaults":{"arrowcolor":"rgb(67,103,167)"},"autotypenumbers":"strict","coloraxis":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"colorscale":{"sequential":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"sequentialminus":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]]},"colorway":["rgb(76,114,176)","rgb(221,132,82)","rgb(85,168,104)","rgb(196,78,82)","rgb(129,114,179)","rgb(147,120,96)","rgb(218,139,195)","rgb(140,140,140)","rgb(204,185,116)","rgb(100,181,205)"],"font":{"color":"rgb(36,36,36)"},"geo":{"bgcolor":"white","lakecolor":"white","landcolor":"rgb(234,234,242)","showlakes":true,"showland":true,"subunitcolor":"white"},"hoverlabel":{"align":"left"},"hovermode":"closest","paper_bgcolor":"white","plot_bgcolor":"rgb(234,234,242)","polar":{"angularaxis":{"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":""},"bgcolor":"rgb(234,234,242)","radialaxis":{"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":""}},"scene":{"xaxis":{"backgroundcolor":"rgb(234,234,242)","gridcolor":"white","gridwidth":2,"linecolor":"white","showbackground":true,"showgrid":true,"ticks":"","zerolinecolor":"white"},"yaxis":{"backgroundcolor":"rgb(234,234,242)","gridcolor":"white","gridwidth":2,"linecolor":"white","showbackground":true,"showgrid":true,"ticks":"","zerolinecolor":"white"},"zaxis":{"backgroundcolor":"rgb(234,234,242)","gridcolor":"white","gridwidth":2,"linecolor":"white","showbackground":true,"showgrid":true,"ticks":"","zerolinecolor":"white"}},"shapedefaults":{"fillcolor":"rgb(67,103,167)","line":{"width":0},"opacity":0.5},"ternary":{"aaxis":{"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":""},"bgcolor":"rgb(234,234,242)","caxis":{"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":""}},"xaxis":{"automargin":true,"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":"","title":{"standoff":15},"zerolinecolor":"white"},"yaxis":{"automargin":true,"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":"","title":{"standoff":15},"zerolinecolor":"white"}}},"title":{"text":"TTOT (ms) vs seq_len"}},                        {"responsive": true}                    )                };                            </script>        </div><h2>Prefill TFLOPs</h2><div>                            <div id="cb384565-4607-47a9-812b-da22fcf52f8c" class="plotly-graph-div" style="height:100%; width:100%;"></div>            <script type="text/javascript">                                    window.PLOTLYENV=window.PLOTLYENV || {};                                    if (document.getElementById("cb384565-4607-47a9-812b-da22fcf52f8c")) {                    Plotly.newPlot(                        "cb384565-4607-47a9-812b-da22fcf52f8c",                        [{"mode":"lines+markers","x":[128,256,512,1024,1334,1567,1678,2567,3072,4096,5120,6144,8192,10240,12288,16384,21472,24576,30346,32768,33792,34980,36790],"y":[1.28378048872448e+26,2.57855609372672e+26,5.20109265256448e+26,1.057810716557312e+27,1.391922482511872e+27,1.647290470105088e+27,1.770227706626048e+27,2.784663031513088e+27,3.384538382204928e+27,4.653455331295232e+27,5.9927410245632e+27,7.402395462008832e+27,1.0432810569433088e+28,1.3744700653568e+28,1.7338065714413568e+28,2.536922076623667e+28,3.6913359123120127e+28,4.480923059041075e+28,6.1204887850975235e+28,6.875284004195533e+28,7.206245057219789e+28,7.59902921293824e+28,8.2156708954112e+28],"type":"scatter"}],                        {"template":{"data":{"barpolar":[{"marker":{"line":{"color":"rgb(234,234,242)","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"bar":[{"error_x":{"color":"rgb(36,36,36)"},"error_y":{"color":"rgb(36,36,36)"},"marker":{"line":{"color":"rgb(234,234,242)","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"carpet":[{"aaxis":{"endlinecolor":"rgb(36,36,36)","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"rgb(36,36,36)"},"baxis":{"endlinecolor":"rgb(36,36,36)","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"rgb(36,36,36)"},"type":"carpet"}],"choropleth":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"type":"choropleth"}],"contourcarpet":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"type":"contourcarpet"}],"contour":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"contour"}],"heatmapgl":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"heatmapgl"}],"heatmap":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"heatmap"}],"histogram2dcontour":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"histogram2dcontour"}],"histogram2d":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"histogram2d"}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"mesh3d":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"type":"mesh3d"}],"parcoords":[{"line":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"parcoords"}],"pie":[{"automargin":true,"type":"pie"}],"scatter3d":[{"line":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scatter3d"}],"scattercarpet":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scattercarpet"}],"scattergeo":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scattergeo"}],"scattergl":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scattergl"}],"scattermapbox":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scattermapbox"}],"scatterpolargl":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scatterpolargl"}],"scatterpolar":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scatterpolar"}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"scatterternary":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scatterternary"}],"surface":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"surface"}],"table":[{"cells":{"fill":{"color":"rgb(231,231,240)"},"line":{"color":"white"}},"header":{"fill":{"color":"rgb(183,183,191)"},"line":{"color":"white"}},"type":"table"}]},"layout":{"annotationdefaults":{"arrowcolor":"rgb(67,103,167)"},"autotypenumbers":"strict","coloraxis":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"colorscale":{"sequential":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"sequentialminus":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]]},"colorway":["rgb(76,114,176)","rgb(221,132,82)","rgb(85,168,104)","rgb(196,78,82)","rgb(129,114,179)","rgb(147,120,96)","rgb(218,139,195)","rgb(140,140,140)","rgb(204,185,116)","rgb(100,181,205)"],"font":{"color":"rgb(36,36,36)"},"geo":{"bgcolor":"white","lakecolor":"white","landcolor":"rgb(234,234,242)","showlakes":true,"showland":true,"subunitcolor":"white"},"hoverlabel":{"align":"left"},"hovermode":"closest","paper_bgcolor":"white","plot_bgcolor":"rgb(234,234,242)","polar":{"angularaxis":{"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":""},"bgcolor":"rgb(234,234,242)","radialaxis":{"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":""}},"scene":{"xaxis":{"backgroundcolor":"rgb(234,234,242)","gridcolor":"white","gridwidth":2,"linecolor":"white","showbackground":true,"showgrid":true,"ticks":"","zerolinecolor":"white"},"yaxis":{"backgroundcolor":"rgb(234,234,242)","gridcolor":"white","gridwidth":2,"linecolor":"white","showbackground":true,"showgrid":true,"ticks":"","zerolinecolor":"white"},"zaxis":{"backgroundcolor":"rgb(234,234,242)","gridcolor":"white","gridwidth":2,"linecolor":"white","showbackground":true,"showgrid":true,"ticks":"","zerolinecolor":"white"}},"shapedefaults":{"fillcolor":"rgb(67,103,167)","line":{"width":0},"opacity":0.5},"ternary":{"aaxis":{"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":""},"bgcolor":"rgb(234,234,242)","caxis":{"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":""}},"xaxis":{"automargin":true,"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":"","title":{"standoff":15},"zerolinecolor":"white"},"yaxis":{"automargin":true,"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":"","title":{"standoff":15},"zerolinecolor":"white"}}},"title":{"text":"Prefill TFLOPs vs seq_len"}},                        {"responsive": true}                    )                };                            </script>        </div><h2>HBM (GiB)</h2><div>                            <div id="3cbf2295-52cd-4b5b-b7d7-a54f50cc5ad7" class="plotly-graph-div" style="height:100%; width:100%;"></div>            <script type="text/javascript">                                    window.PLOTLYENV=window.PLOTLYENV || {};                                    if (document.getElementById("3cbf2295-52cd-4b5b-b7d7-a54f50cc5ad7")) {                    Plotly.newPlot(                        "3cbf2295-52cd-4b5b-b7d7-a54f50cc5ad7",                        [{"mode":"lines+markers","x":[128,256,512,1024,1334,1567,1678,2567,3072,4096,5120,6144,8192,10240,12288,16384,21472,24576,30346,32768,33792,34980,36790],"y":[1.959432634985192e+19,1.9738441537927774e+19,2.0026671914079486e+19,2.060313266638291e+19,2.0952161637504123e+19,2.1214496315798454e+19,2.1339471205457986e+19,2.234039622264108e+19,2.2908975675596603e+19,2.406189718020345e+19,2.5214818684810297e+19,2.6367740189417144e+19,2.867358319863084e+19,3.0979426207844532e+19,3.3285269217058226e+19,3.789695523548561e+19,4.3625533961500885e+19,4.712032727234039e+19,5.361676973482233e+19,5.6343699309195166e+19,5.749662081380201e+19,5.883418990313105e+19,6.08720687345162e+19],"type":"scatter"}],                        {"template":{"data":{"barpolar":[{"marker":{"line":{"color":"rgb(234,234,242)","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"bar":[{"error_x":{"color":"rgb(36,36,36)"},"error_y":{"color":"rgb(36,36,36)"},"marker":{"line":{"color":"rgb(234,234,242)","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"carpet":[{"aaxis":{"endlinecolor":"rgb(36,36,36)","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"rgb(36,36,36)"},"baxis":{"endlinecolor":"rgb(36,36,36)","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"rgb(36,36,36)"},"type":"carpet"}],"choropleth":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"type":"choropleth"}],"contourcarpet":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"type":"contourcarpet"}],"contour":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"contour"}],"heatmapgl":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"heatmapgl"}],"heatmap":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"heatmap"}],"histogram2dcontour":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"histogram2dcontour"}],"histogram2d":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"histogram2d"}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"mesh3d":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"type":"mesh3d"}],"parcoords":[{"line":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"parcoords"}],"pie":[{"automargin":true,"type":"pie"}],"scatter3d":[{"line":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scatter3d"}],"scattercarpet":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scattercarpet"}],"scattergeo":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scattergeo"}],"scattergl":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scattergl"}],"scattermapbox":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scattermapbox"}],"scatterpolargl":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scatterpolargl"}],"scatterpolar":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scatterpolar"}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"scatterternary":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scatterternary"}],"surface":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"surface"}],"table":[{"cells":{"fill":{"color":"rgb(231,231,240)"},"line":{"color":"white"}},"header":{"fill":{"color":"rgb(183,183,191)"},"line":{"color":"white"}},"type":"table"}]},"layout":{"annotationdefaults":{"arrowcolor":"rgb(67,103,167)"},"autotypenumbers":"strict","coloraxis":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"colorscale":{"sequential":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"sequentialminus":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]]},"colorway":["rgb(76,114,176)","rgb(221,132,82)","rgb(85,168,104)","rgb(196,78,82)","rgb(129,114,179)","rgb(147,120,96)","rgb(218,139,195)","rgb(140,140,140)","rgb(204,185,116)","rgb(100,181,205)"],"font":{"color":"rgb(36,36,36)"},"geo":{"bgcolor":"white","lakecolor":"white","landcolor":"rgb(234,234,242)","showlakes":true,"showland":true,"subunitcolor":"white"},"hoverlabel":{"align":"left"},"hovermode":"closest","paper_bgcolor":"white","plot_bgcolor":"rgb(234,234,242)","polar":{"angularaxis":{"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":""},"bgcolor":"rgb(234,234,242)","radialaxis":{"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":""}},"scene":{"xaxis":{"backgroundcolor":"rgb(234,234,242)","gridcolor":"white","gridwidth":2,"linecolor":"white","showbackground":true,"showgrid":true,"ticks":"","zerolinecolor":"white"},"yaxis":{"backgroundcolor":"rgb(234,234,242)","gridcolor":"white","gridwidth":2,"linecolor":"white","showbackground":true,"showgrid":true,"ticks":"","zerolinecolor":"white"},"zaxis":{"backgroundcolor":"rgb(234,234,242)","gridcolor":"white","gridwidth":2,"linecolor":"white","showbackground":true,"showgrid":true,"ticks":"","zerolinecolor":"white"}},"shapedefaults":{"fillcolor":"rgb(67,103,167)","line":{"width":0},"opacity":0.5},"ternary":{"aaxis":{"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":""},"bgcolor":"rgb(234,234,242)","caxis":{"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":""}},"xaxis":{"automargin":true,"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":"","title":{"standoff":15},"zerolinecolor":"white"},"yaxis":{"automargin":true,"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":"","title":{"standoff":15},"zerolinecolor":"white"}}},"title":{"text":"HBM (GiB) vs seq_len"}},                        {"responsive": true}                    )                };                            </script>        </div><h2>Throughput (tok/s)</h2><div>                            <div id="65315ccc-c59d-4fd6-baef-798e9dbc188e" class="plotly-graph-div" style="height:100%; width:100%;"></div>            <script type="text/javascript">                                    window.PLOTLYENV=window.PLOTLYENV || {};                                    if (document.getElementById("65315ccc-c59d-4fd6-baef-798e9dbc188e")) {                    Plotly.newPlot(                        "65315ccc-c59d-4fd6-baef-798e9dbc188e",                        [{"mode":"lines+markers","x":[128,256,512,1024,1334,1567,1678,2567,3072,4096,5120,6144,8192,10240,12288,16384,21472,24576,30346,32768,33792,34980,36790],"y":[15226.887385463953,15142.963530488925,14908.382946715003,14427.912154283078,14147.892414974634,13943.867753269422,13848.615863921043,13129.049899799666,12752.186531392194,12050.399250634822,11421.59983564847,10855.06666762448,9875.263798764447,9057.624790626302,8364.998024425115,7255.342329663821,6228.90539878093,5734.01115820691,4996.122690463616,4740.076903395307,4639.548993653152,4528.135583855216,4368.313073654186],"type":"scatter"}],                        {"template":{"data":{"barpolar":[{"marker":{"line":{"color":"rgb(234,234,242)","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"bar":[{"error_x":{"color":"rgb(36,36,36)"},"error_y":{"color":"rgb(36,36,36)"},"marker":{"line":{"color":"rgb(234,234,242)","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"carpet":[{"aaxis":{"endlinecolor":"rgb(36,36,36)","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"rgb(36,36,36)"},"baxis":{"endlinecolor":"rgb(36,36,36)","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"rgb(36,36,36)"},"type":"carpet"}],"choropleth":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"type":"choropleth"}],"contourcarpet":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"type":"contourcarpet"}],"contour":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"contour"}],"heatmapgl":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"heatmapgl"}],"heatmap":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"heatmap"}],"histogram2dcontour":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"histogram2dcontour"}],"histogram2d":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"histogram2d"}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"mesh3d":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"type":"mesh3d"}],"parcoords":[{"line":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"parcoords"}],"pie":[{"automargin":true,"type":"pie"}],"scatter3d":[{"line":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scatter3d"}],"scattercarpet":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scattercarpet"}],"scattergeo":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scattergeo"}],"scattergl":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scattergl"}],"scattermapbox":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scattermapbox"}],"scatterpolargl":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scatterpolargl"}],"scatterpolar":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scatterpolar"}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"scatterternary":[{"marker":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"type":"scatterternary"}],"surface":[{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2},"colorscale":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"type":"surface"}],"table":[{"cells":{"fill":{"color":"rgb(231,231,240)"},"line":{"color":"white"}},"header":{"fill":{"color":"rgb(183,183,191)"},"line":{"color":"white"}},"type":"table"}]},"layout":{"annotationdefaults":{"arrowcolor":"rgb(67,103,167)"},"autotypenumbers":"strict","coloraxis":{"colorbar":{"outlinewidth":0,"tickcolor":"rgb(36,36,36)","ticklen":8,"ticks":"outside","tickwidth":2}},"colorscale":{"sequential":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]],"sequentialminus":[[0.0,"rgb(2,4,25)"],[0.06274509803921569,"rgb(24,15,41)"],[0.12549019607843137,"rgb(47,23,57)"],[0.18823529411764706,"rgb(71,28,72)"],[0.25098039215686274,"rgb(97,30,82)"],[0.3137254901960784,"rgb(123,30,89)"],[0.3764705882352941,"rgb(150,27,91)"],[0.4392156862745098,"rgb(177,22,88)"],[0.5019607843137255,"rgb(203,26,79)"],[0.5647058823529412,"rgb(223,47,67)"],[0.6274509803921569,"rgb(236,76,61)"],[0.6901960784313725,"rgb(242,107,73)"],[0.7529411764705882,"rgb(244,135,95)"],[0.8156862745098039,"rgb(245,162,122)"],[0.8784313725490196,"rgb(246,188,153)"],[0.9411764705882353,"rgb(247,212,187)"],[1.0,"rgb(250,234,220)"]]},"colorway":["rgb(76,114,176)","rgb(221,132,82)","rgb(85,168,104)","rgb(196,78,82)","rgb(129,114,179)","rgb(147,120,96)","rgb(218,139,195)","rgb(140,140,140)","rgb(204,185,116)","rgb(100,181,205)"],"font":{"color":"rgb(36,36,36)"},"geo":{"bgcolor":"white","lakecolor":"white","landcolor":"rgb(234,234,242)","showlakes":true,"showland":true,"subunitcolor":"white"},"hoverlabel":{"align":"left"},"hovermode":"closest","paper_bgcolor":"white","plot_bgcolor":"rgb(234,234,242)","polar":{"angularaxis":{"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":""},"bgcolor":"rgb(234,234,242)","radialaxis":{"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":""}},"scene":{"xaxis":{"backgroundcolor":"rgb(234,234,242)","gridcolor":"white","gridwidth":2,"linecolor":"white","showbackground":true,"showgrid":true,"ticks":"","zerolinecolor":"white"},"yaxis":{"backgroundcolor":"rgb(234,234,242)","gridcolor":"white","gridwidth":2,"linecolor":"white","showbackground":true,"showgrid":true,"ticks":"","zerolinecolor":"white"},"zaxis":{"backgroundcolor":"rgb(234,234,242)","gridcolor":"white","gridwidth":2,"linecolor":"white","showbackground":true,"showgrid":true,"ticks":"","zerolinecolor":"white"}},"shapedefaults":{"fillcolor":"rgb(67,103,167)","line":{"width":0},"opacity":0.5},"ternary":{"aaxis":{"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":""},"bgcolor":"rgb(234,234,242)","caxis":{"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":""}},"xaxis":{"automargin":true,"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":"","title":{"standoff":15},"zerolinecolor":"white"},"yaxis":{"automargin":true,"gridcolor":"white","linecolor":"white","showgrid":true,"ticks":"","title":{"standoff":15},"zerolinecolor":"white"}}},"title":{"text":"Throughput (tok\u002fs) vs seq_len"}},                        {"responsive": true}                    )                };                            </script>        </div>


--------------------------------------------------------------------------------
/figures/Qwen3-32B_a100-sxm-80gb_latency_vs_seq_len.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harleyszhang/llm_counts/2cd9bc0dd0aa757000b8bd9d5867e986d42828aa/figures/Qwen3-32B_a100-sxm-80gb_latency_vs_seq_len.png


--------------------------------------------------------------------------------
/figures/Qwen3-32B_a100-sxm-80gb_memory_vs_seq_len.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harleyszhang/llm_counts/2cd9bc0dd0aa757000b8bd9d5867e986d42828aa/figures/Qwen3-32B_a100-sxm-80gb_memory_vs_seq_len.png


--------------------------------------------------------------------------------
/figures/Qwen3-32B_a100-sxm-80gb_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harleyszhang/llm_counts/2cd9bc0dd0aa757000b8bd9d5867e986d42828aa/figures/Qwen3-32B_a100-sxm-80gb_overview.png


--------------------------------------------------------------------------------
/figures/Qwen3-32B_a100-sxm-80gb_throughput_vs_seq_len.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harleyszhang/llm_counts/2cd9bc0dd0aa757000b8bd9d5867e986d42828aa/figures/Qwen3-32B_a100-sxm-80gb_throughput_vs_seq_len.png


--------------------------------------------------------------------------------
/figures/grpah_decode_llama2-70B_tp4_bs16_seqlen1024_genlen128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harleyszhang/llm_counts/2cd9bc0dd0aa757000b8bd9d5867e986d42828aa/figures/grpah_decode_llama2-70B_tp4_bs16_seqlen1024_genlen128.png


--------------------------------------------------------------------------------
/figures/grpah_prefill_llama2-70B_tp4_bs16_seqlen1024_genlen128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harleyszhang/llm_counts/2cd9bc0dd0aa757000b8bd9d5867e986d42828aa/figures/grpah_prefill_llama2-70B_tp4_bs16_seqlen1024_genlen128.png


--------------------------------------------------------------------------------
/figures/roofline_analysis_optimized.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harleyszhang/llm_counts/2cd9bc0dd0aa757000b8bd9d5867e986d42828aa/figures/roofline_analysis_optimized.png


--------------------------------------------------------------------------------
/images/flops_decode_llama2-70b_tp8_bs32_seqlen1024_genlen128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harleyszhang/llm_counts/2cd9bc0dd0aa757000b8bd9d5867e986d42828aa/images/flops_decode_llama2-70b_tp8_bs32_seqlen1024_genlen128.png


--------------------------------------------------------------------------------
/images/flops_prefill_llama2-70b_tp8_bs32_seqlen1024_genlen128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harleyszhang/llm_counts/2cd9bc0dd0aa757000b8bd9d5867e986d42828aa/images/flops_prefill_llama2-70b_tp8_bs32_seqlen1024_genlen128.png


--------------------------------------------------------------------------------
/images/grpah_decode_llama2-70b_tp8_bs32_seqlen1024_genlen128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harleyszhang/llm_counts/2cd9bc0dd0aa757000b8bd9d5867e986d42828aa/images/grpah_decode_llama2-70b_tp8_bs32_seqlen1024_genlen128.png


--------------------------------------------------------------------------------
/images/grpah_prefill_llama2-70b_tp8_bs32_seqlen1024_genlen128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harleyszhang/llm_counts/2cd9bc0dd0aa757000b8bd9d5867e986d42828aa/images/grpah_prefill_llama2-70b_tp8_bs32_seqlen1024_genlen128.png


--------------------------------------------------------------------------------
/images/latency_decode_llama2-70b_tp8_bs32_seqlen1024_genlen128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harleyszhang/llm_counts/2cd9bc0dd0aa757000b8bd9d5867e986d42828aa/images/latency_decode_llama2-70b_tp8_bs32_seqlen1024_genlen128.png


--------------------------------------------------------------------------------
/images/latency_prefill_llama2-70b_tp8_bs32_seqlen1024_genlen128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harleyszhang/llm_counts/2cd9bc0dd0aa757000b8bd9d5867e986d42828aa/images/latency_prefill_llama2-70b_tp8_bs32_seqlen1024_genlen128.png


--------------------------------------------------------------------------------
/images/params_llama2-70b_tp8_bs32_seqlen1024_genlen128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harleyszhang/llm_counts/2cd9bc0dd0aa757000b8bd9d5867e986d42828aa/images/params_llama2-70b_tp8_bs32_seqlen1024_genlen128.png


--------------------------------------------------------------------------------
/llm_counts/benchmark_analyzer.py:
--------------------------------------------------------------------------------
  1 | # -*- coding  : utf-8 -*-
  2 | # author      : honggao.zhang
  3 | # Create      : 2024-7-19
  4 | # Update      : 2025-01-05
  5 | # Version     : 0.2.0
  6 | # Description : transformer model(llm) profiling tools,
  7 | #               with latency, memory, flops, and params distribution analysis.
  8 | 
  9 | import logging
 10 | import pprint
 11 | import matplotlib.pyplot as plt
 12 | import argparse
 13 | import json
 14 | import copy
 15 | 
 16 | from .utils.config import *
 17 | from .utils.utils import *
 18 | 
 19 | from .count_flops import CountCausalLMFlops
 20 | from .count_params import CountCausalLMParams
 21 | from .count_memory import CountCausalLMMemory
 22 | from .count_latency import CountCausalLMLatency
 23 | from .layer_graph_visualizer import LayerAnalyzer
 24 | 
 25 | logger = logging.getLogger()
 26 | 
 27 | 
 28 | class LayerAnalyzerVisual(object):
 29 |     """Measures the latency, memory, number of estimated floating-point operations,
 30 |     and parameters of each module in a PyTorch model.
 31 |     """
 32 | 
 33 |     def __init__(self, llm_configs: LLMConfigs) -> None:
 34 |         self.llm_configs = llm_configs
 35 |         self.model_config = llm_configs.model_config
 36 |         self.gpu_config = llm_configs.gpu_config
 37 |         self.inference_config = llm_configs.inference_config
 38 |         self.parallelism_config = llm_configs.parallelism_config
 39 |         self.gpu_efficiency_config = llm_configs.gpu_efficiency_config
 40 | 
 41 |         self.h = self.model_config.hidden_size
 42 |         self.l = self.model_config.num_layers
 43 |         self.V = self.model_config.vocab_size
 44 | 
 45 |         self.b = llm_configs.inference_config.bs
 46 |         self.s = llm_configs.inference_config.seq_len
 47 |         self.o = llm_configs.inference_config.generate_len
 48 |         self.bytes_per_param = llm_configs.inference_config.bytes_per_param
 49 | 
 50 |         self.tp_size = self.parallelism_config.tp_size
 51 |         self.pp_size = self.parallelism_config.pp_size
 52 |         self.num_layers_per_gpu = int(self.l / self.parallelism_config.pp_size)
 53 | 
 54 |         self.gpu_memory_in_GB = (
 55 |             llm_configs.gpu_config.memory_GPU_in_GB * 10**9
 56 |         ) 
 57 | 
 58 |         self.llm_params = CountCausalLMParams(self.model_config)
 59 |         self.llm_flops = CountCausalLMFlops(self.model_config)
 60 |         self.llm_memory = CountCausalLMMemory(llm_configs)
 61 |         self.llm_latency = CountCausalLMLatency(llm_configs)
 62 | 
 63 |     def infer_profile(
 64 |         self,
 65 |         bs: int = 1,
 66 |         seq_len: int = 522,
 67 |         generate_len: int = 1526,
 68 |         act_dtype_bytes: int = BYTES_FP16,
 69 |         kv_cache_bytes: int = BYTES_FP16,
 70 |         qkvo_weight_dtype_bytes: int = BYTES_FP16,
 71 |         mlp_weight_dtype_bytes=BYTES_FP16,
 72 |         flops_efficiency: float = None,
 73 |         hbm_memory_efficiency: float = HBM_MEMORY_EFFICIENCY,
 74 |         intra_node_memory_efficiency=INTRA_NODE_MEMORY_EFFICIENCY,
 75 |         inter_node_memory_efficiency=INTER_NODE_MEMORY_EFFICIENCY,
 76 |         print_flag=False,
 77 |         visual_flag=False,
 78 |     ) -> dict:
 79 |         """LLM inference analysis given the llm configs and inputs."""
 80 | 
 81 |         if self.model_config.max_seq_len is not None:
 82 |             assert seq_len + generate_len <= self.model_config.max_seq_len, (
 83 |                 f"seq_len {seq_len} + generate_len {generate_len} Exceeding the model max_seq_len {self.model_config.max_seq_len}"
 84 |             )
 85 | 
 86 |         if self.l % self.pp_size != 0:
 87 |             logger.warning(
 88 |                 "Warning: the number of layers is not divisible by pp_size, please taking the floor!"
 89 |             )
 90 | 
 91 |         infer_config_dict = {
 92 |             "inference_config": {
 93 |                 "model_name": self.model_config.model_name,
 94 |                 "num_attention_heads": self.model_config.num_heads,
 95 |                 "num_kv_heads": self.model_config.num_kv_heads,
 96 |                 "head_dim": self.model_config.head_dim,
 97 |                 "hidden_size": self.model_config.hidden_size,
 98 |                 "intermediate_size": self.model_config.intermediate_size,
 99 |                 "vocab_size": self.model_config.vocab_size,
100 |                 "max_seq_len": self.model_config.max_seq_len,
101 |                 "bs": bs,
102 |                 "seq_len": seq_len,
103 |                 "tp_size": self.tp_size,
104 |                 "pp_size": self.pp_size,
105 |                 "generate_len": generate_len,
106 |             },
107 |             "gpu_config": {
108 |                 "name": self.gpu_config.name,
109 |                 "memory_GPU_in_GB": f"{self.gpu_config.memory_GPU_in_GB} GB",
110 |                 "gpu_hbm_bandwidth": f"{self.gpu_config.hbm_bandwidth_in_GB_per_sec} GB/s",
111 |                 "gpu_intra_node_bandwidth": f"{self.gpu_config.intra_node_bandwidth_in_GB_per_sec} GB/s",
112 |                 "gpu_fp16_TFLOPS": f"{self.gpu_config.peak_fp16_TFLOPS} TFLOPS",
113 |             },
114 |         }
115 | 
116 |         # -------------------------- 1. Params --------------------------
117 |         params_per_layer, dict_params_per_layer = (
118 |             self.llm_params.count_params_per_layer()
119 |         )
120 |         num_params_model = self.llm_params.count_params_model()
121 | 
122 |         # -------------------------- 2. FLOPs ---------------------------
123 |         prefill_flops_per_layer, prefill_dict_flops_per_layer = (
124 |             self.llm_flops.count_flops_per_layer(bs, seq_len, generate_len)
125 |         )
126 |         decode_flops_per_layer, decode_dict_flops_per_layer = (
127 |             self.llm_flops.count_flops_per_layer(bs, 1, generate_len)
128 |         )
129 | 
130 |         prefill_num_flops_model = self.llm_flops.count_flops_model(bs, seq_len, generate_len)
131 |         decode_num_flops_model = self.llm_flops.count_flops_model(bs, 1, generate_len)
132 | 
133 |         # -------------------------- 3. Memory --------------------------
134 |         memory_prefill_summary_dict, memory_decode_summary_dict = (
135 |             self.llm_memory.count_memory_per_gpu(
136 |                 bs,
137 |                 seq_len,
138 |                 generate_len,
139 |                 flash_attn=False,
140 |                 qkvo_weight_dtype_bytes=qkvo_weight_dtype_bytes,
141 |                 mlp_weight_dtype_bytes=mlp_weight_dtype_bytes,
142 |                 kv_cache_bytes=kv_cache_bytes,
143 |             )
144 |         )
145 | 
146 |         # -------------------------- 4. Latency -------------------------
147 |         prefill_latency_per_layer, prefill_dict_latency_per_layer = (
148 |             self.llm_latency.count_latency_per_layer(bs, seq_len, 0)
149 |         )
150 |         decode_latency_per_layer, decode_dict_latency_per_layer = (
151 |             self.llm_latency.count_latency_per_layer(bs, 1, generate_len)
152 |         )
153 |         prefill_latency_breakdown, decode_latency_breakdown = (
154 |             self.llm_latency.count_latency(
155 |                 bs,
156 |                 seq_len,
157 |                 generate_len,
158 |                 kv_cache_bytes=kv_cache_bytes,
159 |             )
160 |         )
161 | 
162 |         infer_result_dict = {
163 |             "weight_memory_per_gpu": memory_prefill_summary_dict["weight_memory_per_gpu"],
164 |             "consume_memory_per_gpu": memory_decode_summary_dict["consume_memory_per_gpu"],
165 |             "prefill_flops": prefill_num_flops_model,
166 |             "decode_flops_per_step": decode_num_flops_model,
167 |             "TTFT": prefill_latency_breakdown["TTFT"],
168 |             "TTOT": decode_latency_breakdown["TTOT"],
169 |             "kv_cache_latency": decode_latency_breakdown["kv_cache_latency"],
170 |             "total_infer_latency": prefill_latency_breakdown["TTFT"] + decode_latency_breakdown["TTOT"] * generate_len,
171 |             "support_max_batch_total_tokens": memory_decode_summary_dict["max_batch_total_tokens"],
172 |         }
173 | 
174 |         # --------------------------- 5. Memory Access ----------------------
175 |         if visual_flag:
176 |             model_type = self.model_config.model_type
177 |             llm_analyzer = LayerAnalyzer(self.model_config, self.gpu_config, tp_size=self.tp_size)
178 |             results = llm_analyzer.analyze_model(bs=bs, seq_len=seq_len, generate_len=generate_len)
179 | 
180 |             # -------------------------- 绘图：模型 graph 图示例 --------------------------
181 |             base_path = f"_{self.model_config.model_name}_tp{self.tp_size}_bs{self.b}_seqlen{self.s}_genlen{self.o}.png"
182 |             llm_analyzer.create_layer_graph(model_type, results, base_path)
183 |             # Formatter.print_format_summary_dict(results, get_dict_depth(results))
184 | 
185 |             # -------------------------- 绘图：Pie 图示例 --------------------------
186 |             prefill_latency_pie_save_path = f"./figures/latency_prefill" + base_path
187 |             decode_latency_pie_save_path = f"./figures/latency_decode" + base_path
188 |             prefill_flops_pie_save_path = f"./figures/flops_prefill" + base_path
189 |             decode_flops_pie_save_path = f"./figures/flops_decode" + base_path
190 |             params_pie_save_path = f"./figures/params" + base_path
191 | 
192 |             pie_tasks = [
193 |                 (dict_params_per_layer, "Params Distribution", params_pie_save_path),
194 |                 (prefill_dict_flops_per_layer, "Prefill FLOPS Distribution", prefill_flops_pie_save_path),
195 |                 (decode_dict_flops_per_layer, "Decode FLOPS Distribution", decode_flops_pie_save_path),
196 |                 (prefill_dict_latency_per_layer, "Prefill Latency Distribution", prefill_latency_pie_save_path),
197 |                 (decode_dict_latency_per_layer, "Decode Latency Distribution", decode_latency_pie_save_path),
198 |             ]
199 |             for data, title, path in pie_tasks:
200 |                 self.plot_distribution_pie(data, title, path)
201 | 
202 |         # ------------------------- 6. pretty‑print report --------------------
203 |         if print_flag:
204 |             self._print_report(
205 |                 infer_config_dict,
206 |                 copy.deepcopy(infer_result_dict),
207 |                 dict_params_per_layer,
208 |                 num_params_model,
209 |                 prefill_dict_flops_per_layer,
210 |                 prefill_num_flops_model,
211 |                 memory_prefill_summary_dict,
212 |                 memory_decode_summary_dict,
213 |                 prefill_latency_breakdown,
214 |                 decode_latency_breakdown,
215 |             )
216 | 
217 |         return infer_result_dict
218 | 
219 |     def plot_distribution_pie(
220 |         self,
221 |         data: dict[str, float],
222 |         title: str,
223 |         save_path: str,
224 |         *,
225 |         explode_small_pct: float = 4.0,   # explode slices whose pct < this value
226 |         label_pct_threshold: float = 0.5, # display "<x%" for very small slices
227 |         label_display_threshold: float = 2.0,  # hide outer label below this pct
228 |     ):
229 |         """
230 |         Pie chart styled similar to the user's sample:
231 | 
232 |         • Solid pie (no donut) with white borders between slices.
233 |         • Slice label placed *outside*; percentage text inside.
234 |         • Slices whose share < ``explode_small_pct`` are exploded.
235 |         • Title large, bold, perfectly centred horizontally.
236 |         """
237 |         if not data:
238 |             return
239 | 
240 |         labels = list(data.keys())
241 |         sizes = list(data.values())
242 |         total = float(sum(sizes)) or 1.0
243 | 
244 |         pct_list = [100 * s / total for s in sizes]
245 |         labels_display = [
246 |             lbl if pct >= label_display_threshold else "" for lbl, pct in zip(labels, pct_list)
247 |         ]
248 | 
249 |         # colour palette
250 |         cmap = plt.get_cmap("tab20" if len(labels) > 9 else "tab10")
251 |         colors = [cmap(i % cmap.N) for i in range(len(labels))]
252 | 
253 |         # proportional explode: smaller share → larger offset (capped at 0.18)
254 |         explode = [
255 |             min(0.18, 0.04 + (explode_small_pct - pct) / explode_small_pct * 0.10)
256 |             if (pct := 100 * s / total) < explode_small_pct
257 |             else 0
258 |             for s in sizes
259 |         ]
260 | 
261 |         # formatting tiny percentage
262 |         def _autopct(pct: float) -> str:
263 |             return (
264 |                 f"<{label_pct_threshold:.1f}%" if pct < label_pct_threshold else f"{pct:.1f}%"
265 |             )
266 | 
267 |         # high‑dpi for clarity
268 |         fig, ax = plt.subplots(figsize=(7, 6), dpi=300)
269 | 
270 |         wedges, texts, autotexts = ax.pie(
271 |             sizes,
272 |             labels=labels_display,
273 |             labeldistance=1.18,
274 |             autopct=_autopct,
275 |             pctdistance=0.78,
276 |             startangle=140,
277 |             colors=colors,
278 |             explode=explode,
279 |             wedgeprops={"edgecolor": "white", "linewidth": 1.0},
280 |             textprops={"fontsize": 10, "color": "black"},
281 |         )
282 |         # inner % text style
283 |         plt.setp(autotexts, size=9, weight="bold", color="white")
284 | 
285 |         # keep legend for color reference but remove title to save space
286 |         ax.legend(
287 |             wedges,
288 |             labels,
289 |             loc="upper center",
290 |             bbox_to_anchor=(0.5, -0.14),
291 |             ncol=min(len(labels), 5),
292 |             fontsize=9,
293 |             frameon=False,
294 |         )
295 | 
296 |         ax.axis("equal")  # perfect circle
297 | 
298 |         # Title
299 |         fig.suptitle(
300 |             title,
301 |             fontsize=18,
302 |             weight="bold",
303 |             y=0.98,
304 |             color="#2c3e50",
305 |         )
306 | 
307 |         # tidy layout – adjust bottom for legend
308 |         fig.subplots_adjust(left=0.05, right=0.95, top=0.88, bottom=0.25)
309 |         fig.savefig(save_path, bbox_inches="tight", pad_inches=0.06, dpi=300)
310 |         plt.close(fig)
311 | 
312 |     # ------------------------- Pretty‑print helpers -------------------- #
313 |     def _print_section(self, title, summary_dict, category, extra_totals=None):
314 |         """Print a single analysis section with optional totals."""
315 |         print(f"\n---------------------------- {title} ----------------------------")
316 |         Formatter.print_format_summary_dict(
317 |             summary_dict=summary_dict,
318 |             depth=get_dict_depth(summary_dict),
319 |             category=category,
320 |         )
321 |         if extra_totals:
322 |             pprint.pprint(extra_totals, indent=4, sort_dicts=False)
323 | 
324 |     def _print_report(
325 |         self,
326 |         infer_config_dict,
327 |         infer_result_dict,
328 |         dict_params_per_layer,
329 |         num_params_model,
330 |         prefill_dict_flops_per_layer,
331 |         prefill_num_flops_model,
332 |         memory_prefill_summary_dict,
333 |         memory_decode_summary_dict,
334 |         prefill_latency_breakdown,
335 |         decode_latency_breakdown,
336 |     ):
337 |         """Pretty‑print a full performance report."""
338 |         print("\n-------------------------- LLM main infer config --------------------------")
339 |         pprint.pprint(infer_config_dict, indent=4, sort_dicts=False)
340 | 
341 |         print("\n-------------------------- LLM infer performance analysis --------------------------")
342 |         Formatter.print_format_summary_dict(
343 |             infer_result_dict, get_dict_depth(infer_result_dict)
344 |         )
345 | 
346 |         sections = [
347 |             (
348 |                 "LLM Params per_layer analysis",
349 |                 dict_params_per_layer,
350 |                 "params",
351 |                 {"params_model": num_to_string(num_params_model)},
352 |             ),
353 |             (
354 |                 "LLM Prefill Flops per_layer analysis",
355 |                 prefill_dict_flops_per_layer,
356 |                 "flops",
357 |                 {"prefill flops_model": num_to_string(prefill_num_flops_model)},
358 |             ),
359 |             (
360 |                 "LLM Memory analysis (Prefill)",
361 |                 memory_prefill_summary_dict,
362 |                 "memory",
363 |                 None,
364 |             ),
365 |             (
366 |                 "LLM Memory analysis (Decode)",
367 |                 memory_decode_summary_dict,
368 |                 "memory",
369 |                 None,
370 |             ),
371 |             (
372 |                 "LLM Latency analysis (Prefill)",
373 |                 prefill_latency_breakdown,
374 |                 "latency",
375 |                 None,
376 |             ),
377 |             (
378 |                 "LLM Latency analysis (Decode)",
379 |                 decode_latency_breakdown,
380 |                 "latency",
381 |                 None,
382 |             ),
383 |         ]
384 | 
385 |         for title, summary_dict, category, extra in sections:
386 |             self._print_section(title, summary_dict, category, extra)
387 | 
388 | def llm_profile(
389 |     model_name,
390 |     gpu_name: str = "a100-sxm-40gb",
391 |     bytes_per_param: int = BYTES_FP16,
392 |     batch_size: int = 20,
393 |     seq_len: int = 1024,
394 |     generate_len=1024,
395 |     dp_size: int = 1,
396 |     tp_size: int = 8,
397 |     pp_size: int = 1,
398 |     sp_size: int = 1,
399 |     act_dtype_bytes: int = BYTES_FP16,
400 |     kv_cache_bytes: int = BYTES_FP16,
401 |     flops_efficiency: float = FLOPS_EFFICIENCY,
402 |     hbm_memory_efficiency: float = HBM_MEMORY_EFFICIENCY,
403 |     intra_node_memory_efficiency=INTRA_NODE_MEMORY_EFFICIENCY,
404 |     inter_node_memory_efficiency=INTER_NODE_MEMORY_EFFICIENCY,
405 |     print_flag: bool = False,
406 |     visual_flag: bool = False,
407 | ) -> dict:
408 |     """Returns dict of the total floating-point operations, MACs, parameters and latency of a llm.
409 |     It now returns a dictionary containing FLOPs, latency, HBM memory usage and max_batch_total_tokens.
410 | 
411 |     Args:
412 |         model_name (str, optional): model name to query the pre-defined `model_configs.json`. 
413 |             Defaults to "llama-13b".
414 |         gpu_name (str, optional): gpu name to query the pre-defined `model_configs.json`. 
415 |             Defaults to "v100-sxm2-32gb".
416 |         batch_size (int, optional): _description_. Defaults to 1.
417 |         seq_len (int, optional): batch size per GPU.. Defaults to 522.
418 |         generate_len (int, optional): The maximum numbers of tokens to generate, 
419 |             ignoring the number of tokens in the prompt. Defaults to 1526.
420 |         dp_size (int, optional): data parallelism size. Defaults to 1.
421 |         tp_size (int, optional): tensor parallelism size. Defaults to 1.
422 |         pp_size (int, optional): pipeline parallelism size. Defaults to 1.
423 |         sp_size (int, optional): sequence parallelism size. Defaults to 1.
424 |             past last key/values attentions (if applicable to the model) to speed up decoding. Defaults to True.
425 |         layernorm_dtype_bytes (int, optional): number of bytes in the data type for the layernorm activations..
426 |             Defaults to BYTES_FP16.
427 |         kv_cache_bytes (int, optional): number of bytes in the data type for the kv_cache. Defaults to None.
428 |         flops_efficiency (float, optional): flops efficiency, ranging from 0 to 1. Defaults to None.
429 |         hbm_memory_efficiency (float, optional): GPU HBM memory efficiency, ranging from 0 to 1. 
430 |             Defaults to HBM_MEMORY_EFFICIENCY.
431 |         intra_node_memory_efficiency (_type_, optional): intra-node memory efficiency, ranging from 0 to 1.. 
432 |             Defaults to INTRA_NODE_MEMORY_EFFICIENCY.
433 |         inter_node_memory_efficiency (_type_, optional): inter-node memory efficiency, ranging from 0 to 1.. 
434 |             Defaults to INTER_NODE_MEMORY_EFFICIENCY.
435 | 
436 |     Returns:
437 |         dict: a summary dictionary of the inference analysis
438 |     """
439 |     model_config, gpu_config = get_model_and_gpu_config_by_name(model_name, gpu_name)
440 | 
441 |     parallelism_config = ParallelismConfig(
442 |         tp_size=tp_size, pp_size=pp_size, dp_size=dp_size, sp_size=sp_size
443 |     )
444 | 
445 |     inference_config = InferenceConfig(
446 |         bs=batch_size,
447 |         seq_len=seq_len,
448 |         generate_len=generate_len,
449 |         bytes_per_param=bytes_per_param,
450 |         act_dtype_bytes=act_dtype_bytes,
451 |         kv_cache_bytes=kv_cache_bytes,
452 |     )
453 | 
454 |     gpu_efficiency_config = GPUEfficiencyConfig(
455 |         flops_efficiency=flops_efficiency,
456 |         hbm_memory_efficiency=hbm_memory_efficiency,
457 |         intra_node_memory_efficiency=intra_node_memory_efficiency,
458 |         inter_node_memory_efficiency=inter_node_memory_efficiency,
459 |     )
460 | 
461 |     llm_configs = LLMConfigs(
462 |         model_config=model_config,
463 |         gpu_config=gpu_config,
464 |         parallelism_config=parallelism_config,
465 |         inference_config=inference_config,
466 |         gpu_efficiency_config=gpu_efficiency_config,
467 |     )
468 | 
469 |     profiler = LayerAnalyzerVisual(llm_configs)
470 | 
471 |     infer_result_dict = profiler.infer_profile(
472 |         bs=batch_size,
473 |         seq_len=seq_len,
474 |         generate_len=generate_len,
475 |         act_dtype_bytes=act_dtype_bytes,
476 |         flops_efficiency=flops_efficiency,
477 |         hbm_memory_efficiency=hbm_memory_efficiency,
478 |         print_flag=print_flag,
479 |         visual_flag=visual_flag,
480 |     )
481 | 
482 |     # ---------------------------------------------------------------------
483 |     # Collect summary metrics (keep raw numbers for downstream maths)      #
484 |     # ---------------------------------------------------------------------
485 |     weight_memory_per_gpu = infer_result_dict.get("weight_memory_per_gpu", None)
486 |     consume_memory_per_gpu = infer_result_dict.get("consume_memory_per_gpu", None)
487 |     prefill_flops = infer_result_dict.get("prefill_flops", None)
488 |     
489 |     table_results = {
490 |         "seq_len": seq_len,
491 |         "generate_len": generate_len,
492 |         "prefill_flops": num_to_string(prefill_flops),           
493 |         "weight_memory_per_gpu": num_to_string(weight_memory_per_gpu),
494 |         "consume_memory_per_gpu": num_to_string(consume_memory_per_gpu),       
495 |         "TTFT": infer_result_dict.get("TTFT", None),
496 |         "TTOT": infer_result_dict.get("TTOT", None),
497 |         "Total_latency": infer_result_dict.get("total_infer_latency", None),
498 |     }
499 |     visual_results = {
500 |         "seq_len": seq_len,
501 |         "generate_len": generate_len,
502 |         "prefill_flops": prefill_flops,                   # raw number
503 |         "weight_memory_per_gpu": weight_memory_per_gpu,
504 |         "consume_memory_per_gpu": consume_memory_per_gpu, # raw bytes
505 |         "TTFT": infer_result_dict.get("TTFT", None),
506 |         "TTOT": infer_result_dict.get("TTOT", None),
507 |         "Total_latency": infer_result_dict.get("total_infer_latency", None),
508 |     }
509 |     return table_results, visual_results
510 | 
511 | 
512 | # ----------------------------- Command‑line interface ----------------------------- #
513 | def _cli():
514 |     """Command‑line wrapper for quick profiling."""
515 |     parser = argparse.ArgumentParser(description="LLMCounts – quick model inference profiler")
516 |     parser.add_argument("--model_name", required=True, help="Model name defined in model_configs.json")
517 |     parser.add_argument("--gpu_name", default="a100-sxm-40gb", help="GPU name defined in model_configs.json")
518 |     parser.add_argument("--batch_size", type=int, default=1)
519 |     parser.add_argument("--seq_len", type=int, default=1024)
520 |     parser.add_argument("--generate_len", type=int, default=1024)
521 |     parser.add_argument("--tp_size", type=int, default=1)
522 |     parser.add_argument("--pp_size", type=int, default=1)
523 |     parser.add_argument("--dp_size", type=int, default=1)
524 |     parser.add_argument("--sp_size", type=int, default=1)
525 |     parser.add_argument("--visual", action="store_true", help="Generate pie‑charts and layer graph")
526 |     parser.add_argument("--print", dest="print_flag", action="store_true", help="Pretty‑print verbose breakdown")
527 |     parser.add_argument("--json", dest="json_flag", action="store_true", help="Output raw results as JSON")
528 |     args = parser.parse_args()
529 | 
530 |     table_results, visual_results = llm_profile(
531 |         model_name=args.model_name,
532 |         gpu_name=args.gpu_name,
533 |         batch_size=args.batch_size,
534 |         seq_len=args.seq_len,
535 |         generate_len=args.generate_len,
536 |         tp_size=args.tp_size,
537 |         pp_size=args.pp_size,
538 |         dp_size=args.dp_size,
539 |         sp_size=args.sp_size,
540 |         print_flag=args.print_flag,
541 |         visual_flag=args.visual,
542 |     )
543 | 
544 |     if args.json_flag:
545 |         print(json.dumps(visual_results, indent=2))
546 |     else:
547 |         import pprint
548 |         pprint.pprint(table_results, indent=2)
549 | 
550 | 
551 | if __name__ == "__main__":
552 |     _cli()


--------------------------------------------------------------------------------
/llm_counts/configs/gpu_configs.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "t4-pcie-15gb": {
  3 |         "name": "t4-pcie-15gb",
  4 |         "memory_GPU_in_GB": 15,
  5 |         "hbm_bandwidth_in_GB_per_sec": 300,
  6 |         "intra_node_bandwidth_in_GB_per_sec": 32,
  7 |         "peak_fp16_TFLOPS": 65,
  8 |         "peak_int8_TFLOPS": 130,
  9 |         "peak_int4_TFLOPS": 260,
 10 |         "intra_node_min_message_latency": 8e-06
 11 |     },
 12 |     "v100-pcie-32gb": {
 13 |         "name": "v100-pcie-32gb",
 14 |         "memory_GPU_in_GB": 32,
 15 |         "hbm_bandwidth_in_GB_per_sec": 900,
 16 |         "intra_node_bandwidth_in_GB_per_sec": 32,
 17 |         "inter_node_bandwidth_in_GB_per_sec": 200,
 18 |         "peak_fp16_TFLOPS": 112,
 19 |         "peak_int8_TFLOPS": 224,
 20 |         "peak_int4_TFLOPS": 448,
 21 |         "intra_node_min_message_latency": 8e-06,
 22 |         "onchip_buffer": 20480e3
 23 |     },
 24 |     "v100-sxm-32gb": {
 25 |         "name": "v100-sxm-32gb",
 26 |         "memory_GPU_in_GB": 32,
 27 |         "hbm_bandwidth_in_GB_per_sec": 900,
 28 |         "intra_node_bandwidth_in_GB_per_sec": 300,
 29 |         "inter_node_bandwidth_in_GB_per_sec": 200,
 30 |         "peak_fp16_TFLOPS": 112,
 31 |         "peak_int8_TFLOPS": 224,
 32 |         "peak_int4_TFLOPS": 448,
 33 |         "intra_node_min_message_latency": 8e-06,
 34 |         "onchip_buffer": 20480e3
 35 |     },
 36 |     "br104p": {
 37 |         "name": "br104p",
 38 |         "memory_GPU_in_GB": 32,
 39 |         "hbm_bandwidth_in_GB_per_sec": 819,
 40 |         "intra_node_bandwidth_in_GB_per_sec": 192,
 41 |         "inter_node_bandwidth_in_GB_per_sec": 200,
 42 |         "peak_fp32_TFLOPS": 256,
 43 |         "peak_fp16_TFLOPS": 512,
 44 |         "peak_int8_TFLOPS": 1024,
 45 |         "intra_node_min_message_latency": 8e-06
 46 |     },
 47 |     "a100-pcie-40gb": {
 48 |         "name": "a100-pcie-40gb",
 49 |         "memory_GPU_in_GB": 40,
 50 |         "hbm_bandwidth_in_GB_per_sec": 1555,
 51 |         "intra_node_bandwidth_in_GB_per_sec": 64,
 52 |         "inter_node_bandwidth_in_GB_per_sec": 200,
 53 |         "peak_fp32_TFLOPS": 156,
 54 |         "peak_fp16_TFLOPS": 312,
 55 |         "peak_int8_TFLOPS": 624,
 56 |         "peak_int4_TFLOPS": 1248,
 57 |         "intra_node_min_message_latency": 8e-06,
 58 |         "onchip_buffer": 27648e3
 59 |     },
 60 |     "a100-sxm-40gb": {
 61 |         "name": "a100-sxm-40gb",
 62 |         "memory_GPU_in_GB": 40,
 63 |         "hbm_bandwidth_in_GB_per_sec": 1555,
 64 |         "intra_node_bandwidth_in_GB_per_sec": 600,
 65 |         "inter_node_bandwidth_in_GB_per_sec": 200,
 66 |         "peak_fp32_TFLOPS": 156,
 67 |         "peak_fp16_TFLOPS": 312,
 68 |         "peak_int8_TFLOPS": 624,
 69 |         "peak_int4_TFLOPS": 1248,
 70 |         "intra_node_min_message_latency": 8e-06,
 71 |         "onchip_buffer": 27648e3
 72 |     },
 73 |     "a100-pcie-80gb": {
 74 |         "name": "a100-pcie-80gb",
 75 |         "memory_GPU_in_GB": 80,
 76 |         "hbm_bandwidth_in_GB_per_sec": 1935,
 77 |         "intra_node_bandwidth_in_GB_per_sec": 64,
 78 |         "inter_node_bandwidth_in_GB_per_sec": 200,
 79 |         "peak_fp32_TFLOPS": 156,
 80 |         "peak_fp16_TFLOPS": 312,
 81 |         "peak_int8_TFLOPS": 624,
 82 |         "peak_int4_TFLOPS": 1248,
 83 |         "intra_node_min_message_latency": 8e-06,
 84 |         "onchip_buffer": 27648e3
 85 |     },
 86 |     "a100-sxm-80gb": {
 87 |         "name": "a100-sxm-80gb",
 88 |         "memory_GPU_in_GB": 80,
 89 |         "hbm_bandwidth_in_GB_per_sec": 2039,
 90 |         "intra_node_bandwidth_in_GB_per_sec": 600,
 91 |         "inter_node_bandwidth_in_GB_per_sec": 200,
 92 |         "peak_fp32_TFLOPS": 156,
 93 |         "peak_fp16_TFLOPS": 312,
 94 |         "peak_int8_TFLOPS": 624,
 95 |         "peak_int4_TFLOPS": 1248,
 96 |         "intra_node_min_message_latency": 8e-06,
 97 |         "onchip_buffer": 27648e3
 98 |     },
 99 |     "910b-64gb": {
100 |         "name": "910b-64gb",
101 |         "memory_GPU_in_GB": 64,
102 |         "hbm_bandwidth_in_GB_per_sec": 460,
103 |         "intra_node_bandwidth_in_GB_per_sec": 392,
104 |         "inter_node_bandwidth_in_GB_per_sec": 200,
105 |         "peak_fp32_TFLOPS": 188,
106 |         "peak_fp16_TFLOPS": 376,
107 |         "peak_int8_TFLOPS": 752,
108 |         "peak_int4_TFLOPS": 1504,
109 |         "intra_node_min_message_latency": 9e-06
110 |     },
111 |     "h100-sxm-80gb": {
112 |         "name": "a100-sxm-80gb",
113 |         "memory_GPU_in_GB": 80,
114 |         "hbm_bandwidth_in_GB_per_sec": 3430,
115 |         "intra_node_bandwidth_in_GB_per_sec": 900,
116 |         "inter_node_bandwidth_in_GB_per_sec": 400,
117 |         "peak_fp32_TFLOPS": 989,
118 |         "peak_fp16_TFLOPS": 1979,
119 |         "peak_int8_TFLOPS": 3958,
120 |         "intra_node_min_message_latency": 8e-06,
121 |         "onchip_buffer": 33792e3
122 |     },
123 |     "h100-pcie-80gb": {
124 |         "name": "a100-sxm-80gb",
125 |         "memory_GPU_in_GB": 80,
126 |         "hbm_bandwidth_in_GB_per_sec": 2048,
127 |         "intra_node_bandwidth_in_GB_per_sec": 128,
128 |         "inter_node_bandwidth_in_GB_per_sec": 400,
129 |         "peak_fp32_TFLOPS": 756,
130 |         "peak_fp16_TFLOPS": 1513,
131 |         "peak_int8_TFLOPS": 3026,
132 |         "intra_node_min_message_latency": 8e-06,
133 |         "onchip_buffer": 33792e3
134 |     },
135 |     "a30-pcie-24gb": {
136 |         "name": "a30-pcie-24gb",
137 |         "memory_GPU_in_GB": 24,
138 |         "hbm_bandwidth_in_GB_per_sec": 933,
139 |         "intra_node_bandwidth_in_GB_per_sec": 64,
140 |         "inter_node_bandwidth_in_GB_per_sec": 200,
141 |         "peak_fp32_TFLOPS": 82,
142 |         "peak_fp16_TFLOPS": 165,
143 |         "peak_int8_TFLOPS": 330,
144 |         "peak_int4_TFLOPS": 661,
145 |         "intra_node_min_message_latency": 8e-06
146 |     },
147 |     "a30-sxm-24gb": {
148 |         "name": "a30-sxm-24gb",
149 |         "memory_GPU_in_GB": 24,
150 |         "hbm_bandwidth_in_GB_per_sec": 933,
151 |         "intra_node_bandwidth_in_GB_per_sec": 200,
152 |         "inter_node_bandwidth_in_GB_per_sec": 200,
153 |         "peak_fp32_TFLOPS": 82,
154 |         "peak_fp16_TFLOPS": 165,
155 |         "peak_int8_TFLOPS": 330,
156 |         "peak_int4_TFLOPS": 661,
157 |         "intra_node_min_message_latency": 8e-06
158 |     },
159 |     "a40-pcie-48gb": {
160 |         "name": "a40-pcie-48gb",
161 |         "memory_GPU_in_GB": 44.98,
162 |         "hbm_bandwidth_in_GB_per_sec": 696,
163 |         "intra_node_bandwidth_in_GB_per_sec": 64,
164 |         "inter_node_bandwidth_in_GB_per_sec": 200,
165 |         "peak_fp32_TFLOPS": 74.8,
166 |         "peak_fp16_TFLOPS": 149.7,
167 |         "peak_int8_TFLOPS": 299.3,
168 |         "peak_int4_TFLOPS": 598.7,
169 |         "intra_node_min_message_latency": 8e-06
170 |     }
171 | }


--------------------------------------------------------------------------------
/llm_counts/configs/gpu_perf.ini:
--------------------------------------------------------------------------------
 1 | [T4]
 2 | gpu_memory=16GB
 3 | single_precision=8.1TFLOPS
 4 | gpu_memory_bandwidth=300GB/s
 5 | interconnect_bandwidth=32GB/s
 6 | [L4]
 7 | gpu_memory=30GB
 8 | single_precision=24TFLOPS
 9 | gpu_memory_bandwidth=300GB/s
10 | interconnect_bandwidth=64GB/s
11 | [L40]
12 | gpu_memory=48GB
13 | single_precision=90.5TFLOPS
14 | gpu_memory_bandwidth=864GB/s
15 | interconnect_bandwidth=64GB/s
16 | [V100]
17 | gpu_memory=36GB
18 | single_precision=14TFLOPS
19 | gpu_memory_bandwidth=900GB/s
20 | interconnect_bandwidth=32GB/s
21 | [A100]
22 | gpu_memory=80GB
23 | single_precision=19.5TFLOPS
24 | gpu_memory_bandwidth=1935GB/s
25 | interconnect_bandwidth=64GB/s


--------------------------------------------------------------------------------
/llm_counts/configs/model_configs.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "llama-7B":{
  3 |         "num_layers": 32,
  4 |         "num_heads": 32,
  5 |         "hidden_size": 4096,
  6 |         "intermediate_size": 11008,
  7 |         "vocab_size": 32000,
  8 |         "max_seq_len": 2048,
  9 |         "model_type": "llama",
 10 |         "model_name": "llama-7B"
 11 |     },
 12 |     "llama-13B":{
 13 |         "num_layers": 40,
 14 |         "num_heads": 40,
 15 |         "hidden_size": 5120,
 16 |         "intermediate_size": 13824,
 17 |         "vocab_size": 32000,
 18 |         "max_seq_len": 2048,
 19 |         "model_type": "llama",
 20 |         "model_name": "llama-13B"
 21 |     },
 22 |     "llama-30B":{
 23 |         "num_layers": 60,
 24 |         "num_heads": 52,
 25 |         "hidden_size": 6656,
 26 |         "intermediate_size": 17920,
 27 |         "vocab_size": 32000,
 28 |         "max_seq_len": 2048,
 29 |         "model_type": "llama",
 30 |         "model_name": "llama-30B"
 31 |     },
 32 |     "llama-65B":{
 33 |         "num_layers": 80,
 34 |         "num_heads": 64,
 35 |         "hidden_size": 8192,
 36 |         "intermediate_size": 22016,
 37 |         "vocab_size": 32000,
 38 |         "max_seq_len": 2048,
 39 |         "model_type": "llama",
 40 |         "model_name": "llama-65B"
 41 |     },
 42 |     "llama2-13B":{
 43 |         "num_layers": 40,
 44 |         "num_heads": 40,
 45 |         "num_kv_heads": 40,
 46 |         "hidden_size": 5120,
 47 |         "intermediate_size": 13824,
 48 |         "vocab_size": 32000,
 49 |         "max_seq_len": 4096,
 50 |         "model_type": "llama",
 51 |         "model_name": "llama2-13B"
 52 |     },
 53 |     "llama2-70B":{
 54 |         "num_layers": 80,
 55 |         "num_heads": 64,
 56 |         "num_kv_heads": 8,
 57 |         "hidden_size": 8192,
 58 |         "intermediate_size": 28672,
 59 |         "vocab_size": 32000,
 60 |         "max_seq_len": 4096,
 61 |         "model_type": "llama2",
 62 |         "model_name": "llama2-70B"
 63 |     },
 64 |     "internlm-20B": {
 65 |         "num_layers": 60,
 66 |         "num_heads": 40,
 67 |         "num_kv_heads": 40,
 68 |         "hidden_size": 5120,
 69 |         "intermediate_size": 13824,
 70 |         "vocab_size": 103168,
 71 |         "max_seq_len": 16384,
 72 |         "model_type": "internlm",
 73 |         "model_name": "internlm-20B"
 74 |     },
 75 |     "internlm2-20b-chat": {
 76 |         "num_layers": 48,
 77 |         "num_heads": 48,
 78 |         "num_kv_heads": 8,
 79 |         "hidden_size": 6144,
 80 |         "intermediate_size": 16384,
 81 |         "vocab_size": 92544,
 82 |         "max_seq_len": 32768,
 83 |         "model_type": "internlm2",
 84 |         "model_name": "internlm2-20b-chat"
 85 |     },
 86 |     "Qwen3-8B": {
 87 |         "num_layers": 36,
 88 |         "head_dim": 128,
 89 |         "hidden_size": 4096,
 90 |         "num_heads": 32,
 91 |         "num_kv_heads": 8,
 92 |         "intermediate_size": 12288,
 93 |         "vocab_size": 151936,
 94 |         "max_seq_len": 40960,
 95 |         "model_type": "qwen3",
 96 |         "model_name": "Qwen3-8B"
 97 |     },
 98 |     "Qwen3-32B": {
 99 |         "num_layers": 64,
100 |         "head_dim": 128,
101 |         "hidden_size": 5120,
102 |         "num_heads": 64,
103 |         "num_kv_heads": 8,
104 |         "intermediate_size": 25600,
105 |         "vocab_size": 151936,
106 |         "max_seq_len": 40960,
107 |         "model_type": "qwen3",
108 |         "model_name": "Qwen3-32B"
109 |     }
110 | }
111 | 


--------------------------------------------------------------------------------
/llm_counts/count_flops.py:
--------------------------------------------------------------------------------
  1 | from .utils.config import ModelConfig
  2 | 
  3 | 
  4 | class CountCausalLMFlops(object):
  5 |     """CountCausalLMFlops is a class that counts the number of floating point operations (FLOPs) 
  6 |     for a causal language model (LLM) during the forward passes."""
  7 | 
  8 |     def __init__(
  9 |         self,
 10 |         model_config: ModelConfig,
 11 |     ) -> None:
 12 |         self.model_type = model_config.model_type
 13 |         self.num_heads = model_config.num_heads
 14 |         self.num_kv_heads = model_config.num_kv_heads
 15 |         self.head_dim = model_config.head_dim
 16 |         self.hidden_size = model_config.hidden_size
 17 |         self.intermediate_size = model_config.intermediate_size
 18 |         self.l = model_config.num_layers
 19 |         self.V = model_config.vocab_size
 20 | 
 21 |     def count_flops_per_layer_qkvo_proj(self, bs: int, seq_len: int) -> int:
 22 |         """Get the number of floating point operations (flops) for the forward
 23 |         pass of the attention linear layers, given the batch size and sequence length.
 24 | 
 25 |         flops_qkvo_proj = flops_q + flops_k + flops_v + flops_output
 26 | 
 27 |         Args:
 28 |             bs (int): batch size
 29 |             seq_len (int): sequence length
 30 |         """
 31 |         q_proj_flops = 2 * bs * seq_len * self.hidden_size * self.num_heads * self.head_dim
 32 |         k_proj_flops = 2 * bs * seq_len * self.hidden_size * self.num_kv_heads * self.head_dim
 33 |         v_proj_flops = 2 * bs * seq_len * self.hidden_size * self.num_kv_heads * self.head_dim
 34 |         o_proj_flops = 2 * bs * seq_len * self.hidden_size * self.num_heads * self.head_dim
 35 |         qkvo_proj_flops = q_proj_flops + k_proj_flops + v_proj_flops + o_proj_flops
 36 | 
 37 |         return qkvo_proj_flops
 38 | 
 39 |     def count_flops_per_layer_mlp(self, bs: int, seq_len: int) -> int:
 40 |         """Count two flops of matrices multiplication(two linear layers in the MLP module.)
 41 |         eg. llama3.2-1B: self.intermediate_size = 4 * self.hidden_size
 42 |         eg. flops_mlp(llama3.2-1B) = flops_fc1 + flops_fc2 + flops_fc3 
 43 |                                    = 2bs(4h^2) + 2bs(4h^2) + 2bs(4h^2) = 24bsh^2
 44 |         """
 45 |         flops_gate_proj = 2 * bs * seq_len * self.hidden_size * self.intermediate_size
 46 |         flops_up_proj = 2 * bs * seq_len * self.hidden_size * self.intermediate_size
 47 |         flops_down_proj = 2 * bs * seq_len * self.intermediate_size * self.hidden_size
 48 | 
 49 |         return flops_gate_proj + flops_up_proj + flops_down_proj
 50 |     
 51 |     def count_flops_per_layer_attn_kernel(self, bs: int, seq_len: int, generate_len: int) -> int:
 52 |         q_norm_flops = bs * 4 * seq_len * self.head_dim
 53 |         k_norm_flops = q_norm_flops
 54 |         # e^x / sum(e^x); bs = 1 和 seq_len = 1 时 flops 为 3d-1, 张量中每个元素约执行 3 次操作
 55 |         softmax_flops =  bs * 3 * seq_len * self.num_heads * self.head_dim
 56 | 
 57 |         if seq_len != 1:
 58 |             qk_matmul_flops = bs * 2 * seq_len * seq_len * self.num_heads * self.head_dim
 59 |             sv_matmul_flops = qk_matmul_flops
 60 |    
 61 |         else:
 62 |             qk_matmul_flops = 2 * self.num_heads * self.head_dim * (seq_len + generate_len)
 63 |             sv_matmul_flops = qk_matmul_flops
 64 |             
 65 |         flops_self_attention_kernel = q_norm_flops + k_norm_flops + qk_matmul_flops + sv_matmul_flops + softmax_flops
 66 | 
 67 |         return flops_self_attention_kernel
 68 | 
 69 |     def count_flops_per_layer_norm(self, bs: int, seq_len: int) -> int:
 70 |         """flops of 2 rmsnrom per layer"""
 71 |         return bs * 4 * seq_len * self.hidden_size
 72 | 
 73 |     def count_flops_per_layer(self, bs: int, seq_len: int, generate_len:int) -> tuple:
 74 |         flops_per_layer_qkvo_proj = self.count_flops_per_layer_qkvo_proj(bs, seq_len)
 75 |         flops_per_layer_mlp = self.count_flops_per_layer_mlp(bs, seq_len)
 76 | 
 77 |         flops_per_layer_attention_kernel = self.count_flops_per_layer_attn_kernel(
 78 |             bs, seq_len, generate_len,
 79 |         )
 80 |         flops_per_layer_rmsnorm = (
 81 |             self.count_flops_per_layer_norm(bs, seq_len) * 2
 82 |         )  # atten_rmsnorm and mlp_rmsnorm
 83 | 
 84 |         flops_positional_embedding = self.count_flops_positional_embedding(bs, seq_len)
 85 | 
 86 |         flops_per_layer = (
 87 |             flops_per_layer_qkvo_proj
 88 |             + flops_per_layer_mlp
 89 |             + flops_per_layer_rmsnorm
 90 |             + flops_per_layer_attention_kernel
 91 |             + flops_positional_embedding
 92 |         )
 93 | 
 94 |         dict_flops_per_layer = {
 95 |             "attention_kernel": flops_per_layer_attention_kernel,
 96 |             "qkvo_proj": flops_per_layer_qkvo_proj,
 97 |             "mlp": flops_per_layer_mlp,
 98 |             "rmsnorm": flops_per_layer_rmsnorm * 2,
 99 |             "positional_embedding": flops_positional_embedding,
100 |             "input_embedding": 0,
101 |         }
102 | 
103 |         return flops_per_layer, dict_flops_per_layer
104 | 
105 |     def count_flops_positional_embedding(
106 |         self,
107 |         bs:int,
108 |         seq_len:int,
109 |     ) -> int:
110 |         """flops of output token logits layer"""
111 |         return 2 * bs * seq_len * self.hidden_size
112 | 
113 |     def count_flops_model(self, bs: int, seq_len: int, generate_len: int) -> int:
114 |         """Count flops of the forward pass of the transformer model, 
115 |         given the batch size and sequence length.
116 |         """
117 |         num_flops_model = self.count_flops_per_layer(bs, seq_len, generate_len)[0] * self.l
118 |     
119 |         return num_flops_model
120 | 
121 |     def count_flops_bwd_model(self, bs: int, seq_len: int, generate_len:int) -> int:
122 |         """Get the number of floating point operations (flops) for the backward
123 |         pass of the entire transformer model, given the batch size and sequence
124 |         """
125 |         return 2 * self.count_flops_model(bs, seq_len, generate_len)
126 | 


--------------------------------------------------------------------------------
/llm_counts/count_latency.py:
--------------------------------------------------------------------------------
  1 | from .utils.config import (
  2 |     LLMConfigs,
  3 |     get_gpu_hbm_bandwidth,
  4 |     get_intra_node_bandwidth,
  5 |     get_TFLOPS_per_gpu,
  6 | )
  7 | from .utils.constants import *
  8 | from .utils.utils import latency_to_string
  9 | 
 10 | from .count_flops import CountCausalLMFlops
 11 | from .count_params import CountCausalLMParams
 12 | from .count_memory import CountCausalLMMemory
 13 | 
 14 | 
 15 | class CountCausalLMLatency(object):
 16 |     """Count latency by roof-line performance model."""
 17 | 
 18 |     def __init__(self, llm_configs: LLMConfigs) -> None:
 19 |         self.model_config = llm_configs.model_config
 20 |         self.gpu_config = llm_configs.gpu_config
 21 |         self.inference_config = llm_configs.inference_config
 22 |         self.parallelism_config = llm_configs.parallelism_config
 23 | 
 24 |         self.h = self.model_config.hidden_size
 25 |         self.l = self.model_config.num_layers
 26 |         self.V = self.model_config.vocab_size
 27 | 
 28 |         self.b = llm_configs.inference_config.bs
 29 |         self.s = llm_configs.inference_config.seq_len
 30 |         self.o = llm_configs.inference_config.generate_len
 31 |         self.bytes_per_param = llm_configs.inference_config.bytes_per_param
 32 | 
 33 |         self.tp_size = self.parallelism_config.tp_size
 34 |         self.pp_size = self.parallelism_config.pp_size
 35 |         self.num_layers_per_gpu = int(self.l / self.parallelism_config.pp_size)
 36 | 
 37 |         self.gpu_hbm_bandwidth, self.onchip_buffer = (
 38 |             get_gpu_hbm_bandwidth(self.gpu_config, HBM_MEMORY_EFFICIENCY)
 39 |         )  
 40 |         self.gpu_hbm_bandwidth *= 10**9 
 41 |         
 42 |         self.gpu_intra_node_bandwidth = (
 43 |             get_intra_node_bandwidth(self.gpu_config, INTRA_NODE_MEMORY_EFFICIENCY)
 44 |             * 10**9
 45 |         )  # intra node bandwidth，GB/s
 46 |         self.gpu_TFLOPS = (
 47 |             get_TFLOPS_per_gpu(self.gpu_config, flops_efficiency=FLOPS_EFFICIENCY)
 48 |             * 10**12
 49 |         )  # TFLOPS
 50 | 
 51 |         self.llm_params = CountCausalLMParams(self.model_config)
 52 |         self.llm_memory = CountCausalLMMemory(llm_configs)
 53 |         self.llm_flops = CountCausalLMFlops(self.model_config)
 54 | 
 55 |     @staticmethod
 56 |     def print_kernel_bound_info(stage, memory_latency, compute_latency, ops_type):
 57 |         """Print the kernel bound information for the given stage."""
 58 |         if memory_latency > compute_latency:
 59 |             print(
 60 |                 f"{stage} stage: memory_latency {latency_to_string(memory_latency)} \
 61 |                 > compute_latency {latency_to_string(compute_latency)}, this {ops_type} layer is memory bound!"
 62 |             )
 63 |         else:
 64 |             print(
 65 |                 f"{stage} stage: memory_latency {latency_to_string(memory_latency)} \
 66 |                 <= compute_latency {latency_to_string(compute_latency)}, this {ops_type} layer is compute bound!"
 67 |             )
 68 | 
 69 |     def common_count_latency_for_ops(
 70 |         self,
 71 |         bs: int,
 72 |         seq_len: int,
 73 |         generate_len: int = 0,
 74 |         ops_type: str = "qkvo_proj",
 75 |         stage="decode_",
 76 |         print_bound: bool = False,
 77 |     ) -> float:
 78 |         """Count the latency for the forward layer or model, assuming the compute and memory operations are perfectly overlapped.
 79 | 
 80 |         Args:
 81 |             flops (float): flops of the forward layer or model
 82 |             memory (float): r/w memory(bytes) of the forward layer or model
 83 |             tp_size (float): tensor parallelism size
 84 |             gpu_TFLOPS (float): GPU TFLOPS in T(10^12)FLOPS
 85 |             gpu_hbm_bandwidth (float): GPU HBM bandwidth in GB/s(10^9)
 86 | 
 87 |         Returns:
 88 |             float: the latency in seconds for the forward pass
 89 |         """
 90 |         ops_type = ops_type.lower()
 91 | 
 92 |         if ops_type == "qkvo_proj":
 93 |             flops = (
 94 |                 self.llm_flops.count_flops_per_layer_qkvo_proj(bs, seq_len)
 95 |                 / self.tp_size
 96 |             )
 97 |             weight_memory = (
 98 |                 self.llm_params.count_params_per_layer_mha()
 99 |                 * self.bytes_per_param
100 |                 / self.tp_size
101 |             ) * BYTES_FP16
102 |             mac = self.llm_memory.count_mac_per_layer_qkvo_proj(bs, seq_len)[1] / self.tp_size
103 |             
104 |             memory = weight_memory + mac
105 |         elif ops_type == "attn_kernel":
106 |             flops = (
107 |                 self.llm_flops.count_flops_per_layer_attn_kernel(bs, seq_len, generate_len)
108 |                 / self.tp_size
109 |             )
110 |             weight_memory = 0
111 |             mac = self.llm_memory.count_mac_per_layer_attn_kernel(bs, seq_len, generate_len, kv_cache_bytes=BYTES_FP16)[1] / self.tp_size
112 |             memory = weight_memory + mac
113 | 
114 |         elif ops_type == "mlp":
115 |             flops = self.llm_flops.count_flops_per_layer_mlp(bs, seq_len) / self.tp_size
116 |             weight_memory = (
117 |                 self.llm_params.count_params_per_layer_mlp()
118 |                 * self.bytes_per_param
119 |                 / self.tp_size
120 |             ) * BYTES_FP16
121 |             mac = (self.llm_memory.count_mac_per_layer_mlp(bs, seq_len)[1] / self.tp_size)
122 |             memory = weight_memory + mac
123 | 
124 |         elif ops_type == "rmsnorm":
125 |             # Two RMSNorm operations (pre‑attention & pre‑MLP) share the same
126 |             # vector weight, replicated across TP ranks.
127 |             weight_memory = 2 * self.llm_params.count_params_per_layer_norm() * BYTES_FP16
128 |             flops = self.llm_flops.count_flops_per_layer_norm(bs, seq_len)
129 |             mac = self.llm_memory.count_mac_per_layer_norm(bs, seq_len)[1]
130 |             memory = weight_memory + mac
131 |         else:
132 |             raise ValueError(f"Unsupported ops_type: {ops_type}")
133 | 
134 |         compute_latency = flops / (self.gpu_TFLOPS)  # 单位秒
135 |         memory_latency = memory / (self.gpu_hbm_bandwidth)
136 | 
137 |         if print_bound:
138 |             self.print_kernel_bound_info(stage, memory_latency, compute_latency, ops_type)
139 | 
140 |         return max(compute_latency, memory_latency)
141 | 
142 |     def count_latency_per_layer_tp_comm(self, bs: int, seq_len: int) -> float:
143 |         """Count the latency of a single allreduce communication across the
144 |         tensor parallel group in the forward pass of a transformer layer.
145 |         The latency is the max of the latency for the allreduce and the minimum
146 |         message latency through intra-node connect.
147 |         """
148 | 
149 |         if self.tp_size == 1:
150 |             return 0
151 | 
152 |         # 一次 AllReduce 产生的通讯量为 \phi = 2bsh
153 |         # Self-Attention 和 MLP 部分的计算各需要进行一次 All-Reduce 操作, 即每层做 2 次 All-Reduce操作
154 |         # if tp_size is large enough num_data_per_all_reduce can be 4bsh
155 |         num_data_per_all_reduce = (
156 |             6 * bs * seq_len * self.h * (self.tp_size - 1) / (self.tp_size)
157 |         )
158 | 
159 |         latency_per_layer_tp_comm = (
160 |             num_data_per_all_reduce
161 |             * self.bytes_per_param
162 |             / self.gpu_intra_node_bandwidth
163 |         )
164 | 
165 |         # intra_node_min_message_latency: 节点内连接的最小消息延迟
166 |         return max(
167 |             latency_per_layer_tp_comm,
168 |             self.gpu_config.intra_node_min_message_latency,
169 |         )
170 | 
171 |     def count_latency_per_layer(
172 |         self,
173 |         bs: int,
174 |         seq_len: int,
175 |         generate_len: int = 0,
176 |         flash_attn=False,
177 |         kv_cache_bytes: int = BYTES_FP16,
178 |     ) -> tuple:
179 |         kernel_latency_per_layer = 0.0
180 |         dict_latency_per_layer = dict()
181 |         ops_list = ["qkvo_proj", "attn_kernel", "mlp", "rmsnorm"]
182 | 
183 |         for ops_name in ops_list:
184 |             kernel_latency = self.common_count_latency_for_ops(
185 |                 bs, seq_len, generate_len, ops_name,
186 |             )
187 |             dict_latency_per_layer[ops_name] = kernel_latency
188 |             kernel_latency_per_layer += kernel_latency
189 | 
190 |         latency_per_layer_tp_comm = self.count_latency_per_layer_tp_comm(bs, seq_len)
191 |         kv_cache_latency = self.count_latency_kv_cache_per_layer(
192 |             bs, seq_len, generate_len, flash_attn, kv_cache_bytes
193 |         )
194 | 
195 |         latency_per_layer = (
196 |             kernel_latency_per_layer
197 |             + latency_per_layer_tp_comm
198 |             + kv_cache_latency
199 |         )
200 | 
201 |         dict_latency_per_layer["tp_comm"] = latency_per_layer_tp_comm
202 |         dict_latency_per_layer["kv_cache_rw"] = kv_cache_latency
203 | 
204 |         return latency_per_layer, dict_latency_per_layer
205 | 
206 |     def count_latency_input_embedding(self, bs: int, seq_len: int) -> float:
207 |         """Get the latency for the forward pass of the input embedding layer,
208 |         given the batch size, sequence length, and data type of the embedding
209 |         weight.
210 | 
211 |         Args:
212 |             bs (int): batch size
213 |             seq_len (int): sequence length
214 | 
215 |         Returns:
216 |             float: the latency in seconds for the forward pass of the input embedding layer
217 |         """
218 |         memory_latency = (
219 |             self.model_config.vocab_size
220 |             * self.model_config.hidden_size
221 |             * self.bytes_per_param
222 |             / (self.gpu_hbm_bandwidth)
223 |         )
224 |         comm_latency = self.count_latency_per_layer_tp_comm(bs, seq_len)
225 |         return memory_latency + comm_latency
226 | 
227 |     def count_latency_output_embedding(self, bs: int, seq_len: int) -> float:
228 |         """Get the latency for the forward pass of the output embedding layer (computing the logits). 
229 |         The operation is compute bound. With tensor parallelism size > 1, 
230 |         an allgather communicates `bs * seq_len` elements, 
231 |         which is ignored here. Refer to https://arxiv.org/abs/1909.08053 for more details.
232 | 
233 |         Args:
234 |             bs (int): batch size
235 |             seq_len (int): sequence length
236 |         """
237 | 
238 |         compute_latency = (
239 |             2 * bs * seq_len * self.h * self.V / self.tp_size / self.gpu_TFLOPS
240 |         )
241 | 
242 |         return compute_latency
243 | 
244 |     def count_latency_kv_cache_per_layer(
245 |         self,
246 |         bs: int,
247 |         seq_len: int,
248 |         generate_len: int,
249 |         flash_attn: bool = False,
250 |         kv_cache_bytes: int = BYTES_FP16,
251 |     ) -> tuple:
252 |         """Get the latency for the forward pass of the key and value cache in a transformer layer,
253 |         given the batch size, sequence length, and whether the key and value cache is used.
254 | 
255 |         Args:
256 |             bs (int): batch size
257 |             seq_len (int): sequence length
258 |             generate_len (int): number of tokens to generate
259 |         """
260 |         kv_cache_mac = (
261 |             self.llm_memory.count_mac_per_layer_kv_cache(
262 |                 bs, seq_len, generate_len, flash_attn, kv_cache_bytes
263 |             )
264 |             / self.tp_size
265 |         )
266 | 
267 |         memory_latency = kv_cache_mac / (self.gpu_hbm_bandwidth)
268 | 
269 |         return memory_latency
270 | 
271 |     def count_latency_model(
272 |         self,
273 |         bs: int,
274 |         seq_len: int,
275 |         generate_len: int,
276 |         flash_attn: bool = False,
277 |         kv_cache_bytes: int = BYTES_FP16,
278 |         breakdown_prefix: str = "",
279 |     ) -> tuple:
280 |         latency_per_layer, breakdown_per_layer = self.count_latency_per_layer(
281 |             bs,
282 |             seq_len,
283 |             generate_len,
284 |             flash_attn,
285 |             kv_cache_bytes,
286 |         )
287 |         num_layers_per_gpu = self.num_layers_per_gpu
288 | 
289 |         latency_all_layers = latency_per_layer * self.num_layers_per_gpu
290 |         latency_input_embedding = self.count_latency_input_embedding(bs, seq_len)
291 |         latency_output_embedding = self.count_latency_output_embedding(bs, seq_len)
292 | 
293 |         model_latency = (
294 |             latency_all_layers + latency_input_embedding + latency_output_embedding
295 |         )
296 | 
297 |         model_latency_breakdown = {
298 |             breakdown_prefix + "qkvo_proj": (
299 |                 breakdown_per_layer["qkvo_proj"] * num_layers_per_gpu
300 |             ),
301 |             breakdown_prefix + "attn_kernel": (
302 |                 breakdown_per_layer["attn_kernel"] * num_layers_per_gpu
303 |             ),
304 |             breakdown_prefix + "mlp": (breakdown_per_layer["mlp"] * num_layers_per_gpu),
305 |             breakdown_prefix + "rmsnorm": (
306 |                 breakdown_per_layer["rmsnorm"] * num_layers_per_gpu
307 |             ),
308 |             breakdown_prefix + "tp_comm": (
309 |                 breakdown_per_layer["tp_comm"] * num_layers_per_gpu
310 |             ),
311 |             breakdown_prefix + "kv_cache_rw": (
312 |                 breakdown_per_layer["kv_cache_rw"] * num_layers_per_gpu
313 |             ),
314 |         }
315 | 
316 |         return model_latency, model_latency_breakdown
317 | 
318 |     def count_latency(
319 |         self,
320 |         bs: int,
321 |         seq_len: int,
322 |         generate_len: int,
323 |         flash_attn: bool = False,
324 |         kv_cache_bytes: int = BYTES_FP16,
325 |     ) -> tuple:
326 |         # 1, 预填充阶段
327 |         prefill_latency, prefill_latency_breakdown = self.count_latency_model(
328 |             bs,
329 |             seq_len,
330 |             generate_len=0,
331 |             flash_attn=flash_attn,
332 |             kv_cache_bytes=kv_cache_bytes,
333 |             breakdown_prefix="prefill_",
334 |         )
335 | 
336 |         prefill_latency_breakdown.update(
337 |             {
338 |                 "TTFT": prefill_latency,
339 |             }
340 |         )
341 | 
342 |         # 2, 解码阶段
343 |         kv_cache_latency = self.count_latency_kv_cache_per_layer(
344 |             bs, seq_len, generate_len, flash_attn, kv_cache_bytes
345 |         ) * self.num_layers_per_gpu
346 | 
347 |         decode_model_latency, decode_latency_breakdown = self.count_latency_model(
348 |             bs,
349 |             1,
350 |             generate_len=generate_len,
351 |             flash_attn=flash_attn,
352 |             kv_cache_bytes=kv_cache_bytes,
353 |             breakdown_prefix="decode_",
354 |         )
355 | 
356 |         decode_latency = decode_model_latency + kv_cache_latency
357 | 
358 |         decode_latency_breakdown.update(
359 |             {
360 |                 "kv_cache_latency": kv_cache_latency,
361 |                 "TTOT": (decode_latency),
362 |             }
363 |         )
364 |         return prefill_latency_breakdown, decode_latency_breakdown
365 | 


--------------------------------------------------------------------------------
/llm_counts/count_memory.py:
--------------------------------------------------------------------------------
  1 | from .utils.config import LLMConfigs
  2 | from .utils.constants import BYTES_FP16
  3 | from .count_params import CountCausalLMParams
  4 | 
  5 | from functools import reduce
  6 | import operator as _op
  7 | 
  8 | def _B(*dims):
  9 |     """Utility: multiply arbitrary dimensions to get a byte count."""
 10 |     return reduce(_op.mul, dims, 1)
 11 | 
 12 | 
 13 | class CountCausalLMMemory(object):
 14 |     """Count memory of the model and layers."""
 15 | 
 16 |     def __init__(self, llm_configs: LLMConfigs) -> None:
 17 |         self.model_config = llm_configs.model_config
 18 |         self.model_type = self.model_config.model_type
 19 |         self.hidden_size = self.model_config.hidden_size
 20 |         self.intermediate_size = self.model_config.intermediate_size
 21 | 
 22 |         self.num_heads = self.model_config.num_heads
 23 |         self.num_kv_heads = self.model_config.num_kv_heads
 24 |         self.head_dim = self.model_config.head_dim
 25 |         self.num_layers = self.model_config.num_layers
 26 |         self.V = self.model_config.vocab_size
 27 | 
 28 |         self.bytes_per_param = llm_configs.inference_config.bytes_per_param
 29 |         self.act_dtype_bytes = BYTES_FP16
 30 | 
 31 |         self.tp_size = llm_configs.parallelism_config.tp_size
 32 |         self.pp_size = llm_configs.parallelism_config.pp_size
 33 |         self.num_layers_per_gpu = int(self.num_layers / self.pp_size)
 34 | 
 35 |         self.gpu_memory_in_GB = llm_configs.gpu_config.memory_GPU_in_GB * 10**9
 36 |         self.llm_params = CountCausalLMParams(self.model_config)
 37 | 
 38 |     def count_memory_weight_per_gpu(self, ):
 39 |         """Get the memory of the model weights"""
 40 |         params_model = self.llm_params.count_params_model()
 41 |         memory_weight_per_gpu = params_model * self.bytes_per_param / self.tp_size
 42 | 
 43 |         return memory_weight_per_gpu
 44 |     
 45 |     def count_mac_per_layer_attn_kernel(
 46 |         self,
 47 |         bs: int,
 48 |         seq_len,
 49 |         generate_len: int = 0,
 50 |         flash_attn: bool = False,
 51 |         kv_cache_bytes: int = BYTES_FP16,
 52 |     ):
 53 |         if self.model_type == "qwen3":
 54 |             norm_bytes = 2 * (
 55 |                 _B(self.head_dim, BYTES_FP16)                    # load γ
 56 |                 + 2 * _B(bs, seq_len, self.head_dim, BYTES_FP16) # load + store acts
 57 |             )
 58 |         else:
 59 |             norm_bytes = 0
 60 | 
 61 |         if not flash_attn:
 62 |             if seq_len != 1:
 63 |                 # dim changge: (bs, seq_len, hidden_size) -> (bs, seq_len, num_heads, head_dim)
 64 |                 # (bs, seq_len, num_heads, head_dim) -> (bs, num_heads, seq_len, head_dim)
 65 |                 # qk^t: (bs, num_heads, seq_len, head_dim) *  (bs, num_kv_heads, seq_len, head_sim) -> (bs, num_heads, seq_len, seq_len)
 66 |                 # sv: (bs, num_heads, seq_len, seq_len) * (bs, num_kv_heads, seq_len, head_dim) -> (bs, num_heads, seq_len, head_dim)
 67 | 
 68 |                 load_q_mem = bs * self.num_heads * seq_len * self.head_dim
 69 |                 load_k_mem = bs * self.num_kv_heads * seq_len * self.head_dim
 70 |                 qk_store_mem = bs * self.num_heads * seq_len * seq_len
 71 | 
 72 |                 load_softmax_mem = qk_store_mem
 73 |                 softmax_store_mem = bs * self.num_heads * seq_len * seq_len
 74 | 
 75 |                 load_s_mem = softmax_store_mem
 76 |                 load_v_mem = bs * self.num_kv_heads * seq_len * self.head_dim
 77 |                 sv_store_mem = bs * self.num_heads * seq_len * self.head_dim
 78 | 
 79 |                 self_atten_mac = (load_q_mem + load_k_mem + qk_store_mem
 80 |                                   + load_softmax_mem + softmax_store_mem
 81 |                                   + load_s_mem + load_v_mem + sv_store_mem)
 82 |                 max_act = max(load_q_mem, load_k_mem, qk_store_mem,
 83 |                               load_softmax_mem, softmax_store_mem,
 84 |                               load_s_mem, load_v_mem, sv_store_mem) * self.act_dtype_bytes
 85 |                 return max_act, self_atten_mac * kv_cache_bytes + norm_bytes
 86 | 
 87 |             else:
 88 |                 # dim changge: (bs, 1, hidden_size) -> (bs, 1, num_heads, head_dim)
 89 |                 # (bs, 1, num_heads, head_dim) -> (bs, num_heads, 1, head_dim)
 90 |                 # qk^t: (bs, num_heads, seq_len + generate_len, head_dim) *  (bs, num_kv_heads, seq_len + generate_len, head_sim) -> (bs, num_heads, seq_len + generate_len, seq_len + generate_len)
 91 |                 # sv: (bs, num_heads, seq_len + generate_len, seq_len + generate_len) * (bs, num_kv_heads, seq_len + generate_len, head_dim) -> (bs, num_heads, seq_len + generate_len, head_dim)
 92 | 
 93 |                 load_q_mem = bs * self.num_heads * 1  * self.head_dim
 94 |                 load_k_mem = bs * self.num_kv_heads * (seq_len + generate_len) * self.head_dim
 95 |                 qk_store_mem = bs * self.num_heads * (seq_len + generate_len) * (seq_len + generate_len)
 96 | 
 97 |                 load_softmax_mem = qk_store_mem
 98 |                 softmax_store_mem = bs * self.num_heads * (seq_len + generate_len) * (seq_len + generate_len)
 99 | 
100 |                 load_s_mem = softmax_store_mem
101 |                 load_v_mem = bs * self.num_kv_heads * (seq_len + generate_len) * self.head_dim
102 |                 sv_store_mem = bs * self.num_heads * (seq_len + generate_len) * self.head_dim
103 | 
104 |                 max_act = max(load_q_mem, load_k_mem, qk_store_mem,
105 |                               load_softmax_mem, softmax_store_mem,
106 |                               load_s_mem, load_v_mem, sv_store_mem) * self.act_dtype_bytes
107 |                 self_atten_mac = (load_q_mem + load_k_mem + qk_store_mem
108 |                                   + load_softmax_mem + softmax_store_mem
109 |                                   + load_s_mem + load_v_mem + sv_store_mem)
110 |                 
111 |                 return max_act, self_atten_mac * kv_cache_bytes + norm_bytes
112 |         
113 |     def count_mac_per_layer_kv_cache(
114 |         self,
115 |         bs,
116 |         seq_len,
117 |         generate_len: int = 0,
118 |         flash_attn: bool = False,
119 |         kv_cache_bytes: int = BYTES_FP16,
120 |     ):
121 |         if not flash_attn:
122 |             store_k_cache = (
123 |                 self.num_kv_heads * self.head_dim * bs * seq_len * kv_cache_bytes
124 |             )
125 |             store_v_cache = (
126 |                 self.num_kv_heads * self.head_dim * bs * seq_len * kv_cache_bytes
127 |             )
128 |             if seq_len != 1:
129 |                 return store_k_cache + store_v_cache
130 |             else:
131 |                 qk_matmul_load_k_cache = (
132 |                     (seq_len + generate_len)
133 |                     * self.head_dim
134 |                     * bs
135 |                     * self.num_kv_heads
136 |                     * kv_cache_bytes
137 |                 )
138 |                 sv_matmul_load_v_cache = (
139 |                     (seq_len + generate_len)
140 |                     * self.head_dim
141 |                     * bs
142 |                     * self.num_kv_heads
143 |                     * kv_cache_bytes
144 |                 )
145 |             
146 |                 kv_cache_mac = (
147 |                     qk_matmul_load_k_cache
148 |                     + sv_matmul_load_v_cache
149 |                     + store_k_cache
150 |                     + store_v_cache
151 |                 )
152 |                 return kv_cache_mac
153 |         else:
154 |             # FlashAttention path: compute attention on‑the‑fly; only new K/V cache entries are stored
155 |             kv_cache_mac = (
156 |                 self.num_kv_heads
157 |                 * self.head_dim
158 |                 * bs
159 |                 * seq_len
160 |                 * 2  # K + V
161 |                 * kv_cache_bytes
162 |             )
163 | 
164 |         return kv_cache_mac
165 | 
166 |     def count_mac_per_layer_qkvo_proj(
167 |         self,
168 |         bs: int,
169 |         seq_len: int,
170 |         qkvo_weight_dtype_bytes=BYTES_FP16,
171 |     ) -> int:
172 |         """
173 |         Count memory access cost for Q/K/V/O projection layers.
174 |         """
175 |         atten_linear_layers = {
176 |             "q_proj": [self.hidden_size, self.num_heads * self.head_dim],
177 |             "k_proj": [self.hidden_size, self.num_kv_heads * self.head_dim],
178 |             "v_proj": [self.hidden_size, self.num_kv_heads * self.head_dim],
179 |             "out_proj": [self.num_heads * self.head_dim, self.hidden_size],
180 |         }
181 | 
182 |         atten_linear_layers_mac = 0
183 |         max_act = 0
184 |         for name, (in_ch, out_ch) in atten_linear_layers.items():
185 |             is_kv_proj = name in ["k_proj", "v_proj"]
186 |             is_normal_proj = not is_kv_proj
187 | 
188 |             load_weight = in_ch * out_ch
189 |             load_act = in_ch * bs * seq_len
190 |             store_act = 0 if is_kv_proj else bs * seq_len * out_ch
191 |             load_kv_cache = 0
192 |             store_kv_cache = 0 if is_normal_proj else out_ch * bs * seq_len
193 | 
194 |             max_act = max(max_act, load_weight, load_act, store_act, store_kv_cache) 
195 | 
196 |             mac = load_weight + load_act + store_act + load_kv_cache + store_kv_cache
197 |             atten_linear_layers_mac += mac
198 |         
199 |         return max_act * self.act_dtype_bytes, atten_linear_layers_mac * qkvo_weight_dtype_bytes
200 | 
201 |     def count_mac_per_layer_mlp(
202 |         self,
203 |         bs: int,
204 |         seq_len: int,
205 |         mlp_weight_dtype_bytes=BYTES_FP16,
206 |     ) -> float:
207 |         """The `mlp` acts include the input to the two linear layers.
208 |         Refer to https://arxiv.org/abs/2205.05198 for details.
209 |         The two linear layers store their inputs with size 2bsh and 8bsh
210 |         """
211 |         mlp_linear_layers = {
212 |             "gate_proj": [self.hidden_size, self.intermediate_size],
213 |             "up_proj": [self.hidden_size, self.intermediate_size],
214 |             "down_proj": [self.intermediate_size, self.hidden_size],
215 |         }
216 | 
217 |         mlp_linear_layers_mac = 0
218 |         max_act = 0
219 |         for _, (in_ch, out_ch) in mlp_linear_layers.items():
220 |             load_weight = in_ch * out_ch
221 |             load_act = in_ch * bs * seq_len
222 |             store_act = bs * seq_len * out_ch
223 | 
224 |             max_act = max(max_act, load_weight, load_act, store_act) 
225 |             mac = load_weight + load_act + store_act
226 |             mlp_linear_layers_mac += mac
227 |         
228 |         return max_act * self.act_dtype_bytes, mlp_linear_layers_mac * mlp_weight_dtype_bytes
229 | 
230 |     def count_mac_per_layer_norm(
231 |         self,
232 |         bs: int,
233 |         seq_len: int,
234 |     ) -> float:
235 |         """Get the memory (in bytes) required to store the acts of a single layernorm in a transformer layer."""
236 |         rmsnorm_load_weight = self.hidden_size * self.act_dtype_bytes
237 |         rmsnorm_load_act = bs * seq_len * self.hidden_size * self.act_dtype_bytes
238 |         rmsnorm_store_act = bs * seq_len * self.hidden_size * self.act_dtype_bytes
239 | 
240 |         norm_mac_per_gpu = (
241 |             rmsnorm_load_weight + rmsnorm_load_act + rmsnorm_store_act
242 |         )
243 |         max_act = max(rmsnorm_load_weight, rmsnorm_load_act, rmsnorm_store_act) * self.act_dtype_bytes
244 |         return max_act, norm_mac_per_gpu
245 | 
246 |     def count_mac_input_embedding(self, bs: int, seq_len: int) -> float:
247 |         input_embedding_load_act = bs * seq_len * self.act_dtype_bytes
248 |         input_embedding_store_act = (
249 |             bs * seq_len * self.hidden_size * self.act_dtype_bytes
250 |         )
251 |         input_embedding_mac_per_gpu = (
252 |             input_embedding_load_act + input_embedding_store_act
253 |         )
254 | 
255 |         return input_embedding_mac_per_gpu
256 | 
257 |     def count_memory_kv_cache_per_layer(
258 |         self,
259 |         bs: int,
260 |         seq_len: int,
261 |         generate_len: int,
262 |         kv_cache_bytes: int = BYTES_FP16,
263 |     ) -> float:
264 |         """Get the memory (in bytes) required to store the key and value cache
265 |         for a transformer layer in inference, given the batch size, sequence
266 |         length, act data type, and tensor parallelism size.
267 | 
268 |         memory_kv_cache = 4blh(s+o) unit is byte
269 |         Args:
270 |             bs (int): batch size
271 |             context_len (int): seq_len + generate_len
272 | 
273 |         Returns:
274 |             float: the memory (in bytes) required  to store the key and value cache 
275 |             for a transformer layer in inference.
276 |         """
277 | 
278 |         # At least on attention head on each tensor-parallel GPU
279 |         num_kv_heads_per_gpu = max(self.num_kv_heads, 1)
280 |         memory_kv_cache_per_layer = (
281 |             bs
282 |             * (seq_len + generate_len)
283 |             * num_kv_heads_per_gpu
284 |             * self.head_dim
285 |             * 2
286 |             * kv_cache_bytes
287 |         )
288 | 
289 |         return memory_kv_cache_per_layer
290 | 
291 |     def count_max_act_per_layer(
292 |         self,
293 |         bs: int,
294 |         seq_len_ctx: int,
295 |         generate_len: int = 0,  # used only for decode stage
296 |         *,
297 |         stage: str = "prefill",        # "prefill" | "decode"
298 |         flash_attn: bool = False,
299 |         qkvo_weight_dtype_bytes: int = BYTES_FP16,
300 |         mlp_weight_dtype_bytes: int = BYTES_FP16,
301 |     ) -> float:
302 |         assert stage in {"prefill", "decode"}
303 | 
304 |         # For decode stage each step handles just **one token**.
305 |         tokens = 1 if stage == "decode" else seq_len_ctx
306 | 
307 |         act_per_layer_self_atten, _ = self.count_mac_per_layer_attn_kernel(
308 |             bs,
309 |             tokens,
310 |             generate_len=generate_len,
311 |             flash_attn=flash_attn,
312 |             kv_cache_bytes=qkvo_weight_dtype_bytes,
313 |         )
314 |         act_per_layer_qkvo_proj, _ = self.count_mac_per_layer_qkvo_proj(
315 |             bs,
316 |             tokens,
317 |             qkvo_weight_dtype_bytes=qkvo_weight_dtype_bytes,
318 |         )
319 |         act_per_layer_mlp, _ = self.count_mac_per_layer_mlp(
320 |                 bs,
321 |                 tokens,
322 |                 mlp_weight_dtype_bytes=mlp_weight_dtype_bytes,
323 |             )
324 |         act_per_layer_rn, _ = self.count_mac_per_layer_norm(bs, tokens) 
325 | 
326 |         act_per_layer = max(act_per_layer_self_atten, act_per_layer_qkvo_proj, act_per_layer_mlp, act_per_layer_rn)
327 | 
328 |         return act_per_layer
329 |     
330 |     def count_memory_per_gpu(
331 |         self,
332 |         bs: int,
333 |         seq_len: int,
334 |         generate_len: int,
335 |         flash_attn: bool = True,
336 |         qkvo_weight_dtype_bytes: int = BYTES_FP16,
337 |         mlp_weight_dtype_bytes=BYTES_FP16,
338 |         kv_cache_bytes: int = BYTES_FP16,
339 |     ) -> tuple:
340 |         # 1, prefill stage count memory and max_bs
341 |         weight_memory_per_gpu = self.count_memory_weight_per_gpu()  # count model weights memory
342 |         memory_left_per_gpu = self.gpu_memory_in_GB - weight_memory_per_gpu
343 | 
344 |         # --- 1) PREFILL stage ----------------------------------------- #
345 |         prefill_act_bs_1 = self.count_max_act_per_layer(
346 |             1,
347 |             seq_len,
348 |             generate_len=generate_len,
349 |             stage="prefill",
350 |             flash_attn=flash_attn,
351 |             qkvo_weight_dtype_bytes=qkvo_weight_dtype_bytes,
352 |             mlp_weight_dtype_bytes=mlp_weight_dtype_bytes,
353 |         ) // self.tp_size
354 | 
355 |         prefill_max_bs = int(memory_left_per_gpu / prefill_act_bs_1)
356 |         prefill_act_per_gpu = bs * prefill_act_bs_1
357 | 
358 |         # --- 2) DECODE stage ------------------------------------------ #
359 |         kv_cache_memory_bs_1_per_gpu = (self.count_memory_kv_cache_per_layer(1, seq_len, generate_len, kv_cache_bytes) * self.num_layers_per_gpu) / self.tp_size
360 |         decode_act_bs_1_per_gpu = self.count_max_act_per_layer(
361 |             1,
362 |             seq_len,
363 |             generate_len=generate_len,
364 |             stage="decode",
365 |             flash_attn=flash_attn,
366 |             qkvo_weight_dtype_bytes=qkvo_weight_dtype_bytes,
367 |             mlp_weight_dtype_bytes=mlp_weight_dtype_bytes,
368 |         ) // self.tp_size
369 |         decode_max_bs = memory_left_per_gpu // (decode_act_bs_1_per_gpu + kv_cache_memory_bs_1_per_gpu)
370 | 
371 |         kv_cache_memory_per_gpu = bs * kv_cache_memory_bs_1_per_gpu
372 |         decode_act_per_gpu = decode_act_bs_1_per_gpu * bs
373 |         max_batch_total_tokens = decode_max_bs * (seq_len + generate_len)
374 | 
375 |         assert bs <= decode_max_bs, (
376 |             f"For context length: {seq_len + generate_len}, bs {bs} is too large to fit"
377 |             " in GPU memory, decode_max_bs:"
378 |             f" {decode_max_bs}"
379 |         )
380 | 
381 |         assert memory_left_per_gpu > (
382 |             kv_cache_memory_per_gpu + decode_act_per_gpu
383 |         ), (
384 |             "kv_cache and act memory with bs ="
385 |             f" {bs} is too large to fit in GPU memory"
386 |         )
387 |         
388 |         consume_memory_per_gpu = (
389 |             weight_memory_per_gpu + decode_act_per_gpu + kv_cache_memory_per_gpu
390 |         )
391 | 
392 |         # memory summary
393 |         memory_prefill_summary_dict = {
394 |             "weight_memory_per_gpu": weight_memory_per_gpu,
395 |             "prefill_max_bs": prefill_max_bs,
396 |             "prefill_act_per_gpu": prefill_act_per_gpu,
397 |         }
398 | 
399 |         memory_decode_summary_dict = {
400 |             "decode_act_per_gpu": decode_act_per_gpu,
401 |             "kv_cache_memory_per_gpu": kv_cache_memory_per_gpu,
402 |             "consume_memory_per_gpu": consume_memory_per_gpu,
403 |             "decode_max_bs": decode_max_bs,
404 |             "max_batch_total_tokens": int(max_batch_total_tokens * 0.97),
405 |         }
406 | 
407 |         return memory_prefill_summary_dict, memory_decode_summary_dict
408 | 


--------------------------------------------------------------------------------
/llm_counts/count_params.py:
--------------------------------------------------------------------------------
  1 | from .utils.config import ModelConfig
  2 | from .utils.constants import *
  3 | 
  4 | 
  5 | class CountCausalLMParams(object):
  6 |     def __init__(self, model_config: ModelConfig) -> None:
  7 |         self.hidden_size = model_config.hidden_size
  8 |         self.intermediate_size = model_config.intermediate_size
  9 |         self.num_layers = model_config.num_layers
 10 |         self.V = model_config.vocab_size
 11 | 
 12 |         self.num_heads = model_config.num_heads
 13 |         self.num_kv_heads = model_config.num_kv_heads
 14 |         self.head_dim = model_config.head_dim
 15 |         self.model_type = model_config.model_type
 16 | 
 17 |     def count_params_embedding(self, shared_embedding: bool = True) -> int:
 18 |         """Get the number of parameters in the embedding layer. 
 19 |         params_te = vocab_size * d_model
 20 |         Args:
 21 |             shared_embedding (bool, optional):  whether the output embedding 
 22 |             shares weights with the input embedding. Defaults to True.
 23 | 
 24 |         Returns: 
 25 |             int: the number of parameters in the embedding layer.
 26 |         """
 27 |         num_params_input_embedding = self.V * self.hidden_size
 28 |         num_params_output_embedding = (
 29 |             self.V * self.hidden_size if not shared_embedding else 0
 30 |         )
 31 | 
 32 |         return num_params_input_embedding + num_params_output_embedding
 33 | 
 34 |     def count_params_per_layer_mha(self) -> int:
 35 |         """Get the number of parameters per layer in the attention module
 36 |         which include 4 linear layer: q/k/v/o linear layers.
 37 | 
 38 |         Returns:
 39 |             int: the number of parameters per layer in the attention module(mha)
 40 |         """
 41 |         params_qo_proj = 2 * self.hidden_size * self.num_heads * self.head_dim
 42 |         params_kv_proj = 2 * self.hidden_size * self.num_kv_heads * self.head_dim
 43 |         return params_qo_proj + params_kv_proj
 44 | 
 45 |     def count_params_per_layer_mlp(self) -> int:
 46 |         """Get the number of parameters in the MLP linear layers, including the
 47 |         intermediate and output matrices.
 48 |         params_mlp = params_gate_proj + params_up_proj + params_down_proj
 49 |         Returns:
 50 |             int: the number of parameters in the two MLP linear layers
 51 |         """
 52 |         params_gate_proj = self.hidden_size * self.intermediate_size
 53 |         params_up_proj = self.hidden_size * self.intermediate_size
 54 |         params_down_proj = self.intermediate_size * self.hidden_size
 55 |         params_mlp = params_gate_proj + params_up_proj + params_down_proj
 56 | 
 57 |         return params_mlp
 58 | 
 59 |     def count_params_per_layer_norm(self) -> int:
 60 |         """Get the number of atten_norm and mlp_norm parameters per layer.
 61 |         """
 62 |         # q_norm、k_norm、atten_norm、mlp_norm
 63 |         if self.model_type == "qwen3":
 64 |             return 2 * self.hidden_size + 2 * self.head_dim
 65 |         else:
 66 |             return 2 * self.hidden_size
 67 | 
 68 |     def count_params_per_layer(self, norm_ignore=False) -> tuple:
 69 |         """Get the number of params per layer mainly including the attention and MLP layers.
 70 | 
 71 |         params_per_layer = params_mha + params_mlp + params_norm
 72 | 
 73 |         """
 74 |         params_per_layer_mha = self.count_params_per_layer_mha()
 75 |         params_per_layer_mlp = self.count_params_per_layer_mlp()
 76 |         params_per_layer_norm = 0 if norm_ignore else self.count_params_per_layer_norm()
 77 |         params_input_embedding = self.count_params_embedding()
 78 | 
 79 |         params_per_layer = (
 80 |             params_per_layer_mha + params_per_layer_mlp + params_per_layer_norm
 81 |         )
 82 | 
 83 |         dict_params_per_layer = {
 84 |             "qkvo_proj": params_per_layer_mha,
 85 |             "mlp": params_per_layer_mlp,
 86 |             "rmsnorm": params_per_layer_norm,
 87 |             "input_embedding": params_input_embedding,
 88 |             "output_embedding": params_input_embedding,
 89 |         }
 90 | 
 91 |         return params_per_layer, dict_params_per_layer
 92 | 
 93 |     def count_params_model(self) -> int:
 94 |         """Get the total number of parameters in the model 
 95 |         including all layers and token embedding layer.
 96 |         params_model = params_embedding + params_per_layer * num_layers
 97 |                     = V * d_model + 12 * d_model**2 * num_layers
 98 |         Returns:
 99 |             int: the total number of parameters in the model
100 |         """
101 |         params_per_layer, _ = self.count_params_per_layer()
102 |         params_model = (
103 |             params_per_layer * self.num_layers + self.count_params_embedding()
104 |         )
105 | 
106 |         return params_model


--------------------------------------------------------------------------------
/llm_counts/layer_graph_visualizer.py:
--------------------------------------------------------------------------------
  1 | """
  2 | cli entry point for LayerAnalyzer, which analyzes the memory access and FLOPs of a model.
  3 | Usage:
  4 |     ```bash
  5 |     python -m llm_counts.llm_analyzer \
  6 |     --result-json path/to/results.json \
  7 |     --model-type qwen3 \
  8 |     --output my_layer_graph
  9 | ```
 10 | """
 11 | from .utils.constants import BYTES_FP16
 12 | from .utils.config import *
 13 | from .utils.utils import num_to_string
 14 | from .roofline_model import roofline_analysis
 15 | 
 16 | 
 17 | class LayerAnalyzer(object):
 18 |     """Count memory access of the model and layers."""
 19 | 
 20 |     def __init__(self, model_config,  gpu_config, tp_size) -> None:
 21 |         self.tp_size = tp_size
 22 |         self.bandwidth, self.onchip_buffer = get_gpu_hbm_bandwidth(gpu_config) # GB/s
 23 |         self.bandwidth *= 10**9 
 24 |         self.gpu_max_ops = get_TFLOPS_per_gpu(gpu_config, data_type="fp16") * 10**12  # TFLOPs
 25 |     
 26 |         self.model_type = model_config.model_type
 27 |         self.hidden_size = model_config.hidden_size
 28 |         self.intermediate_size = model_config.intermediate_size
 29 |         self.num_heads = model_config.num_heads
 30 |         self.num_kv_heads = model_config.num_kv_heads
 31 |         self.head_dim = self.hidden_size // self.num_heads
 32 | 
 33 |         # attention linear layers
 34 |         self.linear_layers = {
 35 |             "q_proj": [self.hidden_size, self.num_heads * self.head_dim],
 36 |             "k_proj": [self.hidden_size, self.num_kv_heads * self.head_dim],
 37 |             "v_proj": [self.hidden_size, self.num_kv_heads * self.head_dim],
 38 |             "out_proj": [self.num_heads * self.head_dim, self.hidden_size],
 39 |             
 40 |             "gate_proj": [self.hidden_size, self.intermediate_size],
 41 |             "up_proj": [self.hidden_size, self.intermediate_size],
 42 |             "down_proj": [self.intermediate_size, self.hidden_size],
 43 |         }
 44 | 
 45 |         self.results = {"decode": {}, "prefill": {}}
 46 | 
 47 |     def _analyze_to_results(
 48 |         self,
 49 |         stage,
 50 |         kernel_name,
 51 |         flops,
 52 |         load_weight,
 53 |         load_act,
 54 |         store_act,
 55 |         load_kv_cache,
 56 |         store_kv_cache,
 57 |         data_type="fp16"
 58 |     ):
 59 |         memory_access = (load_weight + load_act + store_act  + load_kv_cache + store_kv_cache)
 60 |         a_intensity, att_flops, bound = roofline_analysis(self.gpu_max_ops, 
 61 |                                                           self.bandwidth, 
 62 |                                                           flops, memory_access) # Arithmetic Intensity
 63 | 
 64 |         self.results[stage][kernel_name] = {
 65 |             "flops": num_to_string(flops),
 66 |             "memory_access": f"{num_to_string(memory_access)}B",
 67 |             "arithmetic_intensity": int(a_intensity),
 68 |             "att_flops": num_to_string(att_flops),
 69 |             "bound": bound,
 70 |             "load_weight": f"{num_to_string(load_weight)}B",
 71 |             "load_act": num_to_string(load_act),
 72 |             "store_act": num_to_string(store_act),
 73 |             "load_kv_cache": num_to_string(load_kv_cache),
 74 |             "store_kv_cache": num_to_string(store_kv_cache),
 75 |         }
 76 | 
 77 |         return self.results
 78 | 
 79 |     def analyze_linear_layers(
 80 |         self, 
 81 |         bs: int,
 82 |         seq_len: int,
 83 |         linear_weight_bytes: int = BYTES_FP16,
 84 |         act_byte: int = BYTES_FP16,
 85 |         kv_byte: int = BYTES_FP16,
 86 |     ):
 87 |         """
 88 |         Count and save the FLOPs and memory access of self-attention layers.
 89 |         This function is used to analyze the self-attention layers in the model.
 90 |         """
 91 |         # 1. attention linear layers analysis
 92 |         for name, (in_ch, out_ch) in self.linear_layers.items():
 93 |             is_kv_proj = name in ["k_proj", "v_proj"]
 94 |             is_normal_proj = not is_kv_proj
 95 | 
 96 |             self._analyze_to_results(
 97 |                 "prefill",
 98 |                 name,
 99 |                 flops=2 * bs * seq_len * in_ch * out_ch // self.tp_size,
100 |                 load_weight=in_ch * out_ch * linear_weight_bytes // self.tp_size,
101 |                 load_act=in_ch * bs * seq_len * act_byte // self.tp_size,
102 |                 store_act=0 if is_kv_proj else  bs * seq_len * out_ch * act_byte // self.tp_size,
103 |                 load_kv_cache=0,
104 |                 store_kv_cache=(0 if is_normal_proj else out_ch * bs * seq_len * kv_byte) // self.tp_size
105 |             )  
106 |             self._analyze_to_results(
107 |                 "decode",
108 |                 name,
109 |                 flops=2 * bs * in_ch * out_ch // self.tp_size,
110 |                 load_weight=in_ch * out_ch * linear_weight_bytes // self.tp_size,
111 |                 load_act=in_ch * bs * act_byte // self.tp_size,
112 |                 store_act=0 if is_kv_proj else out_ch * bs * act_byte // self.tp_size,
113 |                 load_kv_cache=0,
114 |                 store_kv_cache=(0 if is_normal_proj else out_ch * bs * kv_byte) // self.tp_size,
115 |             )
116 |     
117 |     def analyze_self_atten_kernel(
118 |         self, 
119 |         bs: int,
120 |         seq_len: int,
121 |         generate_len: int,
122 |         num_kv_heads: int,
123 |         num_heads: int,
124 |         head_dim: int,
125 |         flash_attn: bool = False,
126 |         act_byte: int = BYTES_FP16,
127 |         kv_byte: int = BYTES_FP16,
128 |     ):
129 |         """
130 |         Count and save the FLOPs and memory access of self-attention kernels.
131 |         This function is used to analyze the self-attention kernels in the model.
132 |         """
133 |         hidden_size = num_heads * head_dim
134 |         if not flash_attn:
135 |             ##########################prefill stage##########################
136 |             # 1, qkt kernel analysis
137 |             name = "qk_matmul"
138 |             load_q_mem = bs * self.num_heads * seq_len * self.head_dim
139 |             load_k_mem = bs * self.num_kv_heads * seq_len * self.head_dim
140 |             qk_store_mem = bs * self.num_heads * seq_len * seq_len
141 |             self._analyze_to_results(
142 |                 "prefill",
143 |                 name,
144 |                 flops=2 * seq_len * seq_len * self.head_dim * bs * self.num_heads,
145 |                 load_weight=0,
146 |                 load_act=(load_q_mem + load_k_mem) * act_byte, # load q and k act, shape is [s, h]
147 |                 store_act=qk_store_mem * act_byte,
148 |                 load_kv_cache=0,
149 |                 store_kv_cache=0,
150 |             )
151 |             # 2, softmax kernel analysis
152 |             name = f"softmax"
153 |             load_softmax_mem = qk_store_mem
154 |             softmax_store_mem = bs * self.num_heads * seq_len * seq_len
155 |             self._analyze_to_results(
156 |                 "prefill",
157 |                 name,
158 |                 flops= (bs * num_heads * seq_len * seq_len * 1 * 5),
159 |                 load_weight=0,
160 |                 load_act=load_softmax_mem * act_byte,
161 |                 store_act=softmax_store_mem * act_byte,
162 |                 load_kv_cache=0,
163 |                 store_kv_cache=0,
164 |             )
165 |             # 3, sv kernel analysis
166 |             name = "sv_matmul"
167 |             load_s_mem = softmax_store_mem
168 |             load_v_mem = bs * self.num_kv_heads * seq_len * self.head_dim
169 |             sv_store_mem = bs * self.num_heads * seq_len * self.head_dim
170 |             self._analyze_to_results(
171 |                 "prefill",
172 |                 name,
173 |                 flops=bs * 2 * seq_len * seq_len * head_dim * num_heads,
174 |                 load_weight=0,
175 |                 load_act=load_s_mem * act_byte, # load score(qkt) act, shape is [s, s]
176 |                 store_act=sv_store_mem * act_byte,
177 |                 load_kv_cache=load_v_mem,
178 |                 store_kv_cache=0,
179 |             )
180 |             ##########################decode stage##########################
181 |             name = "qk_matmul"
182 |             # load q and k, k is form kv cache
183 |             qk_matmul_flops = 2 * self.num_heads * self.head_dim * (seq_len + generate_len)
184 |             load_q_mem = bs * self.num_heads * 1  * self.head_dim
185 |             load_k_mem = bs * self.num_kv_heads * (seq_len + generate_len) * self.head_dim
186 |             qk_store_mem = bs * self.num_heads * (seq_len + generate_len) * (seq_len + generate_len)
187 |             self._analyze_to_results(
188 |                 "decode",
189 |                 name,
190 |                 flops=qk_matmul_flops,
191 |                 load_weight=0,
192 |                 load_act=load_q_mem * act_byte,
193 |                 store_act=qk_store_mem * act_byte,
194 |                 load_kv_cache=load_k_mem * kv_byte,
195 |                 store_kv_cache=0,
196 |             )
197 |             # 2, softmax kernel analysis
198 |             name = f"softmax"
199 |             load_softmax_mem = qk_store_mem
200 |             softmax_store_mem = bs * self.num_heads * (seq_len + generate_len) * (seq_len + generate_len)
201 |             self._analyze_to_results(
202 |                 "decode",
203 |                 name,
204 |                 flops= (bs * num_heads * seq_len * seq_len * 1 * 5),
205 |                 load_weight=0,
206 |                 load_act=load_softmax_mem * act_byte,
207 |                 store_act=softmax_store_mem * act_byte,
208 |                 load_kv_cache=0,
209 |                 store_kv_cache=0,
210 |             )
211 |             # 3, sv kernel analysis
212 |             name = "sv_matmul"
213 |             load_s_mem = softmax_store_mem
214 |             load_v_mem = bs * self.num_kv_heads * (seq_len + generate_len) * self.head_dim
215 |             sv_store_mem = bs * self.num_heads * (seq_len + generate_len) * self.head_dim
216 |             self._analyze_to_results(
217 |                 "decode",
218 |                 name,
219 |                 flops=qk_matmul_flops,
220 |                 load_weight=0,
221 |                 load_act=load_s_mem * act_byte, # load score(qkt) act, shape is [s, s]
222 |                 store_act=sv_store_mem * act_byte,
223 |                 load_kv_cache=load_v_mem,
224 |                 store_kv_cache=0,
225 |             )
226 |         else:
227 |             name = f"fused_attention" # flash_attn2
228 |             qk_matmul_OPs = seq_len * seq_len * head_dim * num_heads * bs * 2
229 |             sv_matmul_OPs = seq_len * head_dim * seq_len * num_heads * bs * 2
230 |             softmax_OPs = bs * num_heads * seq_len * seq_len * 5
231 | 
232 |             block_size_r = min(math.ceil(self.onchip_buffer / (kv_byte * head_dim)), head_dim)
233 |             n_blocks_r = math.ceil(seq_len / block_size_r)
234 |             q_numel = seq_len * head_dim * bs * num_heads * act_byte
235 |             o_numel = seq_len * seq_len * bs * num_heads * act_byte
236 | 
237 |             self._analyze_to_results(
238 |                 "prefill",
239 |                 name,
240 |                 flops=qk_matmul_OPs + sv_matmul_OPs + softmax_OPs,
241 |                 load_weight=0,
242 |                 load_act=q_numel,
243 |                 store_act=o_numel * 2,  # initialize O and save O
244 |                 load_kv_cache=n_blocks_r * (seq_len) * head_dim * bs * num_kv_heads * kv_byte * 2,
245 |                 store_kv_cache=0,
246 |             )
247 | 
248 |             qk_matmul_OPs = seq_len * head_dim * num_heads * bs * 2
249 |             sv_matmul_OPs = 1 * head_dim * seq_len * num_heads * bs * 2
250 |             softmax_OPs = bs * num_heads * seq_len * 1 * 5
251 | 
252 |             n_blocks_r = math.ceil(1 / block_size_r)
253 |             q_numel = (1) * head_dim * bs * num_heads * act_byte
254 |             o_numel = 1 * seq_len * bs * num_heads * act_byte
255 |             self._analyze_to_results(
256 |                 "decode",
257 |                 name,
258 |                 OPs=qk_matmul_OPs + sv_matmul_OPs + softmax_OPs,
259 |                 load_weight=0,
260 |                 load_act=q_numel,
261 |                 store_act=o_numel * 2,  # initialize O and save O
262 |                 load_kv_cache=n_blocks_r * (seq_len) * head_dim * bs * num_kv_heads * kv_byte * 2,
263 |                 store_kv_cache=0,
264 |             )
265 | 
266 |         if self.model_type == "qwen3":
267 |             kernel_names = ["q_norm", "k_norm"]
268 |             # qwen3 模型中 rms_norm 计算中使用了一个额外的线性变换
269 |             q_norm_flops = bs * 4 * seq_len * self.head_dim
270 |             q_norm_load_weight = self.head_dim * BYTES_FP16
271 |             q_norm_load_act = bs * seq_len * self.head_dim * BYTES_FP16 # equal k_norm_load_act
272 |             q_norm_store_act = bs * seq_len * self.head_dim * BYTES_FP16
273 | 
274 |             # prefill/decode 阶段
275 |             for stage in ["prefill", "decode"]:
276 |                 if stage == "decode":
277 |                     q_norm_flops = int(q_norm_flops // seq_len)
278 |                     q_norm_load_act = int(q_norm_load_act // seq_len)
279 |                     q_norm_store_act = int(q_norm_store_act // seq_len)
280 | 
281 |                 for _, kernel_name in enumerate(kernel_names):
282 |                     self._analyze_to_results(
283 |                         stage,
284 |                         kernel_name,
285 |                         flops=q_norm_flops // self.tp_size,
286 |                         load_weight=q_norm_load_weight // self.tp_size,
287 |                         load_act=q_norm_load_act // self.tp_size,
288 |                         store_act=q_norm_store_act // self.tp_size,
289 |                         load_kv_cache=0,
290 |                         store_kv_cache=0,
291 |                     )
292 |     
293 |     def analyze_other_kernels(
294 |         self,
295 |         bs: int,
296 |         seq_len: int,
297 |         act_byte: int = BYTES_FP16,
298 |     ):
299 |         norm_flops = bs * seq_len * 4 * self.hidden_size  # mlp_norm, attn_norm
300 |         norm_load_weight = self.hidden_size * BYTES_FP16
301 |         norm_load_act = bs * seq_len * self.hidden_size * BYTES_FP16
302 |         norm_store_act = bs * seq_len * self.hidden_size * BYTES_FP16
303 | 
304 |         # silu 和 dot * 都是纯逐元素操作算子
305 |         silu_dot_flops = (bs * 4 * seq_len * self.intermediate_size)  # 每个张量元素执行 4 次操作
306 |         silu_dot_load_act = bs * 2 * seq_len * self.intermediate_size * act_byte
307 |         silu_dot_store_act = (bs * 2 * seq_len * self.intermediate_size * act_byte)
308 | 
309 |         mlp_add_flops = bs * seq_len * self.hidden_size
310 |         mlp_add_load_act = bs * seq_len * self.hidden_size * act_byte
311 |         mlp_add_store_act = bs * seq_len * self.hidden_size * act_byte
312 | 
313 |         # other kernels (memory bound)
314 |         kernel_names = ["attn_norm", "mlp_norm", "mlp_silu_dot", "attn_add", "mlp_add"]
315 |         flops_list = [norm_flops, norm_flops, silu_dot_flops, mlp_add_flops, mlp_add_flops]
316 |         
317 |         load_act_list = [norm_load_act, norm_load_act, silu_dot_load_act, mlp_add_load_act, mlp_add_load_act,]
318 |         store_act_list = [norm_store_act, norm_store_act, silu_dot_store_act, mlp_add_store_act, mlp_add_store_act,]
319 | 
320 |         # prefill/decode 阶段
321 |         for stage in ["prefill", "decode"]:
322 |             for i, kernel_name in enumerate(kernel_names):
323 |                 load_weight = (0 if (kernel_name not in ["attn_norm", "mlp_norm"]) else norm_load_weight)
324 | 
325 |                 load_act = load_act_list[i]
326 |                 store_act = store_act_list[i]
327 |                 flops = flops_list[i]
328 | 
329 |                 if stage == "decode":
330 |                     flops = int(flops // seq_len)
331 |                     load_act = int(load_act // seq_len)
332 |                     store_act = int(store_act // seq_len)
333 |                     
334 |                 self._analyze_to_results(
335 |                     stage,
336 |                     kernel_name,
337 |                     flops=flops // self.tp_size,
338 |                     load_weight=load_weight // self.tp_size,
339 |                     load_act=load_act // self.tp_size,
340 |                     store_act=store_act // self.tp_size,
341 |                     load_kv_cache=0,
342 |                     store_kv_cache=0,
343 |                 )
344 |     
345 |     def analyze_model(
346 |         self,
347 |         bs: int,
348 |         seq_len: int,
349 |         generate_len: int = 0,
350 |         flash_attn: bool = False,
351 |         act_byte: int = BYTES_FP16,
352 |         kv_byte: int = BYTES_FP16,
353 |     ):
354 |         """
355 |         Analyze the model and save the results.
356 |         This function is used to analyze the model and save the results.
357 |         """
358 |         # 1. analyze linear layers
359 |         self.analyze_linear_layers(bs, seq_len, act_byte=act_byte, kv_byte=kv_byte)
360 | 
361 |         # 2. analyze self attention kernels
362 |         self.analyze_self_atten_kernel(
363 |             bs, seq_len, generate_len, 
364 |             num_kv_heads=self.num_kv_heads, 
365 |             num_heads=self.num_heads, 
366 |             head_dim=self.head_dim, 
367 |             flash_attn=flash_attn, 
368 |             act_byte=act_byte, 
369 |             kv_byte=kv_byte
370 |         )
371 | 
372 |         # 3. analyze other kernels
373 |         self.analyze_other_kernels(
374 |             bs, seq_len,
375 |         )
376 | 
377 |         return self.results
378 |     
379 | 
380 | # ---------------------------------------------------------------------------
381 | # Transformer‑layer graph visualisation
382 | # ---------------------------------------------------------------------------
383 | 
384 | _DEPENDENCIES = {
385 |     "input": [],
386 |     "attn_norm": ["input"],
387 |     "q_proj": ["attn_norm"],
388 |     "k_proj": ["attn_norm"],
389 |     "v_proj": ["attn_norm"],
390 |     "qk_matmul": ["q_proj", "k_proj"],
391 |     "softmax": ["qk_matmul"],
392 |     "sv_matmul": ["softmax", "v_proj"],
393 |     "out_proj": ["sv_matmul"],
394 |     "attn_add": ["input", "out_proj"],
395 |     "mlp_norm": ["attn_add"],
396 |     "gate_proj": ["mlp_norm"],
397 |     "up_proj": ["mlp_norm"],
398 |     "mlp_silu_dot": ["up_proj", "gate_proj"],
399 |     "down_proj": ["mlp_silu_dot"],
400 |     "mlp_add": ["attn_add", "down_proj"],
401 |     "output": ["mlp_add"],
402 | }
403 | 
404 | class LayerGraphVisualizer:
405 |     """Render a transformer layer’s roofline‑analysis graph as a PNG."""
406 | 
407 |     def __init__(self, model_type: str, results: dict, shapes: dict = None) -> None:
408 |         self.model_type = model_type
409 |         self.results = results
410 |         if model_type == "qwen3":
411 |             # qwen3 模型中有额外的 q_norm 和 k_norm 层
412 |             _DEPENDENCIES["q_norm"] = ["q_proj"]
413 |             _DEPENDENCIES["k_norm"] = ["k_proj"]
414 |         # self.shapes = shapes or {}          # optional {kernel: "B×S×C"} mapping
415 | 
416 |     # --------------------------------------------------------------------- #
417 |     # internal helpers
418 |     # --------------------------------------------------------------------- #
419 |     def _label(self, node: str, kernel_stats: dict) -> str:
420 |         """Build a neat multi‑line Graphviz label, optionally with shape info."""
421 |         label = f"{node}\nFlops: {kernel_stats['flops']}, Access: {kernel_stats['memory_access']}, \nParams: {kernel_stats.get('load_weight', 0)}, Bound: {kernel_stats.get('bound', 'N/A')}"
422 |         return label
423 | 
424 |     # --------------------------------------------------------------------- #
425 |     # public API
426 |     # --------------------------------------------------------------------- #
427 |     def render(self, base_path: str = "layer_graph") -> None:
428 |         """Generate one PNG per stage (prefill / decode) under ./figures/."""
429 |         from graphviz import Digraph
430 | 
431 |         for stage, stage_res in self.results.items():
432 |             dot = Digraph(
433 |                 format="png",
434 |                 node_attr={"style": "filled", "shape": "box", "fontname": "Arial"},
435 |             )
436 | 
437 |             # Only include nodes and deps relevant for this stage, but always include "input" and "output"
438 |             pruned_deps = {
439 |                 n: [d for d in deps if d in stage_res or d in ("input","output")]
440 |                 for n, deps in _DEPENDENCIES.items()
441 |                 if n in stage_res or n in ("input","output")
442 |             }
443 | 
444 |             for node, deps in pruned_deps.items():
445 |                 color = (
446 |                     "lightblue" if "proj" in node
447 |                     else "plum" if "matmul" in node
448 |                     else "lightcyan"
449 |                 )
450 |                 if node in stage_res:
451 |                     label = self._label(node, stage_res[node])
452 |                 else:
453 |                     # default zero stats for input/output
454 |                     label = (
455 |                         f"{node}\n"
456 |                         "Flops: 0, Access: 0\n"
457 |                         "Params: 0, Bound: N/A"
458 |                     )
459 |                 dot.node(node, label=label, fillcolor=color)
460 |                 for dep in deps:
461 |                     if dep in pruned_deps:
462 |                         dot.edge(dep, node)
463 |             graph_path = f"./figures/grpah_{stage}_{base_path}"
464 |             dot.render(graph_path, cleanup=True)
465 | 
466 | # ---------------------------------------------------------------------------
467 | # Command‑line entry‑point
468 | # ---------------------------------------------------------------------------
469 | def _main() -> None:
470 |     import argparse, json
471 |     from pathlib import Path
472 | 
473 |     parser = argparse.ArgumentParser(
474 |         description="Generate a transformer layer graph (Graphviz PNG) from "
475 |                     "an LayerAnalyzer result JSON."
476 |     )
477 |     parser.add_argument("--result-json", type=Path, required=True,
478 |                         help="Path to the analysis‑result JSON produced by LayerAnalyzer")
479 |     parser.add_argument("--model-type", required=True,
480 |                         help="Model type tag, e.g. 'llama' or 'qwen3'")
481 |     parser.add_argument("--output", default="layer_graph",
482 |                         help="Base filename for the generated PNG(s)")
483 |     args = parser.parse_args()
484 | 
485 |     with args.result_json.open() as fp:
486 |         results = json.load(fp)
487 | 
488 |     LayerGraphVisualizer(args.model_type, results).render(args.output)
489 | 
490 | if __name__ == "__main__":  # pragma: no cover
491 |     _main()


--------------------------------------------------------------------------------
/llm_counts/roofline_model.py:
--------------------------------------------------------------------------------
  1 | from dataclasses import dataclass
  2 | from typing import Sequence, List, Dict
  3 | import numpy as np
  4 | import matplotlib.pyplot as plt
  5 | import argparse
  6 | 
  7 | # 全局常量：最大算术强度和最大 TFLOPS，用于坐标轴范围
  8 | MAX_OI = 1400
  9 | MAX_TFLOPS = 2000
 10 | 
 11 | @dataclass(frozen=True)
 12 | class GPU:
 13 |     """GPU 硬件参数：FP16 峰值算力（TFLOPS）和带宽（TB/s）。"""
 14 |     name: str
 15 |     fp16_tflops: float  # TFLOPS
 16 |     hbm_bw: float       # TB/s
 17 | 
 18 | @dataclass(frozen=True)
 19 | class ModelConfig:
 20 |     """模型配置参数"""
 21 |     name: str
 22 |     total_flops: float   # 总 FLOP (以 TeraFLOP 为单位)
 23 |     total_bytes: float   # 总内存访问 (以 TeraByte 为单位)
 24 |     color: str           # 绘图颜色
 25 | 
 26 | def roofline_analysis(
 27 |     peak_flops: float,
 28 |     bandwidth: float,
 29 |     total_flops: float,
 30 |     total_mac_bytes: float
 31 | ) -> tuple[float, float, str]:
 32 |     """
 33 |     Analyzes the roofline model and returns the arithmetic intensity, 
 34 |     attainable FLOPs, and the bounding factor (memory or compute).
 35 |     """
 36 |     if total_mac_bytes == 0:  # 防止除以零
 37 |         return 0, peak_flops, "compute"
 38 |     
 39 |     ai = total_flops / total_mac_bytes
 40 |     turning_point = peak_flops / bandwidth
 41 | 
 42 |     if ai < turning_point:
 43 |         return ai, ai * bandwidth, "memory"
 44 |     else:
 45 |         return ai, peak_flops, "compute"
 46 | 
 47 | def plot_roofline(
 48 |     models: Sequence[ModelConfig],
 49 |     gpus: Sequence[GPU],
 50 |     output_file: str = "roofline_optimized.png"
 51 | ) -> None:
 52 |     """
 53 |     绘制经过优化的、用于比较的 Roofline 曲线。
 54 | 
 55 |     主要优化点:
 56 |     1. 使用 Log-Log 坐标轴，符合行业标准。
 57 |     2. 采用智能图例管理，避免图例冗长。
 58 |     3. 使用 adjust_text 自动防止文本标签重叠。
 59 |     4. 优化视觉设计，突出重点信息。
 60 |     """
 61 |     fig, ax = plt.subplots(figsize=(14, 10))
 62 |     plot_colors = ['red', 'blue', 'green', 'orange', 'purple']
 63 |     # --- 1. 绘制 GPU 屋顶线 (作为背景) ---
 64 |     # 使用对数坐标轴，范围从 0.1 到 10000
 65 |     oi_range = np.logspace(-1, 4, 200)
 66 |     gpu_linestyles = ['-', '--', '-.', ':']
 67 | 
 68 |     for i, gpu in enumerate(gpus):
 69 |         roofline = np.minimum(oi_range * gpu.hbm_bw, gpu.fp16_tflops)
 70 |         linestyle = gpu_linestyles[i % len(gpu_linestyles)]
 71 |         # 使用统一的灰色系，不同线型来区分，作为背景不干扰主要数据
 72 |         ax.plot(
 73 |             oi_range,
 74 |             roofline,
 75 |             linestyle=linestyle,
 76 |             linewidth=2,
 77 |             label=f"{gpu.name} Roof (Turn @ {gpu.fp16_tflops / gpu.hbm_bw:.1f})",
 78 |             color=plot_colors[i % len(plot_colors)],
 79 |             alpha=0.9
 80 |         )
 81 | 
 82 |     # --- 2. 绘制模型性能点并收集文本标签 ---
 83 |     text_labels = []
 84 | 
 85 |     for model in models:
 86 |         # **智能图例技巧**: 为每个模型创建一个“虚拟”的图例条目，
 87 |         # 这样图例中每个模型只显示一次。
 88 |         ax.scatter([], [], color=model.color, marker='o', s=120, label=f"{model.name}")
 89 | 
 90 |         for gpu in gpus:
 91 |             ai, attainable, bound = roofline_analysis(
 92 |                 gpu.fp16_tflops,
 93 |                 gpu.hbm_bw,
 94 |                 model.total_flops,
 95 |                 model.total_bytes
 96 |             )
 97 | 
 98 |             ax.scatter(
 99 |                 ai,
100 |                 attainable,
101 |                 s=120,
102 |                 marker='o', # 使用统一标记，用颜色区分模型
103 |                 color=model.color,
104 |                 edgecolors='black',
105 |                 zorder=5  # 确保点在最上层
106 |             )
107 | 
108 |             # 准备文本标签，稍后由 adjust_text 统一处理
109 |             label_text = f"{gpu.name}\n{attainable:.0f} TFLOPS ({bound[:3]}.)"
110 |             text_labels.append(
111 |                 ax.text(ai, attainable, label_text, fontsize=9, ha='center')
112 |             )
113 | 
114 |     # --- 3. 图表美化与最终处理 ---
115 |     # 切换到对数坐标轴
116 |     ax.set_xscale('log')
117 |     ax.set_yscale('log')
118 | 
119 |     ax.set_xlabel("Arithmetic Intensity (FLOPs / Bytes) [log scale]", fontsize=12)
120 |     ax.set_ylabel("Attainable Performance (TFLOPS) [log scale]", fontsize=12)
121 |     ax.set_title("Comparative Roofline Analysis", fontsize=16, fontweight='bold')
122 |     
123 |     # 使用 'both' 在主次刻度上都显示网格，对 log 尺度很友好
124 |     ax.grid(True, which="both", linestyle="--", alpha=0.5)
125 | 
126 |     # 自动调整坐标轴范围，并留出一些边距
127 |     ax.autoscale(True)
128 |     ax.set_xlim(left=max(ax.get_xlim()[0], 0.5))
129 |     ax.set_ylim(bottom=max(ax.get_ylim()[0], 10))
130 | 
131 |     # **关键步骤**: 调用 adjust_text 来智能地防止标签重叠
132 |     # 它会自动移动标签，并可以用箭头指向原始数据点
133 |     from adjustText import adjust_text
134 |     adjust_text(
135 |         text_labels,
136 |         ax=ax,
137 |         arrowprops=dict(arrowstyle='->', color='gray', lw=0.5)
138 |     )
139 | 
140 |     # 图例现在很简洁，可以优雅地放在图内
141 |     ax.legend(fontsize=10, loc='lower right')
142 | 
143 |     fig.tight_layout()
144 |     plt.savefig(output_file, dpi=300)
145 |     print(f"Optimized roofline plot saved to {output_file}")
146 |     plt.close(fig)
147 | 
148 | 
149 | def main():
150 |     # 预定义 GPU 配置
151 |     GPUS = [
152 |         GPU("H100", 989, 3.35),
153 |         GPU("A100", 312, 2.039),
154 |         GPU("RTX4090", 330, 1.008),
155 |         GPU("MI300X", 1150, 5.2),
156 |         GPU("L40S", 363, 0.864),
157 |     ]
158 |     
159 |     # 预定义模型配置
160 |     MODELS = {
161 |         "gpt3": ModelConfig(
162 |             "GPT-3 (175B)", 
163 |             total_flops=314000,  # TFLOPs (3.14e14 FLOPs)
164 |             total_bytes=1000,    # TB (1e15 bytes)
165 |             color='red'
166 |         ),
167 |         "llama2-70b": ModelConfig(
168 |             "LLaMA2-70B", 
169 |             total_flops=70000,   # TFLOPs (7e13 FLOPs)
170 |             total_bytes=200,     # TB (2e14 bytes)
171 |             color='blue'
172 |         ),
173 |         "qwen2.5-3b": ModelConfig(
174 |             "Qwen2.5-3B", 
175 |             total_flops=3000,    # TFLOPs (3e12 FLOPs)
176 |             total_bytes=10,      # TB (1e13 bytes)
177 |             color='green'
178 |         ),
179 |     }
180 |     
181 |     # 命令行参数解析
182 |     parser = argparse.ArgumentParser(description="Roofline Model Analysis Tool")
183 |     parser.add_argument("--models", nargs="+", choices=list(MODELS.keys()),
184 |                         default=["gpt3", "llama2-70b", "qwen2.5-3b"],
185 |                         help="Models to analyze (default: all)")
186 |     parser.add_argument("--gpus", nargs="+", 
187 |                         default=["H100", "A100", "RTX4090"],
188 |                         help="GPUs to analyze (default: H100, A100, RTX4090)")
189 |     parser.add_argument("--output", default="roofline_analysis.png",
190 |                         help="Output filename (default: roofline_analysis.png)")
191 |     
192 |     args = parser.parse_args()
193 |     
194 |     # 获取选中的模型和GPU
195 |     selected_models = [MODELS[model] for model in args.models]
196 |     selected_gpus = [gpu for gpu in GPUS if gpu.name in args.gpus]
197 |     
198 |     if not selected_gpus:
199 |         print("Error: No valid GPUs selected. Available options:")
200 |         for gpu in GPUS:
201 |             print(f"  - {gpu.name}")
202 |         return
203 |     
204 |     # 生成屋顶线图
205 |     plot_roofline(selected_models, selected_gpus, args.output)
206 | 
207 | if __name__ == "__main__":
208 |     main()


--------------------------------------------------------------------------------
/llm_counts/utils/__pycache__/config.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harleyszhang/llm_counts/2cd9bc0dd0aa757000b8bd9d5867e986d42828aa/llm_counts/utils/__pycache__/config.cpython-310.pyc


--------------------------------------------------------------------------------
/llm_counts/utils/__pycache__/config.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harleyszhang/llm_counts/2cd9bc0dd0aa757000b8bd9d5867e986d42828aa/llm_counts/utils/__pycache__/config.cpython-311.pyc


--------------------------------------------------------------------------------
/llm_counts/utils/__pycache__/config.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harleyszhang/llm_counts/2cd9bc0dd0aa757000b8bd9d5867e986d42828aa/llm_counts/utils/__pycache__/config.cpython-312.pyc


--------------------------------------------------------------------------------
/llm_counts/utils/__pycache__/constants.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harleyszhang/llm_counts/2cd9bc0dd0aa757000b8bd9d5867e986d42828aa/llm_counts/utils/__pycache__/constants.cpython-310.pyc


--------------------------------------------------------------------------------
/llm_counts/utils/__pycache__/constants.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harleyszhang/llm_counts/2cd9bc0dd0aa757000b8bd9d5867e986d42828aa/llm_counts/utils/__pycache__/constants.cpython-311.pyc


--------------------------------------------------------------------------------
/llm_counts/utils/__pycache__/constants.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harleyszhang/llm_counts/2cd9bc0dd0aa757000b8bd9d5867e986d42828aa/llm_counts/utils/__pycache__/constants.cpython-312.pyc


--------------------------------------------------------------------------------
/llm_counts/utils/__pycache__/utils.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harleyszhang/llm_counts/2cd9bc0dd0aa757000b8bd9d5867e986d42828aa/llm_counts/utils/__pycache__/utils.cpython-310.pyc


--------------------------------------------------------------------------------
/llm_counts/utils/__pycache__/utils.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harleyszhang/llm_counts/2cd9bc0dd0aa757000b8bd9d5867e986d42828aa/llm_counts/utils/__pycache__/utils.cpython-311.pyc


--------------------------------------------------------------------------------
/llm_counts/utils/config.py:
--------------------------------------------------------------------------------
  1 | # -*- coding  : utf-8 -*-
  2 | # Description : gpu, model, Parallelism, data, train and inference config definition
  3 | 
  4 | import math, json
  5 | from .constants import *
  6 | from typing import Optional
  7 | from dataclasses import dataclass
  8 | from enum import Enum
  9 | from functools import total_ordering
 10 | from transformers import AutoConfig
 11 | import os
 12 | 
 13 | 
 14 | class ActivationRecomputation(Enum):
 15 |     NONE = 0
 16 |     """No activation recomputation; requires the most amount of memory."""
 17 | 
 18 |     SELECTIVE = 1
 19 |     """Selectively checkpoints and recomputes only parts of each transformer
 20 |     layer that take up a considerable amount of memory but are not
 21 |     computationally expensive to recompute, i.e. Q K V matrix multiplies, 
 22 |     QK^T matrix multiply, softmax, softmax dropout, and attention over V."""
 23 | 
 24 |     FULL = 2
 25 |     """Full activation recomputation stores the input to EVERY transformer
 26 |     layer, which is sharded across the tensor parallel group, thus requiring an
 27 |     extra all-gather (ignored for now) per layer and add communication
 28 |     overhead; requires the lease amount of memory; requires an extra forward
 29 |     pass."""
 30 | 
 31 | 
 32 | @total_ordering
 33 | class DSZeRO(Enum):
 34 |     NONE = 0
 35 |     """No DeepSPeed ZeRO; requires the most amount of memory."""
 36 | 
 37 |     STAGE_1 = 1
 38 |     """ZeRO stage 1 shards the optimizer states across the data parallel
 39 |     group."""
 40 | 
 41 |     STAGE_2 = 2
 42 |     """ZeRO stage 2 shards the optimizer states and gradients across the data
 43 |     parallel group."""
 44 | 
 45 |     STAGE_3 = 3
 46 |     """ZeRO stage 3 shards the optimizer states, gradients, and model weights
 47 |     across the data parallel group."""
 48 | 
 49 |     def __lt__(self, other):
 50 |         # 炫技写法
 51 |         if other.__class__ is self.__class__:
 52 |             return self.value < other.value  # Enum 枚举类自动赋值
 53 |         return NotImplemented
 54 | 
 55 |     def __eq__(self, other):
 56 |         if isinstance(other, DSZeRO):
 57 |             return self.value == other.value
 58 |         return NotImplemented
 59 | 
 60 | 
 61 | @dataclass
 62 | class GPUEfficiencyConfig:
 63 |     flops_efficiency: float = 1.0
 64 |     hbm_memory_efficiency: float = 1.0
 65 |     intra_node_memory_efficiency: float = 1.0
 66 |     inter_node_memory_efficiency: float = 1.0
 67 | 
 68 | 
 69 | @dataclass
 70 | class InferenceConfig:
 71 |     """Inference configuration dataclass."""
 72 | 
 73 |     bs: int = None  # batch size
 74 |     seq_len: int = 522  # input sequence length
 75 |     generate_len: int = 1526  # number of tokens to generate
 76 |     context_len: int = None  # context length
 77 |     bytes_per_param: int = BYTES_FP16  # model weight bytes
 78 |     act_dtype_bytes: int = BYTES_FP16  # activation data type bytes
 79 |     kv_cache_bytes: int = BYTES_FP16  # key/value cache data type bytes
 80 | 
 81 |     def __post_init__(self):
 82 |         if self.context_len is None:
 83 |             self.context_len = self.seq_len + self.generate_len
 84 | 
 85 | 
 86 | @dataclass
 87 | class ParallelismConfig:
 88 |     """dataclass module provides a decorator and functions for automatically adding 
 89 |     generated special methods such as __init__() and __repr__() to user-defined classes.
 90 |     """
 91 | 
 92 |     tp_size: int = (
 93 |         1  # tensor parallelism size, Megatron-LM tensor parallelism implementation
 94 |     )
 95 |     pp_size: int = (
 96 |         1  # pipeline parallelism size, Megatron-LM pipeline parallelism implementation
 97 |     )
 98 |     dp_size: int = 1  # data parallelism size, DeepSpeed Zero parallelism implementation
 99 |     sp_size: int = (
100 |         1  # sequence parallelism size, Megatron-LM sequence parallelism implementation
101 |     )
102 | 
103 | 
104 | @dataclass
105 | class ModelConfig:
106 |     num_layers: Optional[int] = None  # number of transformer layers (blocks)
107 |     num_heads: Optional[int] = None  # number of attention heads
108 |     head_dim: Optional[int] = None          # <— 新增：允许显式传入
109 |     hidden_size: Optional[int] = None  # hidden dimension
110 |     vocab_size: Optional[int] = None  # vocabulary size
111 |     num_kv_heads: Optional[int] = None
112 |     max_seq_len: Optional[int] = None  # max sequence length
113 |     intermediate_size: Optional[int] = None  # hidden dimension of FFN, default to 4 * hidden_size
114 |     model_type: str = (
115 |         None  # model type as tagged on Hugging Face (e.g., gpt2, opt, llama.)
116 |     )
117 |     model_name: str = (
118 |         None  # model name as tagged on Hugging Face (e.g., gpt2-xl, opt, llama-13b.)
119 |     )
120 | 
121 |     # -------- post-init 逻辑 -------- #
122 |     def __post_init__(self) -> None:
123 |         # ① KV-heads 默认 = Q-heads
124 |         if self.num_kv_heads is None:
125 |             self.num_kv_heads = self.num_heads
126 | 
127 |         # ② FFN 维度默认 = 4×hidden_size
128 |         if self.intermediate_size is None:
129 |             self.intermediate_size = self.hidden_size * 4
130 | 
131 |         # ③ **核心：head_dim 计算**  
132 |         #    若用户 / HF config 已提供，则直接用；否则按经典公式推断
133 |         if self.head_dim is None:
134 |             self.head_dim = self.hidden_size // self.num_heads
135 | 
136 |             # ④ 一致性检查（可选：遇到 MoE/GQA 可放宽）
137 |             assert (
138 |                 self.hidden_size == self.head_dim * self.num_heads
139 |             ), (
140 |                 "hidden_size 与 num_heads×head_dim 不一致；"
141 |                 "若模型采用变体架构，请显式指定 head_dim"
142 |             )
143 | 
144 |     @classmethod
145 |     def from_pretrained(
146 |         cls, pretrained_model_name_or_path: str, trust_remote_code: bool = True
147 |     ):
148 |         """
149 |         Load a Hugging Face model configuration and map it to ModelConfig.
150 | 
151 |         Args:
152 |             pretrained_model_name_or_path (str): Path or name of the pretrained model.
153 |             trust_remote_code (bool): Whether to trust remote code for custom models.
154 | 
155 |         Returns:
156 |             ModelConfig: An instance of the custom ModelConfig class.
157 |         """
158 |         # Load the Hugging Face configuration
159 |         hf_config = AutoConfig.from_pretrained(
160 |             pretrained_model_name_or_path, trust_remote_code=trust_remote_code
161 |         )
162 | 
163 |         # Create a ModelConfig instance by mapping the fields
164 |         return cls(
165 |             num_layers=hf_config.num_hidden_layers,
166 |             num_heads=hf_config.num_attentionum_headss,
167 |             hidden_size=hf_config.hidden_size,
168 |             vocab_size=hf_config.vocab_size,
169 |             num_kv_heads=getattr(hf_config, "num_kv_heads", None),
170 |             max_seq_len=hf_config.max_position_embeddings,
171 |             intermediate_size=hf_config.intermediate_size,
172 |             model_type=hf_config.model_type,
173 |             model_name=hf_config.name_or_path,
174 |         )
175 | 
176 | 
177 | @dataclass
178 | class GPUConfig:
179 |     # 1, gpu 型号和显存大小
180 |     name: str  # GPU config name
181 |     memory_GPU_in_GB: float  # memory per GPU in GB
182 |     onchip_buffer: float = None  # on-chip buffer size in bytes, e.g., register file size
183 | 
184 |     # 2, gpu 显存带宽、节点内带宽、节点间带宽
185 |     hbm_bandwidth_in_GB_per_sec: float=None  # GPU HBM bandwidth in GB/s
186 |     intra_node_bandwidth_in_GB_per_sec: float=None # intra node GPU bandwidth in GB/s.(PCIE/NVLINK)
187 |     intra_node_min_message_latency: float=None # minimum intra node message latency in seconds
188 |     # inter node bandwidth in GB/s, assuming Mellanox 200Gbps HDR Infiniband
189 |     inter_node_bandwidth_in_GB_per_sec: float = 200  
190 | 
191 |     # 3, 不同精度的 Tensor core 的计算性能
192 |     peak_fp32_TFLOPS: float = None  # peak Tensor TFLOPS for FP32
193 |     peak_fp16_TFLOPS: float = None  # peak Tensor TFLOPS for FP16
194 |     peak_int8_TFLOPS: float = None  # peak Tensor TFLOPS for INT8
195 |     peak_int4_TFLOPS: float = None  # peak Tensor TFLOPS for INT4
196 | 
197 |     FLOPS_EFFICIENCY = 0.9
198 |     HBM_MEMORY_EFFICIENCY = 0.9
199 |     INTRA_NODE_BANDWIDTH_EFFICIENCY = 0.9
200 | 
201 |     def __post_init__(self):
202 |         """
203 |         Post-initialization processing to compute missing values and apply efficiencies.
204 |         """
205 |         # Ensure FP32 TFLOPS is calculated if missing
206 |         if self.peak_fp32_TFLOPS is None and self.peak_fp16_TFLOPS is not None:
207 |             self.peak_fp32_TFLOPS = self.peak_fp16_TFLOPS / 2
208 | 
209 |         # Ensure INT8 and INT4 TFLOPS are calculated if missing
210 |         if self.peak_int8_TFLOPS is None and self.peak_fp16_TFLOPS is not None:
211 |             self.peak_int8_TFLOPS = 2 * self.peak_fp16_TFLOPS
212 |         if self.peak_int4_TFLOPS is None and self.peak_fp16_TFLOPS is not None:
213 |             self.peak_int4_TFLOPS = 4 * self.peak_fp16_TFLOPS
214 | 
215 |         # Apply FLOPS efficiency and round to nearest integer
216 |         if self.FLOPS_EFFICIENCY:
217 |             self.actual_peak_fp32_TFLOPS = math.ceil(
218 |                 self.peak_fp32_TFLOPS * self.FLOPS_EFFICIENCY
219 |             )
220 |             self.actual_peak_fp16_TFLOPS = math.ceil(
221 |                 self.peak_fp16_TFLOPS * self.FLOPS_EFFICIENCY
222 |             )
223 |             self.actual_peak_int8_TFLOPS = math.ceil(
224 |                 self.peak_int8_TFLOPS * self.FLOPS_EFFICIENCY
225 |             )
226 |             self.actual_peak_int4_TFLOPS = math.ceil(
227 |                 self.peak_int4_TFLOPS * self.FLOPS_EFFICIENCY
228 |             )
229 | 
230 | 
231 | class LLMConfigs(object):
232 |     """LLMConfigs is a dataclass that contains all the configurations for the LLM model."""
233 | 
234 |     def __init__(
235 |         self,
236 |         gpu_config: GPUConfig,
237 |         model_config: ModelConfig,
238 |         parallelism_config: ParallelismConfig = ParallelismConfig(),
239 |         inference_config: InferenceConfig = InferenceConfig(),
240 |         gpu_efficiency_config: GPUEfficiencyConfig = GPUEfficiencyConfig(),
241 |     ) -> None:
242 |         self.model_config = model_config
243 |         self.gpu_config = gpu_config
244 |         self.parallelism_config = parallelism_config
245 |         self.inference_config = inference_config  # 用户自行指定配置
246 |         self.gpu_efficiency_config = gpu_efficiency_config  # 用户自行指定配置
247 | 
248 | 
249 | def get_model_and_gpu_config_by_name(
250 |     model_name="llama-13b", gpu_name="v100-pcie-32gb"
251 | ) -> dict:
252 |     """Read model and gpu configs from a json file."""
253 |     current_dir = os.path.dirname(__file__)
254 |     model_config_path = os.path.join(current_dir, "../configs/model_configs.json")
255 |     gpu_config_path = os.path.join(current_dir, "../configs/gpu_configs.json")
256 | 
257 |     with open(model_config_path, "r") as f:
258 |         config_json = json.load(f)  # 类似于 dict 类型
259 |         if model_name in config_json:
260 |             print(f"model name {model_name} is found in {model_config_path}")
261 |             config_dict = config_json[model_name]
262 |             model_config = ModelConfig(**config_dict)
263 |         else:
264 |             print(
265 |                 f"model name {model_name} is not found in {model_config_path} so need to apply transformers AutoConfig"
266 |             )
267 |             # 加载模型配置
268 |             model_config = ModelConfig.from_pretrained(model_name, trust_remote_code=True)
269 | 
270 |     with open(gpu_config_path, "r") as f:
271 |         config_json = json.load(f)  # 类似于 dict 类型
272 |         config_dict = config_json[gpu_name]
273 |         assert gpu_name in config_json, (
274 |             f"gpu name {gpu_name} not found in {gpu_config_path}"
275 |         )
276 |         gpu_config = GPUConfig(**config_dict)
277 | 
278 |     return model_config, gpu_config
279 | 
280 | 
281 | def get_TFLOPS_per_gpu(
282 |     gpu_config: GPUConfig, data_type="fp16", flops_efficiency=FLOPS_EFFICIENCY
283 | ) -> float:
284 |     """Get the expected TFLOPS per GPU for the specified data type
285 |     configuration/GPU (adjusted by flops_efficiency)
286 | 
287 |     Returns:
288 |         float: TFLOPS per GPU and unit is T.
289 |     """
290 |     if data_type == "int8":
291 |         gemm_TFOPS = gpu_config.peak_int8_TFLOPS
292 |     elif data_type == "fp16":
293 |         gemm_TFOPS = gpu_config.peak_fp16_TFLOPS
294 |     else:
295 |         print("weight_bits and activation_bits must be 8, or 16!")
296 | 
297 |     return gemm_TFOPS * flops_efficiency
298 | 
299 | 
300 | def get_gpu_hbm_bandwidth(
301 |     gpu_config: GPUConfig, hbm_memory_efficiency=HBM_MEMORY_EFFICIENCY
302 | ) -> list:
303 |     return gpu_config.hbm_bandwidth_in_GB_per_sec * hbm_memory_efficiency, gpu_config.onchip_buffer
304 | 
305 | 
306 | def get_intra_node_bandwidth(
307 |     gpu_config: GPUConfig, intra_node_memory_efficiency=INTRA_NODE_MEMORY_EFFICIENCY
308 | ) -> float:
309 |     return gpu_config.intra_node_bandwidth_in_GB_per_sec * intra_node_memory_efficiency
310 | 
311 | 
312 | def get_inter_node_bandwidth(
313 |     gpu_config: GPUConfig, inter_node_memory_efficiency=INTER_NODE_MEMORY_EFFICIENCY
314 | ) -> float:
315 |     return gpu_config.inter_node_bandwidth_in_GB_per_sec * inter_node_memory_efficiency
316 | 


--------------------------------------------------------------------------------
/llm_counts/utils/constants.py:
--------------------------------------------------------------------------------
 1 | from enum import Enum
 2 | from functools import total_ordering
 3 | 
 4 | #########################################
 5 | #######     llm profiler    ############
 6 | #########################################
 7 | 
 8 | FLOPS_EFFICIENCY = (
 9 |     0.9  # FLOPS efficiency achieved by Megatron-LM is ~0.5 for LLM training
10 | )
11 | HBM_MEMORY_EFFICIENCY = 0.9  # GPU HBM memory efficiency
12 | INTRA_NODE_MEMORY_EFFICIENCY = 0.75  # intra-node (nvlink) memory efficiency
13 | INTER_NODE_MEMORY_EFFICIENCY = 0.9  # inter-node memory efficiency
14 | 
15 | NUM_GPUS_PER_NODE = 8  # number of GPUs per node
16 | 
17 | TOLERANCE = 0.01  # tolerance for floating point comparisons
18 | 
19 | BITS_PER_BYTE = 8  # number of bits in a byte
20 | 
21 | BITS_FP32 = 32  # number of bits in FP32 data type
22 | BITS_FP16 = 16  # number of bits in FP16 data type
23 | BITS_INT8 = 8  # number of bits in INT8 data type
24 | BITS_INT4 = 4  # number of bits in INT4 data type
25 | 
26 | BYTES_FP32 = BITS_FP32 // BITS_PER_BYTE  # number of bytes in FP32 data type
27 | BYTES_FP16 = BITS_FP16 // BITS_PER_BYTE  # number of bytes in FP16 data type
28 | BYTES_INT8 = BITS_INT8 // BITS_PER_BYTE  # number of bytes in INT8 data type
29 | BYTES_INT4 = BITS_INT4 // BITS_PER_BYTE  # number of bytes in INT4 data type
30 | 
31 | PRINT_LINE_WIDTH = 100
32 | 
33 | GPUS = [1, 2, 4, 8]
34 | 
35 | 
36 | @total_ordering
37 | class ActivationRecomputation(Enum):
38 |     NONE = 0
39 |     """No activation recomputation; requires the most amount of memory."""
40 |     ATTN_COMPUTE = 1
41 |     """Selectively checkpoints the attention computation (
42 |     QK^T matrix multiply, softmax, softmax dropout, and attention overV.) 
43 |     in the attention module of a transformer layer; this part takes up a 
44 |     considerable amount of memory but are not computationally expensive to recompute"""
45 |     ATTN = 2
46 |     """Selectively checkpoints the input to the attention module in a transformer layer; 
47 |     requires an extra forward pass on attention."""
48 |     NORM_ATTN_NORM = 3
49 |     """Selectively checkpoints the input to the sequence of modules (layernom-attention-layernom) 
50 |     in a transformer layer; requires an extra forward pass on (layernom-attention-layernom)."""
51 |     FULL = 4
52 |     """Full activation recomputation stores the input to the transformer layer; requires the least
53 |     amount of memory; requires an extra forward pass of the layer."""
54 | 
55 |     def __lt__(self, other):
56 |         if self.__class__ is other.__class__:
57 |             return self.value < other.value
58 |         return NotImplemented
59 | 


--------------------------------------------------------------------------------
/llm_counts/utils/utils.py:
--------------------------------------------------------------------------------
  1 | import pprint
  2 | from .constants import *
  3 | 
  4 | class Formatter(object):
  5 |     @classmethod
  6 |     def format_value(cls, value, category):
  7 |         """根据类别统一格式化 value."""
  8 |         if category == "params" or category == "flops":
  9 |             return num_to_string(value)
 10 |         elif category == "latency":
 11 |             return latency_to_string(value)
 12 |         elif category == "memory":
 13 |             return f"{num_to_string(value)}B"
 14 |         return value  # 如果没有匹配，返回原值
 15 | 
 16 |     @classmethod
 17 |     def print_format_summary_dict(
 18 |         self,
 19 |         summary_dict: dict,
 20 |         depth: int,
 21 |         category: str | None = None,
 22 |     ) -> str:
 23 |         """
 24 |         打印时对 params / flops / latency / memory 等进行统一转换显示。
 25 |         If *category* is provided, apply that formatting to every leaf value that is
 26 |         not a nested dict; otherwise fall back to key‑based inference.
 27 |         """
 28 |         if category is not None and not isinstance(summary_dict, dict):
 29 |             # Safety bail‑out (shouldn't happen)
 30 |             return summary_dict
 31 |         for key, value in summary_dict.items():
 32 |             # If category is explicitly provided, ignore key‑name heuristics
 33 |             explicit_cat = category
 34 |             if (explicit_cat == "params" or explicit_cat == "flops") or ("params" in key or "flops" in key):
 35 |                 if not isinstance(value, dict):
 36 |                     summary_dict.update({key: num_to_string(value)})
 37 |                 else:
 38 |                     self.print_format_summary_dict(
 39 |                         value, get_dict_depth(value) - 1, category
 40 |                     )  # 递归
 41 |             if explicit_cat == "latency" or "latency" in key:
 42 |                 if not isinstance(value, dict):
 43 |                     summary_dict.update({key: latency_to_string(value)})
 44 |                 else:
 45 |                     self.print_format_summary_dict(value, get_dict_depth(value) - 1, category)
 46 |             if explicit_cat == "memory" or "memory" in key:
 47 |                 if not isinstance(value, dict):
 48 |                     summary_dict.update({key: f"{num_to_string(value)}B"})
 49 |                 else:
 50 |                     self.print_format_summary_dict(value, get_dict_depth(value) - 1, category)
 51 |         if depth >= 1:
 52 |             pprint.pprint(summary_dict, indent=4, sort_dicts=False)
 53 | 
 54 | 
 55 | def print_list(list):
 56 |     """print one-dimensional list
 57 | 
 58 |     :param list: List[int]
 59 |     :return: None
 60 |     """
 61 |     for i, x in enumerate(list):
 62 |         print(x, end="\n")
 63 | 
 64 | 
 65 | def get_dict_depth(d, depth=0):
 66 |     if not isinstance(d, dict):
 67 |         return depth
 68 |     if not d:
 69 |         return depth
 70 | 
 71 |     return max(get_dict_depth(v, depth + 1) for v in d.values())
 72 | 
 73 | 
 74 | def latency_to_string(latency_in_s, precision=2, return_type="string"):
 75 |     if latency_in_s is None:
 76 |         return "None" if return_type == "string" else None
 77 | 
 78 |     day = 24 * 60 * 60
 79 |     hour = 60 * 60
 80 |     minute = 60
 81 |     ms = 1 / 1000
 82 |     us = 1 / 1000000
 83 | 
 84 |     if latency_in_s // day > 0:
 85 |         value = round(latency_in_s / day, precision)
 86 |         unit = "days"
 87 |     elif latency_in_s // hour > 0:
 88 |         value = round(latency_in_s / hour, precision)
 89 |         unit = "hours"
 90 |     elif latency_in_s // minute > 0:
 91 |         value = round(latency_in_s / minute, precision)
 92 |         unit = "minutes"
 93 |     elif latency_in_s > 1:
 94 |         value = round(latency_in_s, precision)
 95 |         unit = "s"
 96 |     elif latency_in_s > ms:
 97 |         value = round(latency_in_s / ms, precision)
 98 |         unit = "ms"
 99 |     else:
100 |         value = round(latency_in_s / us, precision)
101 |         unit = "us"
102 | 
103 |     if return_type == "string":
104 |         return f"{value} {unit}"
105 |     elif return_type == "float":
106 |         return value
107 |     else:
108 |         return (value, unit)
109 | 
110 | 
111 | def num_to_string(num, precision=2, return_type="string"):
112 |     if num is None:
113 |         return "None" if return_type == "string" else None
114 | 
115 |     if num // 10**12 > 0:
116 |         value = round(num / 10.0**12, precision)
117 |         unit = "T"
118 |     elif num // 10**9 > 0:
119 |         value = round(num / 10.0**9, precision)
120 |         unit = "G"
121 |     elif num // 10**6 > 0:
122 |         value = round(num / 10.0**6, precision)
123 |         unit = "M"
124 |     elif num // 10**3 > 0:
125 |         value = round(num / 10.0**3, precision)
126 |         unit = "K"
127 |     else:
128 |         value = num
129 |         unit = ""
130 | 
131 |     if return_type == "string":
132 |         return f"{value} {unit}".strip()
133 |     elif return_type == "float":
134 |         return value
135 |     else:
136 |         return (value, unit)
137 | 
138 | 
139 | def get_readable_summary_dict(summary_dict: dict, title="Summary") -> str:
140 |     log_str = f"\n{title.center(PRINT_LINE_WIDTH, '-')}\n"
141 |     for key, value in summary_dict.items():
142 |         if "num_tokens" in key or "num_params" in key or "flops" in key:
143 |             log_str += f"{key}: {num_to_string(value)}\n"
144 |         elif "gpu_hours" == key:
145 |             log_str += f"{key}: {int(value)}\n"
146 |         elif "memory" in key and "efficiency" not in key:
147 |             log_str += f"{key}: {num_to_string(value)}B\n"
148 |         elif "latency" in key:
149 |             log_str += f"{key}: {latency_to_string(value)}\n"
150 |         else:
151 |             log_str += f"{key}: {value}\n"
152 |     log_str += f"{'-' * PRINT_LINE_WIDTH}\n"
153 |     return log_str
154 | 
155 | 
156 | def within_range(val, target, tolerance):
157 |     return abs(val - target) / target < tolerance
158 | 
159 | 
160 | def average(lst):
161 |     if not lst:
162 |         return None
163 |     return sum(lst) / len(lst)
164 | 
165 | 
166 | def max_value(lst):
167 |     if not lst:
168 |         return None
169 |     return max(lst)
170 | 


--------------------------------------------------------------------------------
/llm_counts/utils/visualizer.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | 
  3 | # ====================================================================== #
  4 | # Refactored visualisation as a lightweight class                        #
  5 | # ====================================================================== #
  6 | class SeqLenVisualizer:
  7 |     """Encapsulates all plots & tables for a sequence‑length sweep."""
  8 | 
  9 |     def __init__(
 10 |         self,
 11 |         df: pd.DataFrame,
 12 |         model: str,
 13 |         gpu: str,
 14 |         *,
 15 |         out_dir: str = "figures",
 16 |         flops_unit: str = "TFLOPs",
 17 |         mem_unit: str = "GiB",
 18 |         dpi: int = 300,
 19 |         show: bool = False,
 20 |     ):
 21 |         from pathlib import Path
 22 |         import matplotlib.pyplot as plt
 23 | 
 24 |         self.df = df.sort_values("seq_len")
 25 |         self.model = model
 26 |         self.gpu = gpu
 27 |         self.out_dir = Path(out_dir)
 28 |         self.flops_unit = flops_unit
 29 |         self.mem_unit = mem_unit
 30 |         self.dpi = dpi
 31 |         self.show = show
 32 | 
 33 |         self.out_dir.mkdir(parents=True, exist_ok=True)
 34 |         plt.style.use("seaborn-v0_8-paper")
 35 |         plt.rcParams.update(
 36 |             {
 37 |                 "figure.facecolor": "white",
 38 |                 "axes.facecolor": "white",
 39 |                 "axes.grid": True,
 40 |                 "grid.alpha": 0.3,
 41 |                 "grid.linestyle": "--",
 42 |                 "axes.edgecolor": "#cccccc",
 43 |                 "axes.spines.top": False,
 44 |                 "axes.spines.right": False,
 45 |                 "font.size": 12,
 46 |                 "axes.titleweight": "bold",
 47 |             }
 48 |         )
 49 | 
 50 |         # Pre‑compute unit scaling divisors
 51 |         _scale = {
 52 |             "GFLOPs": 1e9,
 53 |             "TFLOPs": 1e12,
 54 |             "PFLOPs": 1e15,
 55 |             "MiB": 2**20,
 56 |             "GiB": 2**30,
 57 |         }
 58 |         self.flops_div = _scale.get(self.flops_unit, 1.0)
 59 |         self.mem_div = _scale.get(self.mem_unit, 1.0)
 60 | 
 61 |     # -------------------------- helpers ------------------------------ #
 62 |     def _save(self, fig, suffix: str):
 63 |         fig.savefig(
 64 |             self.out_dir
 65 |             / f"{self.model}_{self.gpu}_{suffix}.png",
 66 |             dpi=self.dpi,
 67 |             bbox_inches="tight",
 68 |         )
 69 |         if self.show:
 70 |             import webbrowser, os, matplotlib.pyplot as _plt
 71 |             _plt.show()
 72 |         import matplotlib.pyplot as plt
 73 |         plt.close(fig)
 74 | 
 75 |     @staticmethod
 76 |     def _line_scatter(ax, x, y, y_label, title, cmap="viridis_r"):
 77 |         sc = ax.scatter(
 78 |             x,
 79 |             y,
 80 |             c=y,
 81 |             cmap=cmap,
 82 |             s=70,
 83 |             edgecolor="black",
 84 |             linewidths=0.4,
 85 |         )
 86 |         ax.plot(x, y, linewidth=1.2, alpha=0.75)
 87 |         ax.set_xlabel("Sequence length (tokens)")
 88 |         ax.set_ylabel(y_label)
 89 |         ax.set_title(title)
 90 |         ax.grid(True, linestyle="--", alpha=0.3)
 91 |         return sc
 92 | 
 93 | 
 94 |     # ----------------------- public interface ------------------------ #
 95 |     def visualize(self):
 96 |         self._metric_figs()
 97 |         self._latency_fig()
 98 |         self._composite_fig()
 99 |         self._interactive_html()
100 |         self._print_table()
101 | 
102 |     # ------------------ individual plot generators ------------------ #
103 |     def _metric_figs(self):
104 |         import matplotlib.pyplot as plt
105 | 
106 |         metrics = [
107 |             ("flops", "prefill_flops", self.flops_div, f"Prefill {self.flops_unit}"),
108 |             ("memory", "consume_memory_per_gpu", self.mem_div, f"HBM ({self.mem_unit})"),
109 |         ]
110 |         if "throughput_tok_per_second" in self.df.columns:
111 |             metrics.append(("throughput", "throughput_tok_per_second", 1.0, "Throughput (tok/s)"))
112 | 
113 |         for suffix, col, div, label in metrics:
114 |             if col not in self.df:
115 |                 continue
116 |             y = (self.df[col] / div) if div != 1.0 else self.df[col]
117 |             fig, ax = plt.subplots(figsize=(7, 4), constrained_layout=True)
118 |             sc = self._line_scatter(
119 |                 ax,
120 |                 self.df["seq_len"],
121 |                 y,
122 |                 label,
123 |                 f"{self.model} on {self.gpu}\n{label} vs seq_len",
124 |             )
125 |             plt.colorbar(sc, ax=ax, label=label)
126 |             self._save(fig, f"{suffix}_vs_seq_len")
127 | 
128 |     def _latency_fig(self):
129 |         import matplotlib.pyplot as plt
130 | 
131 |         fig, ax1 = plt.subplots(figsize=(7, 4), constrained_layout=True)
132 |         if "TTFT" in self.df.columns:
133 |             ax1.plot(
134 |                 self.df["seq_len"],
135 |                 self.df["TTFT"],
136 |                 "s-.",
137 |                 linewidth=1.5,
138 |                 label="TTFT (s)",
139 |             )
140 |             ax1.set_ylabel("TTFT (s)")
141 |         ax2 = ax1.twinx()
142 |         if "TTOT" in self.df.columns:
143 |             ax2.plot(
144 |                 self.df["seq_len"],
145 |                 self.df["TTOT"] * 1000.0,
146 |                 "^:",
147 |                 linewidth=1.5,
148 |                 color="tab:red",
149 |                 label="TTOT (ms)",
150 |             )
151 |             ax2.set_ylabel("TTOT (ms)")
152 |         ax1.set_xlabel("Sequence length (tokens)")
153 |         ax1.set_title(f"{self.model} on {self.gpu}\nTTFT & TTOT vs seq_len")
154 |         handles, labels = [], []
155 |         for ax in (ax1, ax2):
156 |             h, l = ax.get_legend_handles_labels()
157 |             handles += h
158 |             labels += l
159 |         ax1.legend(handles, labels, loc="upper left")
160 |         ax1.grid(True, linestyle="--", alpha=0.3)
161 |         self._save(fig, "latency_vs_seq_len")
162 | 
163 |     def _composite_fig(self):
164 |         import matplotlib.pyplot as plt
165 |         from matplotlib.gridspec import GridSpec
166 | 
167 |         mem_norm = self.df["consume_memory_per_gpu"] / self.mem_div
168 |         fig = plt.figure(figsize=(10, 10), constrained_layout=True)
169 |         gs = GridSpec(2, 2, figure=fig)
170 |         axes = {
171 |             "FLOPs": fig.add_subplot(gs[0, 0]),
172 |             "Latency": fig.add_subplot(gs[0, 1]),
173 |             "Memory": fig.add_subplot(gs[1, 0]),
174 |             "Throughput": fig.add_subplot(gs[1, 1]),
175 |         }
176 | 
177 |         self._line_scatter(
178 |             axes["FLOPs"],
179 |             self.df["seq_len"],
180 |             self.df["prefill_flops"] / self.flops_div,
181 |             f"Prefill {self.flops_unit}",
182 |             "FLOPs",
183 |         )
184 | 
185 |         if "TTFT" in self.df.columns:
186 |             axes["Latency"].plot(self.df["seq_len"], self.df["TTFT"], "s-.", label="TTFT (s)")
187 |         if "TTOT" in self.df.columns:
188 |             axes["Latency"].plot(self.df["seq_len"], self.df["TTOT"] * 1000.0, "^:", label="TTOT (ms)")
189 |         axes["Latency"].set_title("Latency"); axes["Latency"].legend()
190 | 
191 |         self._line_scatter(
192 |             axes["Memory"],
193 |             self.df["seq_len"],
194 |             mem_norm,
195 |             f"HBM ({self.mem_unit})",
196 |             "Memory",
197 |         )
198 | 
199 |         if "throughput_tok_per_second" in self.df.columns:
200 |             self._line_scatter(
201 |                 axes["Throughput"],
202 |                 self.df["seq_len"],
203 |                 self.df["throughput_tok_per_second"],
204 |                 "Throughput (tok/s)",
205 |                 "Throughput",
206 |             )
207 | 
208 |         fig.suptitle(f"{self.model} on {self.gpu}\nOverview", fontsize=14)
209 |         self._save(fig, "overview")
210 | 
211 |     def _interactive_html(self):
212 |         try:
213 |             import plotly.graph_objects as go
214 |             from plotly.offline import plot as psave
215 |             from pathlib import Path
216 |             figs = []
217 | 
218 |             # basic metrics
219 |             meta = [
220 |                 ("TTFT (s)", "TTFT", 1.0),
221 |                 ("TTOT (ms)", "TTOT", 1000.0),
222 |                 (f"Prefill {self.flops_unit}", "prefill_flops", self.flops_div),
223 |                 (f"HBM ({self.mem_unit})", "consume_memory_per_gpu", self.mem_div),
224 |                 ("Throughput (tok/s)", "throughput_tok_per_second", 1.0),
225 |             ]
226 |             for name, col, div in meta:
227 |                 if col not in self.df:
228 |                     continue
229 |                 y = self.df[col] * div if div != 1.0 else self.df[col]
230 |                 f = go.Figure(go.Scatter(x=self.df["seq_len"], y=y, mode="lines+markers"))
231 |                 f.update_layout(title=f"{name} vs seq_len", template="seaborn")
232 |                 figs.append((name, f))
233 | 
234 |             html_path = Path(self.out_dir) / f"{self.model}_{self.gpu}_interactive.html"
235 |             with open(html_path, "w") as fhtml:
236 |                 for i, (title, fig) in enumerate(figs):
237 |                     fhtml.write(f"<h2>{title}</h2>")
238 |                     fhtml.write(psave(fig, include_plotlyjs="cdn" if i == 0 else False, output_type="div"))
239 |             if self.show:
240 |                 import webbrowser, os
241 |                 webbrowser.open("file://" + os.path.abspath(html_path))
242 |         except ImportError:
243 |             print("[INFO] plotly missing – skipped interactive output.")
244 | 
245 |     def _print_table(self):
246 |         summary = self.df.copy()
247 |         summary["prefill_flops"] = (summary["prefill_flops"] / self.flops_div).map("{:,.2f}".format)
248 |         summary["consume_memory_per_gpu"] = (summary["consume_memory_per_gpu"] / self.mem_div).map("{:,.2f}".format)
249 |         if "throughput_tok_per_second" in summary.columns:
250 |             summary["throughput_tok_per_second"] = summary["throughput_tok_per_second"].map("{:,.2f}".format)
251 |         print("=" * 80)
252 |         print(summary.to_string(index=False))
253 |         print("=" * 80)


--------------------------------------------------------------------------------
/test_torch_info.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | from typing import Union, TextIO, Optional
  3 | import torch
  4 | from transformers import AutoModel, AutoConfig, PreTrainedModel
  5 | from accelerate import init_empty_weights
  6 | 
  7 | MODEL_ID = "/home/honggao/llm_weights/Qwen3-8B"
  8 | 
  9 | 
 10 | def print_empty_model(model_id):
 11 | 	"""
 12 | 	Accelerate 提供 init_empty_weights 上下文管理器，令所有 Parameter 和 Buffer 
 13 | 	都放在 meta device，尺寸为 0，因此既 不下载权重 也 不占内存。
 14 | 	"""
 15 | 	cfg = AutoConfig.from_pretrained(model_id)   # 只拉配置
 16 | 
 17 | 	with init_empty_weights(): 
 18 | 		model = AutoModel.from_config(cfg)
 19 | 		print(model)
 20 | 	return model
 21 | 
 22 | def print_transformers_model_summary(
 23 |     model: PreTrainedModel,
 24 |     *,
 25 |     use_torchinfo: bool = False,
 26 |     input_size: Optional[tuple] = None,
 27 |     file: Union[str, TextIO, None] = None,
 28 | ) -> None:
 29 |     """
 30 |     打印 Hugging Face Transformers 模型结构 + 权重 shape。
 31 |     
 32 |     Args:
 33 |         model (PreTrainedModel): 已加载好的模型实例。
 34 |         use_torchinfo (bool): 是否调用 torchinfo.summary() 生成额外摘要。
 35 |         input_size (tuple): 当 use_torchinfo=True 时需提供 (seq_len, ) or (bs, seq_len, ...)。
 36 |         file: None  -> 输出到 stdout；
 37 |               str   -> 输出到指定路径文件；
 38 |               TextIO -> 已打开的文件句柄。
 39 |     """
 40 |     import math
 41 | 
 42 |     def _human_readable(num: float, *, base: int = 1000, 
 43 |                         units=("", "K", "M", "G", "T", "P"), suffix=""):
 44 |         """Convert a large number to human‑readable form (e.g. 12.3G)."""
 45 |         if num == 0:
 46 |             return f"0{suffix}"
 47 |         exp = min(int(math.log(num, base)), len(units) - 1)
 48 |         value = num / (base ** exp)
 49 |         return f"{value:.2f}{units[exp]}{suffix}"
 50 | 
 51 |     def _dump(msg: str = ""):
 52 |         if fh:
 53 |             fh.write(msg + "\n")
 54 |         else:
 55 |             print(msg)
 56 | 
 57 |     # 0) 处理输出目标
 58 |     fh = open(file, "w") if isinstance(file, str) else file
 59 | 
 60 |     # 1) 模型 __repr__
 61 |     _dump("=" * 60)
 62 |     _dump("Model architecture (__repr__):")
 63 |     _dump("=" * 60)
 64 |     _dump(str(model))
 65 |     
 66 |     # 2) 权重 shape
 67 |     _dump("\n" + "=" * 60)
 68 |     _dump("Parameter shapes (name -> shape, #elements):")
 69 |     _dump("=" * 60)
 70 | 
 71 |     # Token count estimation for FLOPs (default = 1 token if unknown)
 72 |     tokens = 1
 73 |     if input_size is not None:
 74 |         # Accept (seq_len,), (bs, seq_len) or any shape where last dim is seq_len
 75 |         if len(input_size) == 1:
 76 |             tokens = input_size[0]
 77 |         else:
 78 |             tokens = input_size[0] * input_size[-1]
 79 | 
 80 |     total_params = 0
 81 |     total_flops = 0
 82 |     total_mem_bytes = 0
 83 |     for name, param in model.named_parameters():
 84 |         numel = param.numel()
 85 |         total_params += numel
 86 | 
 87 |         # ---- Estimate per‑parameter FLOPs ----
 88 |         if param.dim() == 2:  # typical (out, in) weight matrix
 89 |             flops = 2 * param.shape[0] * param.shape[1] * tokens
 90 |         elif param.dim() == 1:  # bias / norm weight
 91 |             flops = param.shape[0] * tokens
 92 |         else:
 93 |             flops = numel  # fallback crude estimate
 94 |         total_flops += flops
 95 | 
 96 |         # ---- Memory access cost (parameter bytes only) ----
 97 |         mem_bytes = numel * param.element_size()
 98 |         total_mem_bytes += mem_bytes
 99 | 
100 |         # ---- Pretty print ----
101 |         flops_str = _human_readable(flops, suffix="F")
102 |         mem_str = _human_readable(mem_bytes, base=1024, units=("B","KB","MB","GB","TB","PB"))
103 |         _dump(f"{name:<60} {str(tuple(param.shape)):<20} {numel:,}  |  {flops_str:<8}  |  {mem_str}")
104 | 
105 |     _dump(f"\nTotal parameters: {total_params:,}")
106 |     _dump(f"Estimated forward FLOPs: {_human_readable(total_flops, suffix='F')}")
107 |     _dump(f"Parameter memory: {_human_readable(total_mem_bytes, base=1024, units=('B','KB','MB','GB','TB','PB'))}")
108 | 
109 |     # 3) 可选 torchinfo 摘要
110 |     if use_torchinfo:
111 |         try:
112 |             from torchinfo import summary  # pip install torchinfo
113 |             assert input_size is not None, "`input_size` must be provided when use_torchinfo=True"
114 |             info = summary(
115 |                 model,
116 |                 input_size=input_size,
117 |                 depth=3,
118 |                 col_names=("kernel_size", "output_size", "num_params", "mult_adds"),
119 |                 dtypes=[torch.long],  # 对 NLP 模型输入通常是 int64 token id
120 |             )
121 |             _dump("\n" + "=" * 60)
122 |             _dump("torchinfo summary():")
123 |             _dump("=" * 60)
124 |             _dump(str(info))
125 |         except ImportError:
126 |             _dump("torchinfo 未安装，跳过摘要。pip install torchinfo 获取更丰富视图。")
127 |     
128 |     if isinstance(file, str):  # 自动关闭文件
129 |         fh.close()
130 | 
131 | from torchviz import make_dot  # pip install torchviz graphviz
132 | def save_model_graph(
133 |     model, 
134 |     input_example: torch.Tensor, 
135 |     file_name: str = "model_graph.svg"
136 | ) -> None:
137 |     """
138 |     利用 torchviz 生成前向图；input_example 必须能直接送入 model。
139 |     """
140 |     model.eval()
141 |     y = model(input_example)
142 |     dot = make_dot(y, params=dict(model.named_parameters()))
143 |     dot.format = file_name.split(".")[-1]  # 自动根据后缀决定 svg/png
144 |     dot.render(file_name, cleanup=True)
145 |     print(f"✅ Graph saved to {file_name}")
146 | 
147 | if __name__ == "__main__":
148 |     # model = AutoModel.from_pretrained(MODEL_ID)
149 |     model =  print_empty_model(MODEL_ID)
150 |     input_example = torch.randint(0, 1000, (2, 2048))  # 随机输入
151 |     print_transformers_model_summary(
152 |         model=model,
153 |         use_torchinfo=True,
154 |         input_size=(2, 2048),
155 |         file="qwen3_8b_structure.txt"
156 |     )


--------------------------------------------------------------------------------