├── .gitignore ├── figs └── eye.png ├── frontend ├── public │ ├── eye.ico │ └── favicon.ico ├── src │ ├── main.js │ ├── components │ │ ├── LeftPannel.vue │ │ ├── Header.vue │ │ ├── graphs │ │ │ └── graph_config.js │ │ ├── left_controls │ │ │ └── Config.vue │ │ └── Graph.vue │ ├── utils.js │ └── App.vue ├── jsconfig.json ├── index.html ├── README.md ├── vite.config.js ├── package.json └── .gitignore ├── roofline_model.py ├── utils.py ├── backend_settings.py ├── LICENSE ├── model_params └── DiT.py ├── backend_app.py ├── analyze_cli.py ├── analyze_gen_cli.py ├── configs ├── chatglm3.py ├── DiT.py ├── gpt-j-6B.py ├── opt.py └── Llama.py ├── hardwares └── hardware_params.py ├── README.md ├── get_model_graph.py ├── model_analyzer.py └── examples ├── plot_hardware.ipynb └── plot_memory.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | output 2 | tmp 3 | *.pyc 4 | .vscode -------------------------------------------------------------------------------- /figs/eye.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hahnyuan/LLM-Viewer/HEAD/figs/eye.png -------------------------------------------------------------------------------- /frontend/public/eye.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hahnyuan/LLM-Viewer/HEAD/frontend/public/eye.ico -------------------------------------------------------------------------------- /frontend/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hahnyuan/LLM-Viewer/HEAD/frontend/public/favicon.ico -------------------------------------------------------------------------------- /frontend/src/main.js: -------------------------------------------------------------------------------- 1 | import { createApp } from 'vue' 2 | import App from './App.vue' 3 | 4 | const 
app=createApp(App) 5 | app.mount('#app') -------------------------------------------------------------------------------- /frontend/jsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "paths": { 4 | "@/*": ["./src/*"] 5 | } 6 | }, 7 | "exclude": ["node_modules", "dist"] 8 | } 9 | -------------------------------------------------------------------------------- /frontend/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | LLM-Viewer 8 | 9 | 10 |
11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /frontend/README.md: -------------------------------------------------------------------------------- 1 | # LLMViewer Frontend 2 | 3 | This project is built with Vue 3 and Vite. 4 | 5 | ## Project Setup 6 | 7 | ```sh 8 | npm install 9 | ``` 10 | 11 | ### Compile and Hot-Reload for Development 12 | 13 | ```sh 14 | npm run dev 15 | ``` 16 | or, to preview a production build locally: 17 | ```sh 18 | npm run build 19 | npm run preview 20 | ``` 21 | 22 | ### Compile and Minify for Production 23 | 24 | ```sh 25 | npm run build 26 | ``` 27 | -------------------------------------------------------------------------------- /frontend/vite.config.js: -------------------------------------------------------------------------------- 1 | import { fileURLToPath, URL } from 'node:url' 2 | 3 | import { defineConfig } from 'vite' 4 | import vue from '@vitejs/plugin-vue' 5 | 6 | // https://vitejs.dev/config/ 7 | export default defineConfig({ 8 | plugins: [ 9 | vue(), 10 | ], 11 | resolve: { 12 | alias: { 13 | '@': fileURLToPath(new URL('./src', import.meta.url)) 14 | } 15 | }, 16 | define: { 17 | llm_viewer_frontend_version: JSON.stringify(process.env.npm_package_version) 18 | } 19 | 20 | }) 21 | -------------------------------------------------------------------------------- /frontend/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "llmviewer", 3 | "version": "0.4.0", 4 | "private": true, 5 | "type": "module", 6 | "scripts": { 7 | "dev": "vite", 8 | "build": "vite build", 9 | "preview": "vite preview" 10 | }, 11 | "dependencies": { 12 | "@antv/g6": "^4.8.24", 13 | "@vitejs/plugin-vue": "^5.0.4", 14 | "@vue/cli-service": "^5.0.8", 15 | "axios": "^1.6.7", 16 | "chart.js": "^4.4.2", 17 | "chartjs-plugin-annotation": "^3.0.1", 18 | "numeral": "^2.0.6", 19 | "vite": "^5.1.5", 20 | "vue": "^3.4.21" 21 | }, 22 | "devDependencies": { 23 | "gh-pages": "^6.1.1" 24 | } 25 | } 
-------------------------------------------------------------------------------- /roofline_model.py: -------------------------------------------------------------------------------- 1 | def roofline_analyze(bandwidth, max_OPS, OPs, memory_access): 2 | # bandwidth is bytes/s 3 | # memory_access in byte 4 | # x axis is OPS/byte 5 | # y axis is OPS/s 6 | y_max = max_OPS 7 | memory_access_bytes = memory_access 8 | turning_point = y_max / bandwidth 9 | arithmetic_intensity = OPs / memory_access_bytes 10 | if arithmetic_intensity < turning_point: 11 | bound = "memory" 12 | performance = arithmetic_intensity * bandwidth 13 | else: 14 | bound = "compute" 15 | performance = y_max 16 | if performance==0: 17 | 1==1 18 | pass 19 | return arithmetic_intensity, performance, bound 20 | -------------------------------------------------------------------------------- /frontend/src/components/LeftPannel.vue: -------------------------------------------------------------------------------- 1 | 6 | 7 | 18 | 19 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | 2 | def str_number(num): 3 | if num > 1e14: 4 | return f"{num/1e12:.0f}T" 5 | elif num > 1e12: 6 | return f"{num/1e12:.1f}T" 7 | elif num>1e11: 8 | return f"{num/1e9:.0f}G" 9 | elif num > 1e9: 10 | return f"{num/1e9:.1f}G" 11 | elif num > 1e8: 12 | return f"{num/1e6:.0f}M" 13 | elif num > 1e6: 14 | return f"{num/1e6:.1f}M" 15 | elif num > 1e5: 16 | return f"{num/1e3:.0f}K" 17 | elif num > 1e3: 18 | return f"{num/1e3:.1f}K" 19 | elif num >= 1: 20 | return f"{num:.1f}" 21 | else: 22 | return f"{num:.2f}" 23 | 24 | def str_number_time(num): 25 | if num >= 1: 26 | return f"{num:.1f}" 27 | elif num > 1e-3: 28 | return f"{num*1e3:.1f}m" 29 | elif num > 1e-6: 30 | return f"{num*1e6:.1f}u" 31 | elif num > 1e-9: 32 | return f"{num*1e9:.1f}n" 33 | else: 34 | return f"{num:.0f}" 
-------------------------------------------------------------------------------- /backend_settings.py: -------------------------------------------------------------------------------- 1 | from hardwares.hardware_params import hardware_params 2 | 3 | avaliable_model_ids_sources = { 4 | "meta-llama/Llama-2-7b-hf": {"source": "huggingface"}, 5 | "meta-llama/Llama-2-13b-hf": {"source": "huggingface"}, 6 | "meta-llama/Llama-2-70b-hf": {"source": "huggingface"}, 7 | "EleutherAI/gpt-j-6B":{"source": "huggingface"}, 8 | "THUDM/chatglm3-6b": {"source": "huggingface"}, 9 | "facebook/opt-125m": {"source": "huggingface"}, 10 | "facebook/opt-1.3b": {"source": "huggingface"}, 11 | "facebook/opt-2.7b": {"source": "huggingface"}, 12 | "facebook/opt-6.7b": {"source": "huggingface"}, 13 | "facebook/opt-30b": {"source": "huggingface"}, 14 | "facebook/opt-66b": {"source": "huggingface"}, 15 | # "DiT-XL/2": {"source": "DiT"}, 16 | # "DiT-XL/4": {"source": "DiT"}, 17 | } 18 | avaliable_model_ids = [_ for _ in avaliable_model_ids_sources.keys()] 19 | avaliable_hardwares = [_ for _ in hardware_params.keys()] 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Zhihang Yuan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /frontend/src/utils.js: -------------------------------------------------------------------------------- 1 | export function strNumber(num) { 2 | if (num > 1e14) { 3 | return `${(num / 1e12).toFixed(0)}T`; 4 | } else if (num > 1e12) { 5 | return `${(num / 1e12).toFixed(1)}T`; 6 | } else if (num > 1e11) { 7 | return `${(num / 1e9).toFixed(0)}G`; 8 | } else if (num > 1e9) { 9 | return `${(num / 1e9).toFixed(1)}G`; 10 | } else if (num > 1e8) { 11 | return `${(num / 1e6).toFixed(0)}M`; 12 | } else if (num > 1e6) { 13 | return `${(num / 1e6).toFixed(1)}M`; 14 | } else if (num > 1e5) { 15 | return `${(num / 1e3).toFixed(0)}K`; 16 | } else if (num > 1e3) { 17 | return `${(num / 1e3).toFixed(1)}K`; 18 | } else if (num >= 1) { 19 | return `${num.toFixed(1)}`; 20 | } else { 21 | return `${num.toFixed(2)}`; 22 | } 23 | } 24 | 25 | export function strNumberTime(num) { 26 | if (num >= 1) { 27 | return `${num.toFixed(1)}s`; 28 | } else if (num > 1e-3) { 29 | return `${(num * 1e3).toFixed(1)}ms`; 30 | } else if (num > 1e-6) { 31 | return `${(num * 1e6).toFixed(1)}us`; 32 | } else if (num > 1e-9) { 33 | return `${(num * 1e9).toFixed(1)}ns`; 34 | } else { 35 | return `${num.toFixed(0)}s`; 36 | } 37 | } -------------------------------------------------------------------------------- /model_params/DiT.py: -------------------------------------------------------------------------------- 1 | from easydict 
import EasyDict 2 | 3 | 4 | model_params={ 5 | "DiT-XL/2":EasyDict( 6 | depth=28, hidden_size=1152, patch_size=2, num_heads=16 7 | ), 8 | "DiT-XL/4":EasyDict( 9 | depth=28, hidden_size=1152, patch_size=4, num_heads=16 10 | ), 11 | "DiT-XL/8":EasyDict( 12 | depth=28, hidden_size=1152, patch_size=8, num_heads=16 13 | ), 14 | "DiT-L/2":EasyDict( 15 | depth=24, hidden_size=1024, patch_size=2, num_heads=16 16 | ), 17 | "DiT-L/4":EasyDict( 18 | depth=24, hidden_size=1024, patch_size=4, num_heads=16 19 | ), 20 | "DiT-L/8":EasyDict( 21 | depth=24, hidden_size=1024, patch_size=8, num_heads=16 22 | ), 23 | "DiT-B/2":EasyDict( 24 | depth=12, hidden_size=768, patch_size=2, num_heads=12 25 | ), 26 | "DiT-B/4":EasyDict( 27 | depth=12, hidden_size=768, patch_size=4, num_heads=12 28 | ), 29 | "DiT-B/8":EasyDict( 30 | depth=12, hidden_size=768, patch_size=8, num_heads=12 31 | ), 32 | "DiT-S/2":EasyDict( 33 | depth=12, hidden_size=384, patch_size=2, num_heads=6 34 | ), 35 | "DiT-S/4":EasyDict( 36 | depth=12, hidden_size=384, patch_size=4, num_heads=6 37 | ), 38 | "DiT-S/8":EasyDict( 39 | depth=12, hidden_size=384, patch_size=8, num_heads=6 40 | ) 41 | 42 | } -------------------------------------------------------------------------------- /backend_app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request 2 | from flask import render_template 3 | from flask_cors import CORS 4 | from get_model_graph import get_model_graph 5 | from backend_settings import avaliable_hardwares,avaliable_model_ids 6 | import argparse 7 | 8 | app = Flask(__name__) 9 | cors = CORS(app, resources={r"/*": {"origins": "*"}}) 10 | 11 | 12 | @app.route("/") 13 | def index(): 14 | return "backend server ready." 
15 | 16 | 17 | @app.route("/get_graph", methods=["POST"]) 18 | def get_graph(): 19 | inference_config = request.json["inference_config"] 20 | nodes, edges, total_results, hardware_info = get_model_graph( 21 | request.json["model_id"], 22 | request.json["hardware"], 23 | None, 24 | inference_config, 25 | ) 26 | return { 27 | "nodes": nodes, 28 | "edges": edges, 29 | "total_results": total_results, 30 | "hardware_info": hardware_info, 31 | } 32 | 33 | @app.route("/get_avaliable", methods=["GET"]) 34 | def get_avaliable(): 35 | return { 36 | "avaliable_hardwares": avaliable_hardwares, 37 | "avaliable_model_ids": avaliable_model_ids, 38 | } 39 | 40 | if __name__ == "__main__": 41 | parser=argparse.ArgumentParser() 42 | parser.add_argument("--port", type=int, default=5000) 43 | parser.add_argument("--local", action="store_true") 44 | parser.add_argument("--debug", action="store_true") 45 | args=parser.parse_args() 46 | host="127.0.0.1" if args.local else "0.0.0.0" 47 | app.run(debug=args.debug,host=host,port=args.port) 48 | -------------------------------------------------------------------------------- /analyze_cli.py: -------------------------------------------------------------------------------- 1 | from model_analyzer import ModelAnalyzer 2 | import torch.nn as nn 3 | import numpy as np 4 | import os 5 | import importlib 6 | import argparse 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("model_id", type=str, help="model id") 10 | parser.add_argument( 11 | "hardware", 12 | type=str, 13 | help="name of hardware, for example nvidia_V100 or nvidia_A6000", 14 | ) 15 | parser.add_argument( 16 | "--source", 17 | type=str, 18 | default="huggingface", 19 | help="source of model, if not huggingface, will use local model in model_params.", 20 | ) 21 | parser.add_argument("--config_file", type=str, default=None, help="config file") 22 | parser.add_argument("--batchsize", type=int, default=1, help="batch size") 23 | parser.add_argument("--seqlen", type=int, 
default=1024, help="sequence length") 24 | parser.add_argument("--w_bit", type=int, default=16, help="weight bitwidth") 25 | parser.add_argument( 26 | "--a_bit", type=int, default=16, help="temporary activation bitwidth" 27 | ) 28 | parser.add_argument("--kv_bit", type=int, default=16, help="kv cache bitwidth") 29 | parser.add_argument( 30 | "--use_flashattention", action="store_true", help="use flash attention" 31 | ) 32 | parser.add_argument( 33 | "--tp-size", 34 | type=int, 35 | default=1, 36 | help="the number of devices for tensor parallelism to use" 37 | ) 38 | args = parser.parse_args() 39 | 40 | analyzer = ModelAnalyzer(args.model_id, args.hardware, args.config_file,source=args.source) 41 | results = analyzer.analyze( 42 | batchsize=args.batchsize, 43 | seqlen=args.seqlen, 44 | w_bit=args.w_bit, 45 | a_bit=args.a_bit, 46 | kv_bit=args.kv_bit, 47 | use_flashattention=args.use_flashattention, 48 | tp_size=args.tp_size 49 | ) 50 | analyzer.save_csv() 51 | -------------------------------------------------------------------------------- /analyze_gen_cli.py: -------------------------------------------------------------------------------- 1 | from model_analyzer import ModelAnalyzer 2 | import torch.nn as nn 3 | import numpy as np 4 | import os 5 | import importlib 6 | import argparse 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("model_id", type=str, help="model id") 10 | parser.add_argument( 11 | "hardware", 12 | type=str, 13 | help="name of hardware, for example nvidia_V100 or nvidia_A6000", 14 | ) 15 | parser.add_argument("--config_file", type=str, default=None, help="config file") 16 | parser.add_argument("--batchsize", type=int, default=1, help="batch size") 17 | parser.add_argument("--seqlen", type=int, default=1024, help="sequence length") 18 | parser.add_argument("--promptlen", type=int, default=128, help="prompt sequence length") 19 | parser.add_argument("--w_bit", type=int, default=16, help="weight bitwidth") 20 | 
parser.add_argument("--a_bit", type=int, default=16, help="temporary activation bitwidth") 21 | parser.add_argument("--kv_bit", type=int, default=16, help="kv cache bitwidth") 22 | parser.add_argument("--use_flashattention", action="store_true", help="use flash attention") 23 | parser.add_argument( 24 | "--tp-size", 25 | type=int, 26 | default=1, 27 | help="the number of devices for tensor parallelism to use" 28 | ) 29 | args = parser.parse_args() 30 | 31 | analyzer=ModelAnalyzer(args.model_id,args.hardware,args.config_file) 32 | ret = analyzer.analyze_generate_task( 33 | args.promptlen, 34 | args.seqlen, 35 | args.batchsize, 36 | args.w_bit, 37 | args.a_bit, 38 | args.kv_bit, 39 | args.use_flashattention, 40 | tp_size=args.tp_size 41 | ) 42 | elapse = ret["inference_time"] 43 | prefill_elapse = ret["prefill_time"] 44 | print(f"{args.hardware}: 1st token latency {prefill_elapse:.2f}, total latency {elapse:.2f}, throughput {args.seqlen * args.batchsize / elapse:.2f} Token/sec") 45 | -------------------------------------------------------------------------------- /frontend/src/App.vue: -------------------------------------------------------------------------------- 1 | 40 | 41 | 53 | 54 | 88 | -------------------------------------------------------------------------------- /configs/chatglm3.py: -------------------------------------------------------------------------------- 1 | def get_num_attention_heads(model_params): 2 | return getattr(model_params, "num_attention_heads") 3 | 4 | def get_hidden_size(model_params): 5 | return getattr(model_params, "hidden_size") 6 | 7 | def get_num_key_value_heads(model_params): 8 | if getattr(model_params,"multi_query_attention"): 9 | return getattr(model_params, "multi_query_group_num") 10 | else: 11 | return getattr(model_params, "num_attention_heads") 12 | 13 | def get_num_hidden_layers(model_params): 14 | return getattr(model_params, "num_layers") 15 | 16 | def get_intermediate_size(model_params): 17 | return 
getattr(model_params, "ffn_hidden_size") 18 | 19 | def get_vocab_size(model_params): 20 | return getattr(model_params, "padded_vocab_size") 21 | 22 | def get_norm_layers(model_params): 23 | return ["attn_norm", "mlp_norm"] 24 | 25 | def post_process(model_params,args): 26 | hiddensize=get_hidden_size(model_params) 27 | vocab_size=get_vocab_size(model_params) 28 | layers=[] 29 | for stage in ["prefill", "decode"]: 30 | layers.append({ 31 | 'name': 'lm_head', 32 | 'stage':stage, 33 | 'OPs':args['batchsize']*hiddensize*vocab_size*1, 34 | 'load_weight':hiddensize*vocab_size *args['w_byte'], 35 | 'load_act':hiddensize*args['a_byte'], 36 | 'store_act':vocab_size*args['a_byte'], 37 | }) 38 | return layers 39 | 40 | def get_linear_layers(model_params, tp_size: int): 41 | hidden_size=get_hidden_size(model_params) 42 | intermediate_size=get_intermediate_size(model_params) 43 | key_value_heads=get_num_key_value_heads(model_params) 44 | attention_heads=get_num_attention_heads(model_params) 45 | 46 | if tp_size > 1: 47 | assert hidden_size % tp_size == 0 48 | assert intermediate_size % tp_size == 0 49 | assert key_value_heads % tp_size == 0 50 | 51 | return { 52 | "q_proj": [hidden_size, hidden_size // tp_size], 53 | "k_proj": [hidden_size, hidden_size * key_value_heads // attention_heads // tp_size], 54 | "v_proj": [hidden_size, hidden_size * key_value_heads // attention_heads // tp_size], 55 | "out_proj": [hidden_size // tp_size, hidden_size], 56 | "gate_proj": [hidden_size, intermediate_size // tp_size], 57 | "up_proj": [hidden_size, intermediate_size // tp_size], 58 | "down_proj": [intermediate_size // tp_size, hidden_size] 59 | } 60 | 61 | from configs.Llama import flashattention_transformer_layer_graph,transformer_layer_graph -------------------------------------------------------------------------------- /hardwares/hardware_params.py: -------------------------------------------------------------------------------- 1 | # the OPS = sparse OPS/2 2 | 3 | hardware_params = { 
4 | # NOTE: For GPUs, we use the register file size as the on-chip buffer size 5 | # https://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf 6 | # NOTE: V100 tensor cores do not support INT8, so its INT8 performance is poor 7 | "nvidia_V100": {"bandwidth": 900e9, "FP16": 112e12, "INT8": 62e12, "onchip_buffer": 20480e3}, 8 | # https://images.nvidia.com/aem-dam/en-zz/Solutions/technologies/NVIDIA-ADA-GPU-PROVIZ-Architecture-Whitepaper_1.1.pdf 9 | "nvidia_A6000": {"bandwidth": 768e9, "FP16": 154.8e12, "INT8": 309.7e12, "onchip_buffer": 21504e3}, 10 | # https://images.nvidia.com/aem-dam/en-zz/Solutions/technologies/NVIDIA-ADA-GPU-PROVIZ-Architecture-Whitepaper_1.1.pdf 11 | "nvidia_A6000_Ada": {"bandwidth": 960e9, "FP16": 364.2e12, "INT8": 728.5e12, "onchip_buffer": 36352e3}, 12 | # https://images.nvidia.com/aem-dam/en-zz/Solutions/data-center/nvidia-ampere-architecture-whitepaper.pdf 13 | # Each Ampere SM has a 256 KB register file and up to 164 KB of shared memory 14 | "nvidia_A100": {"bandwidth": 1555e9, "FP16": 312e12, "INT8": 624e12, "onchip_buffer": 27648e3}, # use 40G data 15 | "nvidia_A100_40G": {"bandwidth": 1555e9, "FP16": 312e12, "INT8": 624e12, "onchip_buffer": 27648e3}, 16 | "nvidia_A100_80G": {"bandwidth": 2039e9, "FP16": 312e12, "INT8": 624e12, "onchip_buffer": 27648e3}, 17 | "nvidia_A800_80G_SXM": {"bandwidth": 2039e9, "FP16": 312e12, "INT8": 624e12, "onchip_buffer": 27648e3}, 18 | "nvidia_A40": {"bandwidth": 696e9, "FP16": 149.7e12, "INT8": 299.3e12, "onchip_buffer": 21504e3}, 19 | # https://resources.nvidia.com/en-us-tensor-core/gtc22-whitepaper-hopper 20 | "nvidia_H100": { 21 | "bandwidth": 3072e9, 22 | "FP16": 1979e12 / 2, 23 | "INT8": 3958e12 / 2, 24 | "onchip_buffer": 33792e3, 25 | }, # use SXM data 26 | "nvidia_H100_SXM": {"bandwidth": 3072e9, "FP16": 1979e12 / 2, "INT8": 3958e12 / 2, "onchip_buffer": 33792e3}, 27 | "nvidia_H100_PCIe": {"bandwidth": 2048e9, "FP16": 1513e12 / 2, "INT8": 3026e12 / 2, "onchip_buffer": 29184e3}, 28 | # 
https://images.nvidia.com/aem-dam/Solutions/Data-Center/l4/nvidia-ada-gpu-architecture-whitepaper-v2.1.pdf 29 | # Each Ada SM has a 256 KB register file and 128 KB of L1/shared memory 30 | "nvidia_L40": {"bandwidth": 864e9, "FP16": 181e12, "INT8": 362e12, "onchip_buffer": 36352e3}, 31 | # Intel Skylake-X (Skylake-X, Cascade Lake), Intel Xeon Phi (Knights Landing, Knights Mill), and Intel Ice Lake, Tiger Lake and Rocket Lake 32 | # support AVX-512 & FMA (512-bit), which have a throughput of 1 cycle 33 | # https://www.intel.com/content/www/us/en/products/sku/230496/intel-core-i913900k-processor-36m-cache-up-to-5-80-ghz/specifications.html 34 | "intel_13900k": {"bandwidth": 89.6e9, "FP16": 8 * 5.4e9 * (512 / 16), "onchip_buffer": 36e6}, 35 | } 36 | -------------------------------------------------------------------------------- /configs/DiT.py: -------------------------------------------------------------------------------- 1 | 2 | def get_num_attention_heads(model_params): 3 | return getattr(model_params, "num_heads") 4 | 5 | 6 | def get_hidden_size(model_params): 7 | return getattr(model_params, "hidden_size") 8 | 9 | 10 | def get_num_key_value_heads(model_params): 11 | return getattr(model_params, "num_heads") 12 | 13 | def get_norm_layers(model_params): 14 | return ["attn_norm", "mlp_norm"] 15 | 16 | def get_num_hidden_layers(model_params): 17 | return getattr(model_params, "depth") 18 | 19 | def get_intermediate_size(model_params): 20 | mlp_ratio=getattr(model_params, "mlp_ratio", 4.0) 21 | return getattr(model_params, "hidden_size")*mlp_ratio 22 | 23 | def get_linear_layers(model_params, tp_size: int): 24 | hidden_size=get_hidden_size(model_params) 25 | intermediate_size=get_intermediate_size(model_params) 26 | key_value_heads=get_num_key_value_heads(model_params) 27 | attention_heads=get_num_attention_heads(model_params) 28 | 29 | if tp_size > 1: 30 | assert hidden_size % tp_size == 0 31 | assert intermediate_size % tp_size == 0 32 | assert key_value_heads % tp_size == 0 33 | 
34 | return { 35 | "q_proj": [hidden_size, hidden_size // tp_size], 36 | "k_proj": [hidden_size, hidden_size * key_value_heads // attention_heads // tp_size], 37 | "v_proj": [hidden_size, hidden_size * key_value_heads // attention_heads // tp_size], 38 | "out_proj": [hidden_size // tp_size, hidden_size], 39 | "gate_proj": [hidden_size, intermediate_size // tp_size], 40 | "up_proj": [hidden_size, intermediate_size // tp_size], 41 | "down_proj": [intermediate_size // tp_size, hidden_size], 42 | } 43 | 44 | def post_process(model_params,args): 45 | return [] 46 | 47 | transformer_layer_graph = { 48 | "input": [], 49 | "attn_norm": ["input"], 50 | "q_proj": ["attn_norm"], 51 | "k_proj": ["attn_norm"], 52 | "v_proj": ["attn_norm"], 53 | "qk_matmul": ["q_proj", "k_proj"], 54 | "softmax": ["qk_matmul"], 55 | "sv_matmul": ["softmax", "v_proj"], 56 | "out_proj": ["sv_matmul"], 57 | "attn_add": ["input", "out_proj"], 58 | "mlp_norm": ["attn_add"], 59 | "up_proj": ["mlp_norm"], 60 | "mlp_act": ["up_proj"], 61 | "down_proj": ["mlp_act"], 62 | "mlp_add": ["attn_add", "down_proj"], 63 | "output": ["mlp_add"], 64 | } 65 | 66 | flashattention_transformer_layer_graph = { 67 | "input": [], 68 | "attn_norm": ["input"], 69 | "q_proj": ["attn_norm"], 70 | "k_proj": ["attn_norm"], 71 | "v_proj": ["attn_norm"], 72 | "fused_attention": ["q_proj", "k_proj", "v_proj"], 73 | "out_proj": ["fused_attention"], 74 | "attn_add": ["input", "out_proj"], 75 | "mlp_norm": ["attn_add"], 76 | "up_proj": ["mlp_norm"], 77 | "mlp_act": ["up_proj"], 78 | "down_proj": ["mlp_act"], 79 | "mlp_add": ["attn_add", "down_proj"], 80 | "output": ["mlp_add"], 81 | } 82 | -------------------------------------------------------------------------------- /frontend/.gitignore: -------------------------------------------------------------------------------- 1 | //.gitignore 2 | 3 | .DS_Store 4 | node_modules 5 | /dist 6 | 7 | # local env files 8 | .env.local 9 | .env.*.local 10 | 11 | # Log files 12 | npm-debug.log* 13 | 
yarn-debug.log* 14 | yarn-error.log* 15 | 16 | # Editor directories and files 17 | .idea 18 | .vscode 19 | *.suo 20 | *.ntvs* 21 | *.njsproj 22 | *.sln 23 | *.sw? 24 | 25 | # Logs 26 | logs 27 | *.log 28 | npm-debug.log* 29 | yarn-debug.log* 30 | yarn-error.log* 31 | lerna-debug.log* 32 | .pnpm-debug.log* 33 | 34 | # Diagnostic reports (https://nodejs.org/api/report.html) 35 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 36 | 37 | # Runtime data 38 | pids 39 | *.pid 40 | *.seed 41 | *.pid.lock 42 | 43 | # Directory for instrumented libs generated by jscoverage/JSCover 44 | lib-cov 45 | 46 | # Coverage directory used by tools like istanbul 47 | coverage 48 | *.lcov 49 | 50 | # nyc test coverage 51 | .nyc_output 52 | 53 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 54 | .grunt 55 | 56 | # Bower dependency directory (https://bower.io/) 57 | bower_components 58 | 59 | # node-waf configuration 60 | .lock-wscript 61 | 62 | # Compiled binary addons (https://nodejs.org/api/addons.html) 63 | build/Release 64 | 65 | # Dependency directories 66 | node_modules/ 67 | jspm_packages/ 68 | 69 | # Snowpack dependency directory (https://snowpack.dev/) 70 | web_modules/ 71 | 72 | # TypeScript cache 73 | *.tsbuildinfo 74 | 75 | # Optional npm cache directory 76 | .npm 77 | 78 | # Optional eslint cache 79 | .eslintcache 80 | 81 | # Optional stylelint cache 82 | .stylelintcache 83 | 84 | # Microbundle cache 85 | .rpt2_cache/ 86 | .rts2_cache_cjs/ 87 | .rts2_cache_es/ 88 | .rts2_cache_umd/ 89 | 90 | # Optional REPL history 91 | .node_repl_history 92 | 93 | # Output of 'npm pack' 94 | *.tgz 95 | 96 | # Yarn Integrity file 97 | .yarn-integrity 98 | 99 | # dotenv environment variable files 100 | .env 101 | .env.development.local 102 | .env.test.local 103 | .env.production.local 104 | .env.local 105 | 106 | # parcel-bundler cache (https://parceljs.org/) 107 | .cache 108 | .parcel-cache 109 | 110 | # Next.js build output 111 | .next 112 | out 113 | 114 | 
"""Layer description of GPT-J used by ModelAnalyzer and get_model_graph.

The analyzer calls the ``get_*`` accessors with the Hugging Face config
object, and the graph dictionaries map every node name to the names of the
nodes feeding it.
"""


def get_num_attention_heads(model_params):
    """Return the number of attention heads."""
    return getattr(model_params, "num_attention_heads")


def get_hidden_size(model_params):
    """Return the hidden width, which this config reads from ``n_embd``."""
    return getattr(model_params, "n_embd")


def get_norm_layers(model_params):
    """Return the names of the normalisation nodes of one layer."""
    return ["attn_norm"]


# no group query attention
def get_num_key_value_heads(model_params):
    """Return the number of key/value heads (same as the query heads here)."""
    return getattr(model_params, "num_attention_heads")


def get_num_hidden_layers(model_params):
    """Return the number of transformer layers."""
    return getattr(model_params, "num_hidden_layers")


def get_intermediate_size(model_params):
    """Return the width of the MLP hidden layer.

    Uses ``n_inner`` when the config defines it and falls back to
    ``4 * n_embd`` otherwise, which yields the previously hard-coded 16384
    for GPT-J-6B (``n_embd`` = 4096) while also covering other sizes.
    """
    n_inner = getattr(model_params, "n_inner", None)
    if n_inner is not None:
        return n_inner
    return 4 * get_hidden_size(model_params)


def get_vocab_size(model_params):
    """Return the vocabulary size."""
    return getattr(model_params, "vocab_size")


def post_process(model_params,args):
    """Return the cost entries of the ``lm_head`` layer, one per stage.

    ``args`` must provide ``batchsize``, ``w_byte`` and ``a_byte``.
    """
    hiddensize=get_hidden_size(model_params)
    vocab_size=get_vocab_size(model_params)
    layers=[]
    for stage in ["prefill", "decode"]:
        # NOTE(review): OPs uses a factor of 1 while ModelAnalyzer counts
        # ic * oc * batchsize * 2 for linear layers, and the activation
        # traffic below is not scaled by batchsize -- confirm this is intended.
        layers.append({
            'name': 'lm_head',
            'stage':stage,
            'OPs':args['batchsize']*hiddensize*vocab_size*1,
            'load_weight':hiddensize*vocab_size *args['w_byte'],
            'load_act':hiddensize*args['a_byte'],
            'store_act':vocab_size*args['a_byte'],
        })
    return layers


def get_linear_layers(model_params, tp_size: int):
    """Return ``{layer_name: [in_features, out_features]}`` for one device.

    With ``tp_size`` > 1 the output width of q/k/v/up and the input width of
    out/down are divided by ``tp_size``; the sizes must divide evenly.
    GPT-J has no gated MLP, so there is no ``gate_proj`` entry.
    """
    hidden_size=get_hidden_size(model_params)
    intermediate_size=get_intermediate_size(model_params)
    key_value_heads=get_num_key_value_heads(model_params)
    attention_heads=get_num_attention_heads(model_params)

    if tp_size > 1:
        assert hidden_size % tp_size == 0
        assert intermediate_size % tp_size == 0
        assert key_value_heads % tp_size == 0

    return {
        "q_proj":[hidden_size, hidden_size // tp_size],
        "k_proj":[hidden_size, hidden_size * key_value_heads // attention_heads // tp_size],
        "v_proj":[hidden_size, hidden_size * key_value_heads // attention_heads // tp_size],
        "out_proj":[hidden_size // tp_size, hidden_size],
        "up_proj":[hidden_size, intermediate_size // tp_size],
        "down_proj":[intermediate_size // tp_size, hidden_size],
    }


# name, input_names
transformer_layer_graph={
    "input":[],
    "attn_norm": ["input"],
    "q_proj":["attn_norm"],
    "k_proj":["attn_norm"],
    "v_proj":["attn_norm"],
    "qk_matmul":["q_proj","k_proj"],
    "softmax":["qk_matmul"],
    "sv_matmul":["softmax","v_proj"],
    "out_proj":["sv_matmul"],
    "attn_add":["input","out_proj"],
    "up_proj":["input"],
    "mlp_act":["up_proj"],
    "down_proj":["mlp_act"],
    "mlp_add":["attn_add","down_proj"],
    "output":["mlp_add"]
}

# Same layer as above with qk_matmul/softmax/sv_matmul fused into one node.
# The MLP half must mirror transformer_layer_graph: this model has neither an
# "mlp_norm" nor a "gate_proj" result, and get_model_graph indexes the
# analysis result by every node name listed here.
flashattention_transformer_layer_graph={
    "input":[],
    "attn_norm": ["input"],
    "q_proj":["attn_norm"],
    "k_proj":["attn_norm"],
    "v_proj":["attn_norm"],
    "fused_attention":["q_proj","k_proj","v_proj"],
    "out_proj":["fused_attention"],
    "attn_add":["input","out_proj"],
    "up_proj":["input"],
    "mlp_act":["up_proj"],
    "down_proj":["mlp_act"],
    "mlp_add":["attn_add","down_proj"],
    "output":["mlp_add"]
}
def get_num_attention_heads(model_params):
    """Number of query heads of the model."""
    return model_params.num_attention_heads


def get_hidden_size(model_params):
    """Hidden width of the model."""
    return model_params.hidden_size


def get_num_key_value_heads(model_params):
    """Number of key/value heads (may be smaller than the query heads)."""
    return model_params.num_key_value_heads


def get_norm_layers(model_params):
    """Names of the normalisation nodes of one layer."""
    return ["attn_norm", "mlp_norm"]


def get_num_hidden_layers(model_params):
    """Number of transformer layers."""
    return model_params.num_hidden_layers


def get_intermediate_size(model_params):
    """Width of the MLP hidden layer."""
    return model_params.intermediate_size


def get_vocab_size(model_params):
    """Vocabulary size."""
    return model_params.vocab_size


def post_process(model_params, args):
    """Cost entries of the ``lm_head`` layer, one for prefill and one for decode.

    ``args`` must provide ``batchsize``, ``w_byte`` and ``a_byte``.
    """
    hidden = get_hidden_size(model_params)
    vocab = get_vocab_size(model_params)
    return [
        {
            'name': 'lm_head',
            'stage': stage_name,
            'OPs': args['batchsize'] * hidden * vocab * 1,
            'load_weight': hidden * vocab * args['w_byte'],
            'load_act': hidden * args['a_byte'],
            'store_act': vocab * args['a_byte'],
        }
        for stage_name in ("prefill", "decode")
    ]


def get_linear_layers(model_params, tp_size: int):
    """Map every linear layer name to ``[in_features, out_features]`` per device.

    For ``tp_size`` > 1 the hidden size, the intermediate size and the number
    of key/value heads must all be divisible by ``tp_size``.
    """
    hidden = get_hidden_size(model_params)
    intermediate = get_intermediate_size(model_params)
    kv_heads = get_num_key_value_heads(model_params)
    q_heads = get_num_attention_heads(model_params)

    if tp_size > 1:
        for dimension in (hidden, intermediate, kv_heads):
            assert dimension % tp_size == 0

    # width of the key / value projection output held by one device
    kv_width = hidden * kv_heads // q_heads // tp_size
    hidden_shard = hidden // tp_size
    intermediate_shard = intermediate // tp_size

    layers = {}
    layers["q_proj"] = [hidden, hidden_shard]
    layers["k_proj"] = [hidden, kv_width]
    layers["v_proj"] = [hidden, kv_width]
    layers["out_proj"] = [hidden_shard, hidden]
    layers["gate_proj"] = [hidden, intermediate_shard]
    layers["up_proj"] = [hidden, intermediate_shard]
    layers["down_proj"] = [intermediate_shard, hidden]
    return layers
59 | # name, input_names 60 | transformer_layer_graph={ 61 | "input":[], 62 | "attn_norm": ["input"], 63 | "q_proj":["attn_norm"], 64 | "k_proj":["attn_norm"], 65 | "v_proj":["attn_norm"], 66 | "qk_matmul":["q_proj","k_proj"], 67 | "softmax":["qk_matmul"], 68 | "sv_matmul":["softmax","v_proj"], 69 | "out_proj":["sv_matmul"], 70 | "attn_add":["input","out_proj"], 71 | "mlp_norm":["attn_add"], 72 | "gate_proj":["mlp_norm"], 73 | "up_proj":["mlp_norm"], 74 | "mlp_act":["up_proj","gate_proj"], 75 | "down_proj":["mlp_act"], 76 | "mlp_add":["attn_add","down_proj"], 77 | "output":["mlp_add"] 78 | } 79 | 80 | flashattention_transformer_layer_graph={ 81 | "input":[], 82 | "attn_norm": ["input"], 83 | "q_proj":["attn_norm"], 84 | "k_proj":["attn_norm"], 85 | "v_proj":["attn_norm"], 86 | "fused_attention":["q_proj","k_proj","v_proj"], 87 | "out_proj":["fused_attention"], 88 | "attn_add":["input","out_proj"], 89 | "mlp_norm":["attn_add"], 90 | "gate_proj":["mlp_norm"], 91 | "up_proj":["mlp_norm"], 92 | "mlp_act":["up_proj","gate_proj"], 93 | "down_proj":["mlp_act"], 94 | "mlp_add":["attn_add","down_proj"], 95 | "output":["mlp_add"] 96 | } 97 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LLM-Viewer 2 | 3 | LLM-Viewer 4 | 5 | 6 | LLM-Viewer is a tool for visualizing Large Language Models (LLMs) and analyzing their performance on different hardware platforms. It enables network-wise analysis, considering factors such as peak memory consumption and total inference time cost. With LLM-Viewer, you can gain valuable insights into LLM inference and performance optimization. 7 | You can use LLM-Viewer in a web browser or as a command-line interface (CLI) tool. The web version provides a user-friendly interface for easy configuration and visualization; you can access it at [LLM-Viewer Web](http://llm-viewer.com). 
8 | 9 | We invite you to read our paper [LLM Inference Unveiled: Survey and Roofline Model Insights](https://arxiv.org/pdf/2402.16363.pdf). 10 | In this paper, we provide a comprehensive analysis of the latest advancements in efficient LLM inference using LLM-Viewer. 11 | 12 | This project is under active development. TODO list: 13 | - Show the shapes of tensors. 14 | - Add pre-processing and post-processing for non-transformer layers. 15 | - Show the whole network. 16 | - Expand hardware platform compatibility and allow manual configuration of hardware parameters. 17 | - Support more LLMs and enable manual configuration of model graphs. 18 | 19 | ## Workflow 20 | 21 | ![LLM-Viewer Workflow](figs/workflow.svg) 22 | 23 | As shown in the figure, the workflow consists of the following steps: 24 | 25 | 1. Input the LLM and gather essential information about each layer, including the computation count, input and output tensor shapes, and data dependencies. 26 | 2. Provide input for the hardware and generate a roofline model that takes into account the computation capacity and memory bandwidth of the hardware. 27 | 3. Configure the inference settings, such as the batch size, prompt token length, and generation token length. 28 | 4. Configure the optimization settings, such as the quantization bitwidth, utilization of FlashAttention, decoding methods, and other system optimization techniques. 29 | 5. Use the LLM-Viewer Analyzer to analyze the performance of each layer based on the roofline model and layer information. It also tracks the memory usage of each layer and calculates the peak memory consumption based on data dependencies. The overall network performance of the LLM can be obtained by aggregating the results of all layers. 30 | 6. Generate a report that provides information such as the maximum performance and performance bottlenecks of each layer and the network, as well as the memory footprint. 
The report can be used to analyze curves, such as batch size-performance and sequence length-performance curves, to understand how different settings impact performance. 31 | 7. Access the LLM-Viewer web viewer for convenient visualization of the network architecture and analysis results. This tool facilitates easy configuration adjustment and provides access to various data for each layer. 32 | 33 | ## Web Usage 34 | 35 | To use LLM-Viewer in a web browser, go to the website [LLM-Viewer Web](http://llm-viewer.com). 36 | You can click a node to see the detailed analysis of that layer. 37 | 38 | ## CLI Usage 39 | 40 | Clone the LLM-Viewer repository from GitHub: 41 | ```git clone https://github.com/hahnyuan/LLM-Viewer.git ``` 42 | 43 | Install the requirements: 44 | ```pip install transformers flask flask_cors easydict``` 45 | 46 | To analyze an LLM with LLM-Viewer from the command-line interface (CLI), run one of the following commands: 47 | 48 | ```bash 49 | python3 analyze_cli.py facebook/opt-125m nvidia_A6000 50 | python3 analyze_cli.py meta-llama/Llama-2-7b-hf nvidia_A6000 --batchsize 1 --seqlen 2048 51 | python3 analyze_cli.py meta-llama/Llama-2-13b-hf nvidia_A6000 --batchsize 16 --seqlen 2048 52 | python3 analyze_cli.py meta-llama/Llama-2-13b-hf nvidia_A6000 --batchsize 1 --seqlen 8192 53 | 54 | # DiT models 55 | python3 analyze_cli.py DiT-XL/2 nvidia_A6000 --batchsize 1 --seqlen 256 --source DiT 56 | ``` 57 | 58 | NOTE: The time estimated by the roofline model represents the theoretical performance that the hardware can achieve. 59 | The purpose of creating this tool is to help readers gain a clearer understanding of the key factors that influence LLM inference. 60 | Use the results for relative comparisons between configurations only, not as absolute predictions. 
def get_quant_bit(dtype):
    """Translate a quantisation label into its bit width.

    Accepts "FP16", "INT8", "INT4", or any string containing "bit" whose
    first run of digits is the width (for example "3bit").

    Raises:
        ValueError: if the label is not recognised, including a "bit" label
            that carries no digits (this used to surface as an IndexError).
    """
    if dtype == "FP16":
        return 16
    elif dtype == "INT8":
        return 8
    elif dtype == "INT4":
        return 4
    elif "bit" in dtype:
        digits = re.findall(r"\d+", dtype)
        if not digits:
            raise ValueError(f"Unsupported dtype:{dtype}")
        return int(digits[0])
    else:
        raise ValueError(f"Unsupported dtype:{dtype}")


def get_model_graph(model_id, hardware, config_path, inference_config):
    """Analyse one model/hardware pair and describe a layer as a node graph.

    ``inference_config`` must provide ``w_quant``, ``a_quant``, ``kv_quant``,
    ``seq_length``, ``batch_size``, ``use_flashattention``, ``gen_length``,
    ``tp_size`` and ``stage``. A ``stage`` of "chat" starts from the prefill
    result and adds decode costs sampled over the generated length.

    Returns:
        ``(nodes, edges, total_results, hardware_info)``.
    """
    # Roofline model
    w_bit = get_quant_bit(inference_config["w_quant"])
    a_bit = get_quant_bit(inference_config["a_quant"])
    kv_bit = get_quant_bit(inference_config["kv_quant"])
    seq_length = int(inference_config["seq_length"])
    batch_size = int(inference_config["batch_size"])
    use_flashattention = bool(inference_config["use_flashattention"])
    gen_length = int(inference_config["gen_length"])
    tp_size = int(inference_config["tp_size"])

    analyzer = get_analyer(model_id, hardware, config_path)
    result = analyzer.analyze(
        seqlen=seq_length,
        batchsize=batch_size,
        w_bit=w_bit,
        a_bit=a_bit,
        kv_bit=kv_bit,
        use_flashattention=use_flashattention,
        tp_size=tp_size,
    )
    bandwidth, max_OPS, onchip_buffer = analyzer.get_hardware_info()
    GQA = analyzer.get_model_info()["GQA"]
    hardware_info = {
        "bandwidth": bandwidth,
        "max_OPS": max_OPS,
        "onchip_buffer": onchip_buffer,
    }

    nodes = [
        {
            "label": "input",
            "id": "input",
        }
    ]
    edges = []

    # An immutable default avoids sharing one list object between calls.
    def write_to_node(name, OPs, memory_access, info, input_names=()):
        """Append the node ``name`` and one edge per input to the graph."""
        node = {
            "label": name,
            "id": name,
            "description": f"OPs:{str_number(OPs)}, Access:{str_number(memory_access)}",
            "info": info,
        }
        if GQA and name in ["qk_matmul", "sv_matmul"]:
            node["label"] += "(GQA)"
        nodes.append(node)
        for input_name in input_names:
            edge = {"source": input_name, "target": name}
            edges.append(edge)

    if use_flashattention:
        layer_graph = analyzer.config.flashattention_transformer_layer_graph
    else:
        layer_graph = analyzer.config.transformer_layer_graph
    stage = inference_config["stage"]
    total_results = result["total_results"]
    if stage != "chat":
        result = result[stage]
    else:
        result = result["prefill"]

    for name, input_names in layer_graph.items():
        if name in ["input", "output"]:
            OPs = 0
            memory_access = 0
            info = {}
        else:
            OPs = result[name]["OPs"]
            memory_access = result[name]["memory_access"]
            info = result[name]
        write_to_node(name, OPs, memory_access, info, input_names)
    if stage == "chat":
        # seq_length:seq_length+gen_length
        # Copy the prefill totals so that accumulating the decode costs does
        # not overwrite total_results["prefill"] as well.
        total_results["chat"] = dict(total_results["prefill"])
        n_divide = min(10, gen_length)
        for lengthi in np.linspace(seq_length + 1, seq_length + gen_length, n_divide):
            gen_result = analyzer.analyze(
                seqlen=lengthi,
                batchsize=batch_size,
                w_bit=w_bit,
                a_bit=a_bit,
                kv_bit=kv_bit,
                use_flashattention=use_flashattention,
                # keep the decode samples on the same tensor-parallel setup
                # as the prefill analysis above
                tp_size=tp_size,
            )
            # each sample stands for gen_length / n_divide generated tokens
            for k, v in gen_result["total_results"]["decode"].items():
                total_results["chat"][k] += v * gen_length / n_divide
            for name, input_names in layer_graph.items():
                if name in gen_result["decode"]:
                    result[name]["OPs"] += (
                        gen_result["decode"][name]["OPs"] * gen_length / n_divide
                    )
                    result[name]["memory_access"] += (
                        gen_result["decode"][name]["memory_access"]
                        * gen_length
                        / n_divide
                    )
        # NOTE(review): the loop before this branch already emitted every
        # node, so this pass appends each node and edge a second time with
        # the aggregated numbers -- confirm the frontend relies on that.
        for name, input_names in layer_graph.items():
            if name in ["input", "output"]:
                OPs = 0
                memory_access = 0
                info = {}
            else:
                OPs = result[name]["OPs"]
                memory_access = result[name]["memory_access"]
                info = {}
            write_to_node(name, OPs, memory_access, info, input_names)
    return nodes, edges, total_results, hardware_info
// Icons shown in the title bar of a card node, keyed by `cfg.nodeType`.
const ICON_MAP = {
  normal: 'https://gw.alipayobjects.com/mdn/rms_8fd2eb/afts/img/A*0HC-SawWYUoAAAAAAAAAAABkARQnAQ',
  b: 'https://gw.alipayobjects.com/mdn/rms_8fd2eb/afts/img/A*sxK0RJ1UhNkAAAAAAAAAAABkARQnAQ',
};

/**
 * Custom "card-node": a 150x60 box with a coloured title bar, an icon, a
 * title and one title/value column per entry of `cfg.panels`.
 * The colour is red-orange when `cfg.is_linear` is truthy and green otherwise.
 */
G6.registerNode(
  'card-node',
  {
    drawShape: function drawShape(cfg, group) {
      const color = cfg.is_linear ? '#F4664A' : '#30BF78';
      const r = 2;
      // Every shape needs a `name` that is unique within this node type
      // (required since G6 3.3).
      const shape = group.addShape('rect', {
        attrs: {
          x: 0,
          y: 0,
          width: 150,
          height: 60,
          stroke: color,
          radius: r,
        },
        name: 'main-box',
        draggable: true,
      });

      group.addShape('rect', {
        attrs: {
          x: 0,
          y: 0,
          width: 150,
          height: 21,
          fill: color,
          radius: [r, r, 0, 0],
        },
        name: 'title-box',
        draggable: true,
      });

      // left icon
      group.addShape('image', {
        attrs: {
          x: 4,
          y: 2,
          height: 16,
          width: 16,
          cursor: 'pointer',
          // Fall back to the "normal" icon; the former fallback key 'app'
          // does not exist in ICON_MAP and produced an undefined image.
          img: ICON_MAP[cfg.nodeType] || ICON_MAP.normal,
        },
        name: 'node-icon',
      });

      // title text
      group.addShape('text', {
        attrs: {
          textBaseline: 'top',
          y: 5,
          x: 24,
          lineHeight: 20,
          text: cfg.title,
          fill: '#fff',
        },
        name: 'title',
      });

      // if (cfg.nodeLevel > 0) {
      //   group.addShape('marker', {
      //     attrs: {
      //       x: 184,
      //       y: 30,
      //       r: 6,
      //       cursor: 'pointer',
      //       symbol: cfg.collapse ? G6.Marker.expand : G6.Marker.collapse,
      //       stroke: '#666',
      //       lineWidth: 1,
      //     },
      //     name: 'collapse-icon',
      //   });
      // }

      // The content list; a node without panels simply gets no columns.
      (cfg.panels || []).forEach((item, index) => {
        // name text
        group.addShape('text', {
          attrs: {
            textBaseline: 'top',
            y: 27,
            x: 24 + index * 60,
            lineHeight: 20,
            text: item.title,
            fill: 'rgba(0,0,0, 0.4)',
          },
          name: `index-title-${index}`,
        });

        // value text
        group.addShape('text', {
          attrs: {
            textBaseline: 'top',
            y: 45,
            x: 24 + index * 60,
            lineHeight: 20,
            text: item.value,
            fill: '#595959',
          },
          name: `index-value-${index}`,
        });
      });
      // the main box is returned as the node's key shape
      return shape;
    },
  },
  'single-node',
);


/** Options passed to the G6 graph that renders one transformer layer. */
export const graph_config = {
  container: 'graphContainer', // String | HTMLElement, required: id of the container created in Step 1, or the element itself
  width: window.innerWidth, // Number, required: width of the graph
  height: window.innerHeight, // Number, required: height of the graph
  defaultEdge: {
    // type: 'line',
    type: 'polyline',
    // type: 'quadratic',
    sourceAnchor: 1,
    // the edge enters the target node at its anchorPoint 0
    targetAnchor: 0,
    style: {

      endArrow: {
        path: G6.Arrow.triangle(5, 10), // built-in arrow path helper; arguments are the arrow's width, length and offset (offset defaults to 0 and corresponds to d)
        fill: "#aaaaaa",
        // NOTE(review): opacity values are normally in the range 0-1; confirm whether 0.5 was intended
        opacity: 50,
      },
      stroke: "#000000",
    },
  },
  defaultNode: {
    // ... other properties
    // type: 'card-node',
    type: 'modelRect',
    // type: 'rect',
    // hight: 200,
    size: [190, 60], // default width and height of a node
    anchorPoints: [
      [0.5, 0],
      [0.5, 1]
    ],
    // anchorPoints: [
    //   [0, 0.5],
    //   [1, 0.5],
    // ],
    logoIcon: {
      show: false,
    },
    stateIcon: {
      show: false,
      img:
        'https://gw.alipayobjects.com/zos/basement_prod/c781088a-c635-452a-940c-0173663456d4.svg',
    },

    // style: {
    //   radius: 5,
    //   // fill: '#C6E5FF',
    //   // stroke: '#5B8FF9',
    // },
    labelCfg:{
      offset: 15,
      style: {
        fill: '#000000',
        fontSize: 20,
        stroke: '#E7E7E7',
      }
    },
    descriptionCfg: {
      style: {
        fill: '#656565',
        fontSize: 14,
      },
    },

  },
  // fitView: true,
  // plugins: [minimap], // attach the minimap instance to the graph
  modes: {
    // default: ['drag-canvas', 'zoom-canvas', 'drag-node', 'lasso-select'], // variant that also allows dragging nodes
    default: ['drag-canvas', 'zoom-canvas', 'lasso-select'], // drag the canvas, zoom the canvas, lasso selection
  },
  layout: {
    type: 'dagre',
    // rankdir: 'LR', // The center of the graph by default
    // align: 'UR',
    nodesep: 10,
    ranksep: 20,
    controlPoints: true,
  },
}
file is not found, please specify it manually." 37 | print(f"use config file {config_file} for {model_id}") 38 | if source == "huggingface": 39 | self.model_params = AutoConfig.from_pretrained(model_id, trust_remote_code=True) 40 | else: 41 | if not os.path.exists(f"model_params/{source}.py"): 42 | raise Exception(f"model_params/{source}.py is not found") 43 | # from model_params.DiT import model_params 44 | module = importlib.import_module(f"model_params.{source}") 45 | self.model_params = module.model_params[model_id] 46 | self.config = importlib.import_module(config_file.replace("/", ".").replace(".py", "")) 47 | 48 | # temporary variables 49 | self.results = None 50 | self.w_bit = None 51 | self.a_bit = None 52 | self.kv_bit = None 53 | self.batchsize = None 54 | self.seqlen = None 55 | 56 | def _analyze_to_results( 57 | self, 58 | stage, 59 | name, 60 | OPs=0, 61 | load_weight=0, 62 | load_act=0, 63 | store_act=0, 64 | load_kv_cache=0, 65 | store_kv_cache=0, 66 | ): 67 | 68 | bandwidth, max_OPS, onchip_buffer = self.get_hardware_info() 69 | memory_access = load_weight + load_act + store_act + load_kv_cache + store_kv_cache 70 | arithmetic_intensity, performance, bound = roofline_analyze(bandwidth, max_OPS, OPs, memory_access) 71 | inference_time = OPs / performance 72 | self.results[stage][name] = { 73 | "OPs": OPs, 74 | "memory_access": memory_access, 75 | "arithmetic_intensity": arithmetic_intensity, 76 | "performance": performance, 77 | "bound": bound, 78 | "load_weight": load_weight, 79 | "load_act": load_act, 80 | "store_act": store_act, 81 | "load_kv_cache": load_kv_cache, 82 | "store_kv_cache": store_kv_cache, 83 | "inference_time": inference_time, 84 | } 85 | 86 | def save_csv(self, save_path=None): 87 | if save_path is None: 88 | save_path = f"output/{self.model_id[:self.model_id.rfind('/')]}" 89 | if not os.path.exists(save_path): 90 | os.makedirs(save_path) 91 | save_path += f"{self.model_id[self.model_id.rfind('/'):]}" 92 | 93 | decode_file_name = 
f"{save_path}_decode.csv" 94 | prefill_file_name = f"{save_path}_prefill.csv" 95 | print(f"save to {decode_file_name} and {prefill_file_name}") 96 | 97 | for file_name, stage in [ 98 | (decode_file_name, "decode"), 99 | (prefill_file_name, "prefill"), 100 | ]: 101 | with open(file_name, "a+") as f: 102 | 103 | f.write( 104 | f"\n\n=== {self.model_id} {self.hardware} w_bit={self.w_bit} a_bit={self.a_bit} kv_bit={self.kv_bit} batchsize={self.batchsize} seqlen={self.seqlen} tp_size={self.tp_size} ===\n" 105 | ) 106 | # legend 107 | f.write( 108 | f"layer_name,OPs,Access,arithmetic_intensity,performance,bound,load_weight,load_act,store_act,load_kv_cache,store_kv_cache,inference_time\n" 109 | ) 110 | with open(file_name, "a+") as f: 111 | for layer_name, result in self.results[stage].items(): 112 | f.write( 113 | f"{layer_name},{str_number(result['OPs'])},{str_number(result['memory_access'])}B,{str_number(result['arithmetic_intensity'])},{str_number(result['performance'])}," 114 | f"{result['bound']},{str_number(result['load_weight'])}B,{str_number(result['load_act'])}B,{str_number(result['store_act'])}B,{str_number(result['load_kv_cache'])}B," 115 | f"{str_number(result['store_kv_cache'])}B,{str_number_time(result['inference_time'])}s\n" 116 | ) 117 | 118 | def analyze( 119 | self, 120 | seqlen, 121 | batchsize, 122 | w_bit=16, 123 | a_bit=16, 124 | kv_bit=None, 125 | use_flashattention=False, 126 | kv_token_ratio=1, 127 | tp_size: int = 1 128 | ): 129 | """ 130 | seqlen: sequence length 131 | batchsize: batch size 132 | w_bit: weight bit 133 | a_bit: activation bit 134 | kv_bit: key and value bit. 
if it is None, it will be the same as a_bit 135 | use_flashattention: use flash attention/flash decoding 136 | kv_token_ratio: use this for KV compression 137 | tp_size: the number of devices for tensor parallelism to use 138 | 139 | return is a dict with the following format: 140 | { 141 | "decode": { 142 | "layer_name": { 143 | "OPs": "", 144 | "memory_access": "", 145 | "arithmetic_intensity": "", 146 | "performance": "", 147 | "bound": "", 148 | "load_weight": "", 149 | "load_act": "", 150 | "store_act": "", 151 | "load_kv_cache": "", 152 | "store_kv_cache": "", 153 | "inference_time": "" 154 | } 155 | }, 156 | "prefill": { 157 | "layer_name": { 158 | "OPs": "", 159 | "memory_access": "", 160 | "arithmetic_intensity": "", 161 | "performance": "", 162 | "bound": "", 163 | "load_weight": "", 164 | "load_act": "", 165 | "store_act": "", 166 | "load_kv_cache": "", 167 | "store_kv_cache": "", 168 | "inference_time": "" 169 | } 170 | }, 171 | "total_results": { 172 | "decode": {}, 173 | "prefill": {} 174 | } 175 | } 176 | """ 177 | assert seqlen > 0 178 | assert batchsize > 0 179 | self.results = {"decode": {}, "prefill": {}} 180 | if kv_bit is None: 181 | kv_bit = a_bit 182 | self.w_bit = w_bit 183 | self.a_bit = a_bit 184 | self.kv_bit = kv_bit 185 | self.batchsize = batchsize 186 | self.seqlen = seqlen 187 | self.tp_size = tp_size 188 | 189 | w_byte = self.w_bit / 8 190 | a_byte = self.a_bit / 8 191 | kv_byte = self.kv_bit / 8 192 | 193 | config = self.config 194 | model_params = self.model_params 195 | num_attention_heads = config.get_num_attention_heads(model_params) 196 | hidden_size = config.get_hidden_size(model_params) 197 | num_key_value_heads = config.get_num_key_value_heads(model_params) 198 | num_hidden_layers = config.get_num_hidden_layers(model_params) 199 | 200 | for name, (ic, oc) in config.get_linear_layers(model_params, tp_size).items(): 201 | # for linear layers 202 | is_kv_proj = name in ["k_proj", "v_proj"] 203 | is_normal_proj = not is_kv_proj 
204 | self._analyze_to_results( 205 | "decode", 206 | name, 207 | OPs=ic * oc * batchsize * 2, 208 | load_weight=ic * oc * w_byte, 209 | load_act=ic * batchsize * a_byte, 210 | store_act=0 if is_kv_proj else oc * batchsize * a_byte, 211 | load_kv_cache=0, 212 | store_kv_cache=(0 if is_normal_proj else oc * batchsize * kv_byte), 213 | ) 214 | # for prefill 215 | self._analyze_to_results( 216 | "prefill", 217 | name, 218 | OPs=ic * oc * batchsize * seqlen * 2, 219 | load_weight=ic * oc * w_byte, 220 | load_act=ic * batchsize * seqlen * a_byte, 221 | store_act=(0 if is_kv_proj else oc * batchsize * seqlen * a_byte), 222 | load_kv_cache=0, 223 | store_kv_cache=(0 if is_normal_proj else oc * batchsize * seqlen * kv_byte), 224 | ) 225 | 226 | # for attention 227 | head_size = hidden_size // num_attention_heads 228 | # for decode 229 | qk_matmul_OPs = seqlen * head_size * num_attention_heads * batchsize * 2 230 | sv_matmul_OPs = 1 * head_size * seqlen * num_attention_heads * batchsize * 2 231 | # the softmax operation takes five steps: 232 | # max_x=max(x) 233 | # x=x-max_x 234 | # x_exp=exp(x) 235 | # sum_x_exp=sum(x_exp) 236 | # y=x_exp/sum(x_exp) 237 | softmax_OPs = batchsize * num_attention_heads * seqlen * 1 * 5 238 | if use_flashattention: 239 | name = f"fused_attention" 240 | bandwidth, max_OPS, onchip_buffer = self.get_hardware_info() 241 | # flashattention-2 https://arxiv.org/pdf/2307.08691.pdf 242 | block_size_r = min(math.ceil(onchip_buffer / (kv_byte * head_size)), head_size) 243 | n_blocks_r = math.ceil(1 / block_size_r) 244 | q_numel = (1) * head_size * batchsize * num_attention_heads * a_byte 245 | o_numel = 1 * seqlen * batchsize * num_attention_heads * a_byte 246 | self._analyze_to_results( 247 | "decode", 248 | name, 249 | OPs=qk_matmul_OPs + sv_matmul_OPs + softmax_OPs, 250 | load_weight=0, 251 | load_act=q_numel, 252 | store_act=o_numel * 2, # initialize O and save O 253 | load_kv_cache=n_blocks_r * (seqlen) * head_size * batchsize * 
num_key_value_heads * kv_byte * 2, 254 | store_kv_cache=0, 255 | ) 256 | 257 | else: 258 | name = f"qk_matmul" 259 | self._analyze_to_results( 260 | "decode", 261 | name, 262 | OPs=qk_matmul_OPs, 263 | load_weight=0, 264 | load_act=(1) * head_size * batchsize * num_attention_heads * a_byte, 265 | store_act=1 * seqlen * batchsize * num_attention_heads * a_byte, 266 | load_kv_cache=(seqlen) * head_size * batchsize * num_key_value_heads * kv_byte, 267 | store_kv_cache=0, 268 | ) 269 | name = f"sv_matmul" 270 | self._analyze_to_results( 271 | "decode", 272 | name, 273 | OPs=sv_matmul_OPs, 274 | load_weight=0, 275 | load_act=(1 * seqlen * batchsize * num_attention_heads) * a_byte, 276 | store_act=1 * head_size * batchsize * num_attention_heads * a_byte, 277 | load_kv_cache=(seqlen * head_size * batchsize * num_key_value_heads) * kv_byte, 278 | store_kv_cache=0, 279 | ) 280 | 281 | name = f"softmax" 282 | # max sub exp sum div 283 | self._analyze_to_results( 284 | "decode", 285 | name, 286 | OPs=softmax_OPs, 287 | load_weight=0, 288 | load_act=batchsize * num_attention_heads * seqlen * 1 * a_byte, 289 | store_act=batchsize * num_attention_heads * seqlen * 1 * a_byte, 290 | load_kv_cache=0, 291 | store_kv_cache=0, 292 | ) 293 | 294 | for name in config.get_norm_layers(model_params): 295 | # sum sub pow sum div mul add 296 | self._analyze_to_results( 297 | "decode", 298 | name, 299 | OPs=batchsize * hidden_size * 1 * 7, 300 | load_weight=0, 301 | load_act=batchsize * hidden_size * 1 * a_byte, 302 | store_act=batchsize * hidden_size * 1 * a_byte, 303 | load_kv_cache=0, 304 | store_kv_cache=0, 305 | ) 306 | 307 | for name in ["attn_add", "mlp_add"]: 308 | self._analyze_to_results( 309 | "decode", 310 | name, 311 | OPs=batchsize * hidden_size * 1, 312 | load_weight=0, 313 | load_act=batchsize * hidden_size * 1 * a_byte, 314 | store_act=batchsize * hidden_size * 1 * a_byte, 315 | load_kv_cache=0, 316 | store_kv_cache=0, 317 | ) 318 | for name in ["mlp_act"]: 319 | 
self._analyze_to_results( 320 | "decode", 321 | name, 322 | OPs=batchsize * hidden_size * 1 * 2, 323 | load_weight=0, 324 | load_act=batchsize * hidden_size * 1 * a_byte * 2, 325 | store_act=batchsize * hidden_size * 1 * a_byte, 326 | load_kv_cache=0, 327 | store_kv_cache=0, 328 | ) 329 | 330 | # for prefill 331 | qk_matmul_OPs = seqlen * seqlen * head_size * num_attention_heads * batchsize * 2 332 | sv_matmul_OPs = seqlen * head_size * seqlen * num_attention_heads * batchsize * 2 333 | softmax_OPs = batchsize * num_attention_heads * seqlen * seqlen * 5 334 | if use_flashattention: 335 | name = f"fused_attention" 336 | bandwidth, max_OPS, onchip_buffer = self.get_hardware_info() 337 | # flashattention-2 https://arxiv.org/pdf/2307.08691.pdf 338 | block_size_r = min(math.ceil(onchip_buffer / (kv_byte * head_size)), head_size) 339 | n_blocks_r = math.ceil(seqlen / block_size_r) 340 | q_numel = seqlen * head_size * batchsize * num_attention_heads * a_byte 341 | o_numel = seqlen * seqlen * batchsize * num_attention_heads * a_byte 342 | self._analyze_to_results( 343 | "prefill", 344 | name, 345 | OPs=qk_matmul_OPs + sv_matmul_OPs + softmax_OPs, 346 | load_weight=0, 347 | load_act=q_numel, 348 | store_act=o_numel * 2, # initialize O and save O 349 | load_kv_cache=n_blocks_r * (seqlen) * head_size * batchsize * num_key_value_heads * kv_byte * 2, 350 | store_kv_cache=0, 351 | ) 352 | else: 353 | name = f"qk_matmul" 354 | self._analyze_to_results( 355 | "prefill", 356 | name, 357 | OPs=qk_matmul_OPs, 358 | load_weight=0, 359 | load_act=seqlen * head_size * batchsize * num_key_value_heads * a_byte, 360 | store_act=seqlen * seqlen * batchsize * num_attention_heads * a_byte, 361 | load_kv_cache=seqlen * head_size * batchsize * num_key_value_heads * kv_byte, 362 | store_kv_cache=0, 363 | ) 364 | name = f"sv_matmul" 365 | self._analyze_to_results( 366 | "prefill", 367 | name, 368 | OPs=sv_matmul_OPs, 369 | load_weight=0, 370 | load_act=seqlen * seqlen * batchsize * 
num_attention_heads * a_byte, 371 | store_act=seqlen * head_size * batchsize * num_attention_heads * a_byte, 372 | load_kv_cache=seqlen * head_size * batchsize * num_key_value_heads * kv_byte, 373 | store_kv_cache=0, 374 | ) 375 | name = f"softmax" 376 | self._analyze_to_results( 377 | "prefill", 378 | name, 379 | OPs=softmax_OPs, 380 | load_weight=0, 381 | load_act=batchsize * num_attention_heads * seqlen * seqlen * a_byte, 382 | store_act=batchsize * num_attention_heads * seqlen * seqlen * a_byte, 383 | load_kv_cache=0, 384 | store_kv_cache=0, 385 | ) 386 | for name in config.get_norm_layers(model_params): 387 | self._analyze_to_results( 388 | "prefill", 389 | name, 390 | OPs=batchsize * hidden_size * seqlen * 7, 391 | load_weight=0, 392 | load_act=batchsize * hidden_size * seqlen * a_byte, 393 | store_act=batchsize * hidden_size * seqlen * a_byte, 394 | load_kv_cache=0, 395 | store_kv_cache=0, 396 | ) 397 | for name in ["attn_add", "mlp_add"]: 398 | self._analyze_to_results( 399 | "prefill", 400 | name, 401 | OPs=batchsize * hidden_size * seqlen * 1, 402 | load_weight=0, 403 | load_act=batchsize * hidden_size * seqlen * a_byte, 404 | store_act=batchsize * hidden_size * seqlen * a_byte, 405 | load_kv_cache=0, 406 | store_kv_cache=0, 407 | ) 408 | for name in ["mlp_act"]: 409 | self._analyze_to_results( 410 | "prefill", 411 | name, 412 | OPs=batchsize * hidden_size * seqlen * 1 * 2, 413 | load_weight=0, 414 | load_act=batchsize * hidden_size * seqlen * a_byte * 2, 415 | store_act=batchsize * hidden_size * seqlen * a_byte, 416 | load_kv_cache=0, 417 | store_kv_cache=0, 418 | ) 419 | 420 | # compute total 421 | total_results = {"decode": {}, "prefill": {}} 422 | for data_name in ALL_DATA_NAMES: 423 | total_results["decode"][data_name] = 0 424 | total_results["prefill"][data_name] = 0 425 | for stage in ["decode", "prefill"]: 426 | for layer_name, result in self.results[stage].items(): 427 | for data_name in ALL_DATA_NAMES: 428 | total_results[stage][data_name] += 
result[data_name] * num_hidden_layers 429 | 430 | # memory footprint 431 | weight_kv_footprint = total_results["prefill"]["load_weight"] + total_results["prefill"]["store_kv_cache"] 432 | decode_tmp_act = 0 433 | for layer_name, result in self.results["decode"].items(): 434 | decode_tmp_act += result["store_act"] 435 | total_results["decode"]["memory_consumption"] = decode_tmp_act + weight_kv_footprint 436 | total_results["decode"]["memory_consumption_tmp_act"] = decode_tmp_act 437 | total_results["decode"]["memory_consumption_weight"] = total_results["prefill"]["load_weight"] 438 | total_results["decode"]["memory_consumption_kv_cache"] = total_results["prefill"]["store_kv_cache"] 439 | prefill_tmp_act = 0 440 | for layer_name, result in self.results["prefill"].items(): 441 | prefill_tmp_act += result["store_act"] 442 | total_results["prefill"]["memory_consumption"] = prefill_tmp_act + weight_kv_footprint 443 | total_results["prefill"]["memory_consumption_tmp_act"] = prefill_tmp_act 444 | total_results["prefill"]["memory_consumption_weight"] = total_results["prefill"]["load_weight"] 445 | total_results["prefill"]["memory_consumption_kv_cache"] = total_results["prefill"]["store_kv_cache"] 446 | 447 | # lm_head 448 | name = "lm_head" 449 | args = {"batchsize": batchsize, "a_byte": a_byte, "w_byte": w_byte} 450 | for layer_info in self.config.post_process(self.model_params, args): 451 | self._analyze_to_results(**layer_info) 452 | for data_name in ALL_DATA_NAMES: 453 | total_results[layer_info["stage"]][data_name] += self.results[layer_info["stage"]][layer_info["name"]][ 454 | data_name 455 | ] 456 | # for stage in ["prefill", "decode"]: 457 | # self._analyze_to_results( 458 | # stage, 459 | # name, 460 | # OPs=batchsize * hidden_size * vocab_size * 1, 461 | # load_weight=hidden_size * vocab_size, 462 | # load_act=hidden_size * a_byte, 463 | # store_act=vocab_size * a_byte, 464 | # load_kv_cache=0, 465 | # store_kv_cache=0, 466 | # ) 467 | # for data_name in 
def analyze_generate_task(
    self,
    prompt_len,
    gen_len,
    batchsize,
    w_bit=16,
    a_bit=16,
    kv_bit=None,
    use_flashattention=False,
    tp_size: int = 1,
):
    """Accumulate the inference time of one prefill pass plus `gen_len` decode steps.

    `self.analyze` is run once with `prompt_len` as the sequence length and its
    total prefill "inference_time" is taken. It is then run once for every
    sequence length in `range(prompt_len, prompt_len + gen_len)`, and the total
    decode "inference_time" of each run is added on top.

    Args:
        prompt_len: sequence length used for the prefill pass and as the first
            decode sequence length.
        gen_len: number of decode steps to accumulate.
        batchsize, w_bit, a_bit, kv_bit: forwarded positionally to `self.analyze`.
        use_flashattention, tp_size: forwarded by keyword to `self.analyze`.

    Returns:
        A dict with "inference_time" (prefill plus all decode steps) and
        "prefill_time" (prefill only).
    """
    # Keyword options are identical for every call to self.analyze.
    shared_options = {"use_flashattention": use_flashattention, "tp_size": tp_size}

    def run_stage_time(seqlen, stage):
        # One full analysis at the given sequence length; only the stage total is kept.
        report = self.analyze(seqlen, batchsize, w_bit, a_bit, kv_bit, **shared_options)
        return report["total_results"][stage]["inference_time"]

    prefill_time = run_stage_time(prompt_len, "prefill")
    inference_time = prefill_time
    # Explicit left-to-right accumulation, one decode step per sequence length.
    for current_len in range(prompt_len, prompt_len + gen_len):
        inference_time += run_stage_time(current_len, "decode")

    return {"inference_time": inference_time, "prefill_time": prefill_time}


def get_hardware_info(self):
    """Return `(bandwidth, max_OPS, onchip_buffer)` for `self.hardware`.

    The values are read from the module-level `hardware_params` table. The
    "INT8" entry is used as `max_OPS` when the weight, activation and KV-cache
    bit widths are all at most 8; otherwise the "FP16" entry is used.
    """
    spec = hardware_params[self.hardware]
    bandwidth = spec["bandwidth"]
    # Lazy check: stops at the first bit width above 8, like a chained `and`.
    all_low_bit = all(getattr(self, attr) <= 8 for attr in ("w_bit", "a_bit", "kv_bit"))
    max_OPS = spec["INT8" if all_low_bit else "FP16"]
    onchip_buffer = spec["onchip_buffer"]
    return bandwidth, max_OPS, onchip_buffer


def get_model_info(self):
    """Return a dict of model properties; currently only the "GQA" flag.

    "GQA" (group query attention) is True when the number of attention heads
    reported by the config differs from the number of key/value heads.
    """
    params = self.model_params
    num_query_heads = self.config.get_num_attention_heads(params)
    num_kv_heads = self.config.get_num_key_value_heads(params)
    return {"GQA": bool(num_query_heads != num_kv_heads)}
1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import sys\n", 10 | "sys.path.append('..')\n", 11 | "import numpy as np\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "from hardwares.hardware_params import hardware_params\n", 14 | "from model_analyzer import ModelAnalyzer\n", 15 | "%load_ext autoreload\n", 16 | "%autoreload 2" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 30, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "use config file configs/Llama.py for meta-llama/Llama-2-13b-hf\n", 29 | "use config file configs/Llama.py for meta-llama/Llama-2-13b-hf\n", 30 | "use config file configs/Llama.py for meta-llama/Llama-2-13b-hf\n", 31 | "use config file configs/Llama.py for meta-llama/Llama-2-13b-hf\n", 32 | "use config file configs/Llama.py for meta-llama/Llama-2-13b-hf\n", 33 | "use config file configs/Llama.py for meta-llama/Llama-2-13b-hf\n", 34 | "use config file configs/Llama.py for meta-llama/Llama-2-13b-hf\n", 35 | "use config file configs/Llama.py for meta-llama/Llama-2-13b-hf\n", 36 | "use config file configs/Llama.py for meta-llama/Llama-2-13b-hf\n", 37 | "use config file configs/Llama.py for meta-llama/Llama-2-13b-hf\n", 38 | "use config file configs/Llama.py for meta-llama/Llama-2-13b-hf\n", 39 | "use config file configs/Llama.py for meta-llama/Llama-2-13b-hf\n", 40 | "use config file configs/Llama.py for meta-llama/Llama-2-13b-hf\n", 41 | "use config file configs/Llama.py for meta-llama/Llama-2-13b-hf\n", 42 | "use config file configs/Llama.py for meta-llama/Llama-2-13b-hf\n", 43 | "use config file configs/Llama.py for meta-llama/Llama-2-13b-hf\n", 44 | "use config file configs/Llama.py for meta-llama/Llama-2-13b-hf\n", 45 | "use config file configs/Llama.py for meta-llama/Llama-2-13b-hf\n", 46 | "use config file configs/Llama.py for meta-llama/Llama-2-13b-hf\n", 47 | "use config file configs/Llama.py for meta-llama/Llama-2-13b-hf\n" 48 
| ] 49 | } 50 | ], 51 | "source": [ 52 | "import os\n", 53 | "os.environ[\"https_proxy\"]=\"127.0.0.1:7890\"\n", 54 | "\n", 55 | "model_id=\"meta-llama/Llama-2-13b-hf\"\n", 56 | "bandwidths=[]\n", 57 | "performances=[]\n", 58 | "for bandwidth_scale in np.linspace(1,20,20):\n", 59 | " hardware_name=f\"h{bandwidth_scale}\"\n", 60 | " bandwidth=200e9*bandwidth_scale\n", 61 | " hardware_params[hardware_name] = {\"bandwidth\": bandwidth, \"FP16\": 200e12, \"onchip_buffer\": 10240e3}\n", 62 | " analyzer=ModelAnalyzer(model_id,hardware_name)\n", 63 | " result=analyzer.analyze(1024,1)\n", 64 | " \n", 65 | " bandwidths.append(bandwidth)\n", 66 | " performances.append(result[\"total_results\"][\"decode\"][\"inference_time\"])\n", 67 | "throughput=1/np.array(performances)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 43, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "data": { 77 | "image/png": "", 78 | "text/plain": [ 79 | "
" 80 | ] 81 | }, 82 | "metadata": {}, 83 | "output_type": "display_data" 84 | } 85 | ], 86 | "source": [ 87 | "hardware=\"nvidia_A6000\"\n", 88 | "bandwidth = hardware_params[hardware][\"bandwidth\"]\n", 89 | "max_OPS = hardware_params[hardware][\"FP16\"]\n", 90 | "\n", 91 | "fig=plt.figure(figsize=(6, 2.5))\n", 92 | "plt.subplot(1, 2, 1)\n", 93 | "y_max = max_OPS\n", 94 | "turning_point = y_max / bandwidth\n", 95 | "\n", 96 | "plt.plot(\n", 97 | " [0, turning_point/3, turning_point * 3], [0, y_max, y_max], color=\"black\"\n", 98 | ")\n", 99 | "plt.plot(\n", 100 | " [0, turning_point, turning_point * 3], [0, y_max, y_max], color=\"black\"\n", 101 | ")\n", 102 | "plt.xlabel(\"Arithmetic Intensity (OPs/byte)\")\n", 103 | "plt.ylabel(\"Performance (OPS)\")\n", 104 | "\n", 105 | "plt.ylim(0, y_max * 1.1)\n", 106 | "plt.xlim(0, turning_point * 1.2)\n", 107 | "\n", 108 | "plt.vlines(50, 0, max_OPS*2, color=\"black\", linestyle=\"--\")\n", 109 | "plt.annotate(\n", 110 | " \"low\\nbandwidth\",\n", 111 | " xy=(50, max_OPS*(50/(max_OPS / bandwidth))),\n", 112 | " xytext=(75, max_OPS*(50/(max_OPS / bandwidth))*0.7),\n", 113 | " arrowprops=dict(arrowstyle=\"->\", color=\"red\"),\n", 114 | ")\n", 115 | "\n", 116 | "# plt.vlines(180, 0, max_OPS*2, color=\"black\", linestyle=\"--\")\n", 117 | "plt.annotate(\n", 118 | " \"high\\nbandwidth\",\n", 119 | " xy=(50, max_OPS*(50*3/(max_OPS / bandwidth))),\n", 120 | " xytext=(75, max_OPS*(50*3/(max_OPS / bandwidth))*1.05),\n", 121 | " arrowprops=dict(arrowstyle=\"->\", color=\"red\"),\n", 122 | ")\n", 123 | "plt.title(\"Roofline Model\")\n", 124 | "\n", 125 | "plt.subplot(1, 2, 2)\n", 126 | "plt.plot(\n", 127 | " bandwidths, performances\n", 128 | ")\n", 129 | "plt.xlabel(\"Bandwidth (byte/s)\")\n", 130 | "plt.ylabel(\"Decode Time (s)\")\n", 131 | "\n", 132 | "plt.title(\"LLaMA-2-13b\")\n", 133 | "\n", 134 | "plt.tight_layout()\n", 135 | "# save pdf\n", 136 | "plt.savefig(f\"../output/hardware_bandwidth.pdf\", bbox_inches=\"tight\")" 137 
| ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [] 145 | } 146 | ], 147 | "metadata": { 148 | "kernelspec": { 149 | "display_name": "houmo_llm", 150 | "language": "python", 151 | "name": "python3" 152 | }, 153 | "language_info": { 154 | "codemirror_mode": { 155 | "name": "ipython", 156 | "version": 3 157 | }, 158 | "file_extension": ".py", 159 | "mimetype": "text/x-python", 160 | "name": "python", 161 | "nbconvert_exporter": "python", 162 | "pygments_lexer": "ipython3", 163 | "version": "3.10.13" 164 | } 165 | }, 166 | "nbformat": 4, 167 | "nbformat_minor": 2 168 | } 169 | -------------------------------------------------------------------------------- /examples/plot_memory.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import sys\n", 10 | "sys.path.append('..')\n", 11 | "import numpy as np\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "from hardwares.hardware_params import hardware_params\n", 14 | "from model_analyzer import ModelAnalyzer" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "use config file configs/Llama.py for meta-llama/Llama-2-13b-hf\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "model_id=\"meta-llama/Llama-2-13b-hf\"\n", 32 | "hardware=\"nvidia_A6000\"\n", 33 | "analyzer=ModelAnalyzer(model_id,hardware)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "image/png": "", 44 | "text/plain": [ 45 | "
" 46 | ] 47 | }, 48 | "metadata": {}, 49 | "output_type": "display_data" 50 | } 51 | ], 52 | "source": [ 53 | "# Quantization\n", 54 | "\n", 55 | "fig=plt.figure(figsize=(5, 5))\n", 56 | "bar_width = 0.7\n", 57 | "\n", 58 | "for step in ['decode','prefill']:\n", 59 | " plt.subplot(2, 1, 1 if step=='decode' else 2)\n", 60 | " weights=[]\n", 61 | " kvs=[]\n", 62 | " tmp_acts=[]\n", 63 | " categories=[]\n", 64 | " annotation_xs=[]\n", 65 | " annotation_texts=[]\n", 66 | " xs=[]\n", 67 | " x_st=0\n", 68 | " for batchsize,seqlen in [(1,1024),(1,4096),(16,1024)]:\n", 69 | " FP16_sum=0\n", 70 | " annotation_xs.append(x_st)\n", 71 | " annotation_texts.append(f\"{step}\\nbatchsize={batchsize}\\n seqlen={seqlen}\")\n", 72 | " for w,a,kv,quantization in [(16,16,16,\"FP16\"),(4,16,16,\"W4\"),(4,16,4,\"W4KV4\"),(4,4,4,\"W4A4\")]:\n", 73 | " # print(f\"batchsize={batchsize}, seqlen={seqlen}, w={w}, a={a}, kv={kv}\")\n", 74 | " result=analyzer.analyze(seqlen,batchsize,w,a,kv)\n", 75 | " weights.append(result[\"total_results\"][step][\"memory_consumption_weight\"])\n", 76 | " kvs.append(result[\"total_results\"][step][\"memory_consumption_kv_cache\"])\n", 77 | " tmp_acts.append(result[\"total_results\"][step][\"memory_consumption_tmp_act\"])\n", 78 | " xs.append(x_st)\n", 79 | " x_st+=1\n", 80 | " categories.append(f\"{quantization}\")\n", 81 | " if quantization==\"FP16\":\n", 82 | " FP16_sum=weights[-1]+kvs[-1]+tmp_acts[-1]\n", 83 | " weights[-1]/=FP16_sum\n", 84 | " kvs[-1]/=FP16_sum\n", 85 | " tmp_acts[-1]/=FP16_sum\n", 86 | " x_st+=1\n", 87 | " plt.bar(xs, weights, bar_width, label='Weight')\n", 88 | " plt.bar(xs, kvs, bar_width, bottom=weights, label='KV cache')\n", 89 | " plt.bar(xs, tmp_acts, bar_width, bottom=np.array(kvs) + np.array(weights), label='Tmp Act')\n", 90 | "\n", 91 | " for ann_x,ann_text in zip(annotation_xs,annotation_texts):\n", 92 | " plt.annotate(ann_text, (ann_x+2.2, 0.75), ha='center')\n", 93 | " plt.ylabel('Relative Memory Consumption', fontsize=9)\n", 
94 | " plt.xticks(xs, categories, rotation=90)\n", 95 | " plt.legend()\n", 96 | " plt.tight_layout()\n", 97 | "plt.savefig(\"../output/quantization_memory_consumption.pdf\",bbox_inches='tight')" 98 | ] 99 | } 100 | ], 101 | "metadata": { 102 | "kernelspec": { 103 | "display_name": "houmo_llm", 104 | "language": "python", 105 | "name": "python3" 106 | }, 107 | "language_info": { 108 | "codemirror_mode": { 109 | "name": "ipython", 110 | "version": 3 111 | }, 112 | "file_extension": ".py", 113 | "mimetype": "text/x-python", 114 | "name": "python", 115 | "nbconvert_exporter": "python", 116 | "pygments_lexer": "ipython3", 117 | "version": "3.10.13" 118 | } 119 | }, 120 | "nbformat": 4, 121 | "nbformat_minor": 2 122 | } 123 | --------------------------------------------------------------------------------