├── LICENSE
├── README.md
├── deepseek3mfu.html
├── deepseek_v3_pretrain_mfu.py
├── images
│   ├── deepseek_architecture.png
│   ├── gpt_architecture.png
│   ├── llama_architecture.png
│   ├── llama_mlp.png
│   ├── lm_head_weights.png
│   ├── mla_architecture.png
│   ├── mla_formulas.png
│   ├── mtp_architecture.png
│   └── transformer.png
├── index.html
├── mfu_calculation.ipynb
└── mfu_detail.ipynb
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MFU Calculator
2 |
3 | **MFU Calculator**: a tool for estimating the MFU (Model FLOPs Utilization) of LLM training.
4 |
5 | Online MFU calculator: [link](https://calvinxky.github.io/mfu_calculation/)
6 |
7 | DeepSeek-V3 MFU calculation, runnable online: [link](https://calvinxky.github.io/mfu_calculation/deepseek3mfu.html)
8 |
9 | [mfu_calculation](mfu_calculation.ipynb) provides a simplified MFU calculator that can be run on Colab.
10 |
11 | [mfu_detail](./mfu_detail.ipynb) walks through how the MFU calculator is built.
12 |
13 | # Updates
14 |
15 | 2025/2/25: added the DeepSeek-V3 MFU calculation.
16 |
17 |
18 | # Contents
19 |
20 | This project explains how to count the FLOPs of the main layers and modules in LLMs (GPT/Llama/MoE) to help understand the MFU calculation process.
21 |
22 | 
23 |
24 | # FLOPs of basic modules
25 | ## Linear layer
26 |
27 | A linear layer computes Y = wX + b, which involves a matrix multiplication and a matrix addition.
28 |
29 | FLOPs of matrix multiplication and addition:
30 |
31 | **Multiplication**: for the product C = AB, where A is an m×n matrix, B is n×p, and C is m×p, each element Cij requires n multiplications and n-1 additions. There are mp elements in total, so the total FLOPs are:
32 |
33 | mp(n + (n-1)) = 2mnp - mp
34 |
35 | **Addition/subtraction**: for C = A + B, where A, B, and C are all m×n matrices, each element Cij requires one addition. There are mn elements, so the total FLOPs are mn.
36 |
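A quick numeric check of the multiplication formula above (a minimal sketch; the shapes are illustrative and not from the text):

```
# Count the multiplies and adds of C = A @ B explicitly and compare
# with the closed form 2*m*n*p - m*p derived above.
m, n, p = 8, 16, 32          # illustrative matrix sizes
mults = m * p * n            # n multiplications per output element
adds = m * p * (n - 1)       # n - 1 additions per output element
assert mults + adds == 2 * m * n * p - m * p
```
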
37 | A linear layer involves one matrix multiplication and one matrix addition. Since the input is flattened before the computation, with weight w of shape [m, n], input of shape [1, n], and output of shape [1, m], the FLOPs are
38 |
39 | 2mn
40 |
41 | and without the bias term
42 |
43 | 2mn - m
44 |
45 | In a transformer, a linear layer's input and output usually have the same size, both of shape [batch_size, seq_len, d_model],
46 | and such a layer is typically created with nn.Linear(hidden_size, hidden_size, bias=False),
47 | so its FLOPs are:
48 |
49 | flops = 2 * batch_size * seq_len * hidden_size * hidden_size
50 |
51 | When the input and output sizes differ:
52 | flops = 2 * batch_size * seq_len * size_1 * size_2
53 |
54 | ```
55 | def calcu_linear_flops(batch_size, seq_len, hidden_size, head=0, d_model=0,bias=False):
56 | bias_flops = 0 if not bias else batch_size * seq_len * hidden_size
57 | if head ==0:
58 | flops = 2 * batch_size * seq_len * hidden_size * hidden_size + bias_flops
59 | else:
60 | flops = 2 * batch_size * seq_len * hidden_size * head * d_model + bias_flops
61 | return flops
62 |
63 | ```
64 |
65 | ## Attention module
66 |
67 | A typical MHA (multi-head attention) implementation looks like this:
68 |
69 | ```
70 | class Attention(nn.Module):
71 | def __init__(self, input_dim, output_dim):
72 | super().__init__()
73 | self.query = nn.Linear(input_dim, output_dim)
74 | self.key = nn.Linear(input_dim, output_dim)
75 | self.value = nn.Linear(input_dim, output_dim)
76 | self.dk = output_dim
77 |
78 | # Scaled dot-product attention:
79 | def self_attention(self, query, key, value, mask):
80 | # query/key/value: (bs, seq_len, dk)/(bs, heads, seq_len, dk)
81 | # mask shape = (bs, 1, seq_len)/(bs, 1, 1, seq_len)
82 | scores = torch.matmul(query, key.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.dk)) # (bs, seq_len, seq_len)/(bs, heads, seq_len, seq_len)
83 | if mask is not None:
84 | scores.masked_fill_(mask == torch.tensor(False), float("-inf"))
85 | # Softmax dim=-1 stands for apply the softmax along the last dimension
86 | attention_weights = nn.Softmax(dim=-1)(scores) # (bs, heads, seq_len, seq_len)/(bs, seq_len, seq_len)
87 | attention_qkv = torch.matmul(attention_weights, value) # (bs, seq_len, dk)/(bs, heads, seq_len, dk)
88 | return attention_qkv
89 |
90 | def forward(self, query, key, value, mask):
91 | # qkv shape: (bs, seq_len, d_model)
92 | query = self.query(query)
93 | key = self.key(key)
94 | value = self.value(value)
95 | attention_qkv = self.self_attention(query, key, value, mask) # shape: (bs, seq_len, d_model)
96 | return attention_qkv
97 |
98 | class MultiHeadedAttention(Attention):
99 | def __init__(self, d_model, heads):
100 | super().__init__(d_model, d_model)
101 | assert d_model % heads == 0
102 | self.dk = d_model // heads # head dimension
103 | self.heads = heads
104 | self.out_linear = nn.Linear(d_model, d_model)
105 | self.sqrt_dk = torch.sqrt(torch.tensor(self.dk))
106 |
107 | def forward(self, query, key, value, mask):
108 | batch_size = query.shape[0]
109 | # qkv shape: (bs, seq_len, dk*heads)
110 | # dk * heads = d_model
111 | query = self.query(query).view(batch_size, -1, self.heads, self.dk).transpose(1, 2)
112 | key = self.key(key).view(batch_size, -1, self.heads, self.dk).transpose(1, 2)
113 | value = self.value(value).view(batch_size, -1, self.heads, self.dk).transpose(1, 2)
114 | attention_qkv = self.self_attention(query, key, value, mask) # shape: (bs, heads, seq_len, dk)
115 | # (bs, heads, seq_len, dk) -> (bs, seq_len, dk*heads)
116 | reshaped = attention_qkv.transpose(1, 2).reshape(batch_size, -1, self.heads * self.dk)
117 | representations_batch = self.out_linear(reshaped)
118 | return representations_batch
119 | ```
120 |
121 | Main operations:
122 | * Q/K/V: linear projections
123 | * scores: the QK^T matmul
124 | * attention_qkv: the matmul of attention_weights and V
125 | * out_linear: the output linear projection
126 |
127 | Secondary operations:
128 | * softmax
129 | * masked_fill
130 |
131 | Two points to consider for the main operations:
132 | * Attention variants: in grouped-query attention, the number of KV heads differs from the number of Q heads.
133 | * Sequence parallelism (context parallel / ring attention): account for the parallel degree.
134 |
135 | The secondary operations can usually be ignored when estimating FLOPs; their costs are listed here for reference:
136 |
137 | softmax FLOPs: the input shape is (bs, heads, seq_len, seq_len),
138 | and each element involves an exponential, additions, and a division, giving:
139 |
140 | 3 * bs * heads * seq_len * (seq_len - 1)
141 |
142 | masked_fill is a masking operation consisting of a comparison and an assignment. Assume the whole matrix is traversed with one comparison per element, while the assignment only touches the X elements that need it. With an input of shape [bs, heads, seq_len, seq_len], the FLOPs are:
143 |
144 | bs * heads * seq_len * seq_len + X
145 |
146 | Since X is relatively small, this simplifies to:
147 |
148 | bs * heads * seq_len * seq_len
149 |
150 | ```
151 | def calcu_attention_flops(batch_size, seq_len, heads, d_model, num_query_groups=0, context_parallel=1):
152 | num_query_groups = num_query_groups if num_query_groups != 0 else heads
153 | q_linear_flops = calcu_linear_flops(batch_size, seq_len, heads * d_model)
154 | k_linear_flops = calcu_linear_flops(batch_size, seq_len, heads * d_model, num_query_groups, d_model)
155 | v_linear_flops = k_linear_flops
156 |
157 | kv_scores_flops = 2 * batch_size * seq_len**2 * heads * d_model * (context_parallel + 1) / (2 * context_parallel)
158 | mask_flops = batch_size * heads * seq_len * seq_len
159 | softmax_flops = 3 * batch_size * heads * seq_len * (seq_len - 1)
160 |
161 | qkv_flops = kv_scores_flops
162 | out_linear_flops = calcu_linear_flops(batch_size, seq_len, heads * d_model)
163 | return q_linear_flops + k_linear_flops + v_linear_flops + kv_scores_flops + mask_flops + softmax_flops + qkv_flops + out_linear_flops
164 |
165 | ```
166 |
167 | The structure of MLA ([Multi-head Latent Attention](https://arxiv.org/pdf/2405.04434)) is shown below:
168 |
169 | 
170 |
171 | The main change is in the Q/K/V linear projections; the MLA formulas are:
172 |
173 | 
174 |
175 | When building its MFU calculation, focus on the linear and attention parts; the FLOPs are adjusted as follows:
176 |
177 | ```
178 | # The formulas are as follows
179 | # attention flops:
180 | args = self.model_args
181 | gbs = args.gbs
182 | num_heads = args.n_heads
183 | hidden_size = args.dim
184 | qk_head_dim = args.qk_nope_head_dim + args.qk_rope_head_dim
185 | q_down_proj = 2 * args.gbs * args.seq_len * hidden_size * args.q_lora_rank
186 | q_up_proj = 2 * args.gbs * args.seq_len * args.q_lora_rank * num_heads * qk_head_dim
187 | q_linear = q_down_proj + q_up_proj
188 |
189 | kv_down_proj = 2 * gbs * args.seq_len * hidden_size * (args.kv_lora_rank + args.qk_rope_head_dim)
190 | kv_up_proj = 2 * gbs * args.seq_len * args.kv_lora_rank * num_heads * (qk_head_dim + args.v_head_dim)
191 | kv_linear = kv_down_proj + kv_up_proj
192 |
193 | kv_scores = 2 * gbs * args.seq_len ** 2 * num_heads * qk_head_dim
194 | qkv = 2 * gbs * args.seq_len ** 2 * num_heads * args.v_head_dim
195 |
196 | out_linear = 2 * gbs * args.seq_len * args.n_heads * args.v_head_dim * hidden_size
197 | ```
198 |
199 | A further simplified version:
200 | ```
201 | def calcu_mla_flops(batch_size, seq_len, heads, d_model, q_lora_rank, kv_lora_rank, context_parallel=1):
202 | q_down_proj = calcu_linear_flops(batch_size, seq_len, heads * d_model, 1, q_lora_rank)
203 | q_up_proj = calcu_linear_flops(batch_size, seq_len, q_lora_rank, heads, d_model)
204 | q_linear_flops = q_down_proj + q_up_proj
205 | kv_down_proj = calcu_linear_flops(batch_size, seq_len, heads * d_model, 1, kv_lora_rank)
206 | kv_up_proj =calcu_linear_flops(batch_size, seq_len, kv_lora_rank, heads, d_model) * 2
207 | kv_linear = kv_down_proj + kv_up_proj
208 |
209 | kv_scores_flops = 2 * batch_size * seq_len**2 * heads * d_model
210 | mask_flops = batch_size * heads * seq_len * seq_len
211 | softmax_flops = 3 * batch_size * heads * seq_len * (seq_len - 1)
212 |
213 | qkv_flops = kv_scores_flops
214 | out_linear_flops = calcu_linear_flops(batch_size, seq_len, heads * d_model)
215 | return q_linear_flops + kv_linear + kv_scores_flops + mask_flops + softmax_flops + qkv_flops + out_linear_flops
216 |
217 | ```
218 |
219 | ## LayerNorm/RMSNorm
220 |
221 | LayerNorm is typically computed as follows:
222 |
223 | ```
224 | import numpy as np
225 |
226 | def layer_normalization(x, epsilon=1e-8):
227 |     mean = np.mean(x, axis=-1, keepdims=True)  # along the last dimension
228 | std = np.std(x, axis=-1, keepdims=True)
229 | normalized_x = (x - mean) / (std + epsilon)
230 | return normalized_x
231 |
232 | ```
233 | Assume the data length (last dimension) is L.
234 | The computation involves a mean, a standard deviation, and the normalization:
235 | * mean: L additions and one division: L + 1
236 | * std: one subtraction, one multiplication, and one addition per element, plus a final division and multiplication: 3*L + 2
237 | * normalization: one subtraction and one division per element: 2*L
238 |
239 | Ignoring the constant terms, the total FLOPs are:
240 |
241 | 6 * batch_size * seq_len * hidden_size
242 |
243 | ```
244 | def calcu_layer_norm_flops(batch_size, seq_len, hidden_size):
245 | return 6 * batch_size * seq_len * hidden_size
246 |
247 | ```
248 |
249 | A common RMSNorm implementation looks like this:
250 |
251 | ```
252 | # Follows the Llama definition
253 | class LlamaRMSNorm(nn.Module):
254 | def __init__(self, hidden_size, eps=1e-6):
255 | '''
256 | LlamaRMSNorm is equivalent to T5LayerNorm
257 | '''
258 | super().__init__()
259 | self.weight = nn.Parameter(torch.ones(hidden_size))
260 | self.variance_epsilon = eps
261 |
262 | def forward(self, hidden_states):
263 | input_dtype = hidden_states.dtype
264 | hidden_states = hidden_states.to(torch.float32)
265 | variance = hidden_states.pow(2).mean(-1, keepdim=True)
266 | hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
267 | return self.weight * hidden_states.to(input_dtype)
268 | ```
269 |
270 | Main computations:
271 | * element-wise square, mean over elements (n-1 additions), one rsqrt, and one addition
272 | * two multiplications
273 |
274 | Ignoring the constant terms, the FLOPs are:
275 |
276 | 4 * batch_size * seq_len * hidden_size
277 |
278 | ```
279 | def calcu_rmsnorm_flops(batch_size, seq_len, hidden_size):
280 | return 4 * batch_size * seq_len * hidden_size
281 |
282 | ```
283 |
284 | ## MLP/FFN layer
285 |
286 | An MLP layer is commonly built as follows:
287 |
288 | ```
289 | class PositionwiseFeedForward(nn.Module):
290 | def __init__(self, d_model, dff=2048):
291 | super().__init__()
292 | self.linear1 = nn.Linear(d_model, dff)
293 | self.linear2 = nn.Linear(dff, d_model)
294 | self.relu = nn.ReLU()
295 |
296 | def forward(self, representations_batch):
297 | return self.linear2(self.relu(self.linear1(representations_batch)))
298 | ```
299 |
300 | It consists of two linear layers and one ReLU.
301 |
302 | Input/output: [batch_size, seq_len, hidden_size]
303 | dff: ffn_hidden_size
304 |
305 | The cost is two linear layers plus one ReLU, so the FLOPs are:
306 | 2 * 2 * batch_size * seq_len * hidden_size * ffn_hidden_size + batch_size * seq_len * ffn_hidden_size
307 |
308 | Llama's MLP is slightly different: it generally involves three linear layers (gate_proj, up_proj, down_proj; see Hugging Face's LlamaMLP definition), one SiLU, and one element-wise multiplication.
309 |
310 | [LlamaMLP definition](https://github.com/huggingface/transformers/blob/v4.48.0/src/transformers/models/llama/modeling_llama.py#L174):
311 |
312 | 
313 |
314 | ```
315 | # L174
316 | class LlamaMLP(nn.Module):
317 | def __init__(self, config):
318 | super().__init__()
319 | self.config = config
320 | self.hidden_size = config.hidden_size
321 | self.intermediate_size = config.intermediate_size
322 | self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
323 | self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
324 | self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
325 | self.act_fn = ACT2FN[config.hidden_act]
326 |
327 | def forward(self, x):
328 | down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
329 | return down_proj
330 | ```
331 |
332 | The corresponding FLOPs are:
333 |
334 | 3 * 2 * batch_size * seq_len * hidden_size * ffn_hidden_size + 2 * batch_size * seq_len * ffn_hidden_size
335 |
336 |
337 | ```
338 | def calcu_mlp_flops(batch_size, seq_len, hidden_size, ffn_hidden_size, use_gate=True):
339 | """
340 | use_gate=True SwiGLU structure FFN.
341 | """
342 | if use_gate:
343 | flops = 3 * 2 * batch_size * seq_len * hidden_size * ffn_hidden_size + 2 * batch_size * seq_len * ffn_hidden_size
344 | else:
345 | flops = 2 * 2 * batch_size * seq_len * hidden_size * ffn_hidden_size + batch_size * seq_len * ffn_hidden_size
346 | return flops
347 |
348 | ```
349 |
350 | ## Logits
351 |
352 | The logits computation consists of three operations:
353 |
354 | * layernorm
355 | * linear (vocabulary projection)
356 | * softmax
357 |
358 | Corresponding shapes:
359 | * layernorm/rmsnorm: [batch_size, seq_len, hidden_size]
360 | * linear: input [batch_size, seq_len, hidden_size], output [batch_size, seq_len, vocab_size]
361 | * softmax: [batch_size, seq_len, vocab_size]
362 |
363 | Corresponding FLOPs:
364 |
365 | 6 * batch_size * seq_len * hidden_size
366 |
367 | 2 * batch_size * seq_len * hidden_size * vocab_size
368 |
369 | 3 * batch_size * seq_len * (vocab_size - 1)
370 |
371 | ```
372 | def calcu_logits_flops(batch_size, seq_len, heads, d_model, hidden_size, vocab_size, RMSNorm=True):
373 | norm_flops = calcu_rmsnorm_flops(batch_size, seq_len, hidden_size) if RMSNorm else \
374 | calcu_layer_norm_flops(batch_size, seq_len, hidden_size)
375 |
376 | linear_flops = 2 * batch_size * seq_len * hidden_size * vocab_size
377 |
378 | softmax_flos = 3 * batch_size * seq_len * (vocab_size - 1)
379 | return norm_flops + linear_flops + softmax_flos
380 |
381 | ```
382 |
383 | ## Positional encoding
384 |
385 | The sinusoidal positional encoding (PE) used by the Transformer involves sine/cosine computations: d_model/2 sines and d_model/2 cosines per position, for a total of:
386 |
387 | seq_len * d_model
388 |
389 | Note: if the heads are split out, replace d_model with d_model * heads.
390 |
391 | With rotary positional embeddings (RoPE):
392 |
393 | * computing the rotation angles: d_model
394 | * building the rotation matrix for each position: seq_len * d_model
395 | * multiplying Q and K by the rotation matrix: 4 * batch_size * seq_len * d_model
396 |
397 |
398 | ```
399 | def calcu_position_encoding(batch_size, seq_len, heads, d_model, pe_type="rope"):
400 | if pe_type == "rope":
401 | return 4 * batch_size * seq_len * d_model * heads
402 | else:
403 | return seq_len * d_model * heads
404 |
405 | ```
406 |
407 | ## Router
408 |
409 | The router is mainly used in MoE models; its computation is typically:
410 |
411 | ```
412 | self.gate = nn.Linear(config.hidden_size, config.num_experts, bias=False)
413 | hidden_states = hidden_states.view(-1, hidden_dim)
414 | # router_logits: (batch * sequence_length, n_experts)
415 | router_logits = self.gate(hidden_states)
416 |
417 | routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
418 | routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
419 | ```
420 | The main cost is the gate linear layer:
421 |
422 | flops = 2 * batch_size * seq_len * hidden_size * num_experts
423 |
424 | ```
425 | def calcu_router_flops(batch_size, seq_len, hidden_size, experts):
426 | return 2 * batch_size * seq_len * hidden_size * experts
427 |
428 | ```
429 |
430 | ## MTP module
431 |
432 | MTP (Multi-Token Prediction) follows the MTP design in DeepSeek-V3, shown below:
433 |
434 | 
435 |
436 | The main components of an MTP module are:
437 | * embedding / output head
438 | * transformer block
439 | * linear projection
440 | * RMSNorm
441 |
442 | All of these reuse the calculations above. Note that the input to the linear projection is the concatenation of two inputs. Reference calculation:
443 |
444 | ```
445 | linear_proj = 2 * 3 * gbs * args.seq_len * hidden_size * (hidden_size * 2)
446 | embedding_flops = self.calcu_embedding_layer()
447 | mla_layer_flops = self.calcu_mla_flops()
448 | moe_layer_flops = self.calcu_moe_flops()
449 | mtp_flops = 3 * (embedding_flops + mla_layer_flops + moe_layer_flops + linear_proj)
450 | ```
451 |
452 | # FLOPs of typical training models
453 |
454 | Conventions for the training estimate:
455 |
456 | 1. The backward pass costs roughly twice the forward pass, because gradients must be computed for both the inputs and the weights [reference](https://arxiv.org/pdf/2205.05198) (see the sketch after this list).
457 |
458 | 2. The input embedding layer is mainly a lookup, mapping [batch_size, seq_len] to (bs, seq_len, d_model); its FLOPs can be ignored. The cost of reusing its weights for the LM head is accounted for in the logits calculation.
459 |
460 | 
461 |
462 | 3. The positional encoding cost is relatively small and is ignored.
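
The 3x factor that appears in every formula below is just convention 1 applied directly; a minimal sketch:

```
# Convention 1: backward ≈ 2x forward, so one training step ≈ 3x the forward FLOPs.
def calcu_training_flops(forward_flops):
    backward_flops = 2 * forward_flops     # gradients w.r.t. inputs and weights
    return forward_flops + backward_flops  # = 3 * forward_flops
```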
463 |
464 | ## GPT FLOPs
465 |
466 | The main structures involved in the computation are:
467 |
468 | decoder_layer x N + logits
469 |
470 | where N is the number of layers and each decoder layer consists of:
471 |
472 | MHA + FFN + 2 LayerNorm
473 |
474 | 
475 |
476 |
477 | ```
478 | def caclu_gpt_flops(batch_size, seq_len, heads, d_model, hidden_size, vocab_size, ffn_hidden_size, layer_nums):
479 | attention_flops = calcu_attention_flops(batch_size, seq_len, heads, d_model, num_query_groups=0, context_parallel=1)
480 | ffn_flops = calcu_mlp_flops(batch_size, seq_len, hidden_size, ffn_hidden_size, use_gate=False)
481 | layer_norm_flops = calcu_layer_norm_flops(batch_size, seq_len, hidden_size)
482 | logits_flops = calcu_logits_flops(batch_size, seq_len, heads, d_model, hidden_size, vocab_size, False)
483 | return 3 * (logits_flops + (layer_norm_flops * 2 + attention_flops + ffn_flops) * layer_nums)
484 |
485 | ```
486 |
487 | ## Llama FLOPs
488 |
489 | Structure:
490 |
491 | (GQA + FFN + RMSNorm) x L + logits
492 |
493 | where GQA is grouped-query attention and the FFN uses the SwiGLU structure.
494 |
495 | 
496 |
497 | ```
498 | def caclu_llama_flops(batch_size, seq_len, heads, d_model, hidden_size, vocab_size, ffn_hidden_size, layer_nums, num_query_groups):
499 | attention_flops = calcu_attention_flops(batch_size, seq_len, heads, d_model, num_query_groups=num_query_groups, context_parallel=1)
500 | ffn_flops = calcu_mlp_flops(batch_size, seq_len, hidden_size, ffn_hidden_size, use_gate=True)
501 | layer_norm_flops = calcu_layer_norm_flops(batch_size, seq_len, hidden_size)
502 | logits_flops = calcu_logits_flops(batch_size, seq_len, heads, d_model, hidden_size, vocab_size)
503 | return 3 * (logits_flops + (layer_norm_flops * 2 + attention_flops + ffn_flops) * layer_nums)
504 |
505 | ```
506 |
507 | ## MoE FLOPs
508 |
509 | On top of the Llama structure, the FFN term is multiplied by the number of activated (top-k) experts:
510 |
511 | (GQA + FFN * Experts_topk + Router + RMSNorm) x L + logits
512 |
513 |
514 | ```
515 | def caclu_moe_flops(batch_size, seq_len, heads, d_model, hidden_size, vocab_size, ffn_hidden_size, layer_nums, num_query_groups, topk, experts):
516 | attention_flops = calcu_attention_flops(batch_size, seq_len, heads, d_model, num_query_groups=num_query_groups, context_parallel=1)
517 | ffn_flops = calcu_mlp_flops(batch_size, seq_len, hidden_size, ffn_hidden_size, use_gate=True)
518 | layer_norm_flops = calcu_layer_norm_flops(batch_size, seq_len, hidden_size)
519 | logits_flops = calcu_logits_flops(batch_size, seq_len, heads, d_model, hidden_size, vocab_size)
520 | router_flops = calcu_router_flops(batch_size, seq_len, hidden_size, experts)
521 | return 3 * (logits_flops + (layer_norm_flops * 2 + attention_flops + ffn_flops * topk + router_flops) * layer_nums)
522 |
523 | ```
524 |
525 | If the MoE includes shared experts, set the top-k count in the formula above to (see the usage sketch below):
526 |
527 | topk + shared_experts_nums
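
For example, reusing caclu_moe_flops from above with the shared experts folded into topk (a hypothetical call; the values are illustrative and shared_experts_nums is assumed to be 1):

```
# Fold the shared experts into the topk argument of caclu_moe_flops.
shared_experts_nums = 1
flops = caclu_moe_flops(batch_size=1024, seq_len=4096, heads=8, d_model=128,
                        hidden_size=1024, vocab_size=32768, ffn_hidden_size=2048,
                        layer_nums=100, num_query_groups=4,
                        topk=8 + shared_experts_nums, experts=100)
```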
528 |
529 | ## MoE (DeepSeek) FLOPs
530 |
531 | On top of the MoE structure, shared experts are added and the attention is replaced by MLA:
532 |
533 | 
534 |
535 | Ignoring the same small, low-order terms as before, the formula is:
536 |
537 | (MLA + FFN * (topk + shared) + Router) x L + logits
538 |
539 | ```
540 | def caclu_moe_deepseek_flops(batch_size, seq_len, heads, d_model, hidden_size, vocab_size, ffn_hidden_size, layer_nums, q_lora_rank, kv_lora_rank, topk, shared, experts):
541 | attention_flops = calcu_mla_flops(batch_size, seq_len, heads, d_model, q_lora_rank, kv_lora_rank, context_parallel=1)
542 | ffn_flops = calcu_mlp_flops(batch_size, seq_len, hidden_size, ffn_hidden_size, use_gate=True)
543 | layer_norm_flops = calcu_layer_norm_flops(batch_size, seq_len, hidden_size)
544 | logits_flops = calcu_logits_flops(batch_size, seq_len, heads, d_model, hidden_size, vocab_size)
545 | router_flops = calcu_router_flops(batch_size, seq_len, hidden_size, experts)
546 | return 3 * (logits_flops + (layer_norm_flops * 2 + attention_flops + ffn_flops * (topk + shared) + router_flops) * layer_nums)
547 |
548 | ```
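
A hypothetical invocation (the values are illustrative toy settings, reusing the LoRA ranks from the notebook example, not DeepSeek-V3's real configuration):

```
flops = caclu_moe_deepseek_flops(batch_size=1024, seq_len=4096, heads=8, d_model=128,
                                 hidden_size=1024, vocab_size=32768, ffn_hidden_size=2048,
                                 layer_nums=100, q_lora_rank=256, kv_lora_rank=128,
                                 topk=8, shared=1, experts=100)
```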
549 |
550 | # MFU calculation
551 |
552 | MFU (Model FLOPs Utilization) is defined as:
553 |
554 | MFU = achieved FLOPs per unit time / nominal (peak) FLOPs per unit time
555 |
556 | achieved FLOPs per unit time = total model FLOPs per step / iteration time
557 |
558 | MFU = model_flops_sum / (iter_time * device_peak_flops * device_num)
559 |
560 | device_peak_flops is usually given in TFLOPS (10^12 FLOPs per second).
561 | ```
562 | def calcu_moe_mfu(iter_time, batch_size, seq_len, heads, d_model, hidden_size, vocab_size, ffn_hidden_size, layer_nums, num_query_groups, topk, experts, device_nums, device_peak_flops):
563 | model_flops = caclu_moe_flops(batch_size, seq_len, heads, d_model, hidden_size, vocab_size, ffn_hidden_size, layer_nums, num_query_groups, topk, experts)
564 | return model_flops / (iter_time * device_peak_flops * device_nums * 10 ** 12)
565 | ```
566 |
567 | Test:
568 | ```
569 | calcu_moe_mfu(iter_time=1.5,
570 | batch_size=1024, seq_len=4096, heads=8, d_model=128,
571 | hidden_size=1024, vocab_size=32768, ffn_hidden_size=2048,
572 | layer_nums=100, num_query_groups=4, topk=9, experts=100,
573 | device_nums=1024, device_peak_flops=280)
574 | ```
575 |
576 |
577 | # References
578 |
579 | https://zhuanlan.zhihu.com/p/691126108
580 |
581 | https://github.com/naklecha/llama3-from-scratch/blob/main/README.md
582 |
583 | https://arxiv.org/pdf/2205.05198
584 |
585 | https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/te_llama/tutorial_accelerate_hf_llama_with_te.html
586 |
587 | https://github.com/huggingface/transformers/blob/v4.48.0/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py#L603
588 |
589 | https://bbycroft.net/llm
590 |
591 | https://arxiv.org/pdf/2405.04434
--------------------------------------------------------------------------------
/deepseek3mfu.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | DeepSeek V3 MFU Calculator
6 |
203 |
204 |
205 |
209 |
210 | DeepSeek V3 MFU Calculator
211 |
212 |
213 |
325 |
326 |
341 |
342 |
343 |
344 |
345 |
346 |
347 | Results
348 | MFU: -
349 | Total FLOPS: -
350 |
351 |
352 |
353 |
399 |
400 |
663 |
664 |
--------------------------------------------------------------------------------
/deepseek_v3_pretrain_mfu.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import Literal
3 |
4 |
5 | @dataclass
6 | class ModelArgs:
7 | """
8 | Data class for defining model arguments and hyperparameters.
9 |
10 | Attributes:
11 | gbs (int): Global batch size
12 | max_batch_size (int): Maximum batch size.
13 | max_seq_len (int): Maximum sequence length.
14 | dtype (Literal["bf16", "fp8"]): Data type for computations.
15 | vocab_size (int): Vocabulary size.
16 | dim (int): Model dimension.
17 | inter_dim (int): Intermediate dimension for MLP layers.
18 | moe_inter_dim (int): Intermediate dimension for MoE layers.
19 | n_layers (int): Number of transformer layers.
20 | n_dense_layers (int): Number of dense layers in the model.
21 | n_heads (int): Number of attention heads.
22 | n_mtp_modules (int): Number of mtp modules.
23 | n_routed_experts (int): Number of routed experts for MoE layers.
24 | n_shared_experts (int): Number of shared experts for MoE layers.
25 | n_activated_experts (int): Number of activated experts in MoE layers.
26 | n_expert_groups (int): Number of expert groups.
27 | n_limited_groups (int): Number of limited groups for MoE routing.
28 | score_func (Literal["softmax", "sigmoid"]): Scoring function for MoE routing.
29 | route_scale (float): Scaling factor for routing scores.
30 | q_lora_rank (int): LoRA rank for query projections.
31 | kv_lora_rank (int): LoRA rank for key-value projections.
32 | qk_nope_head_dim (int): Dimension for query-key projections without positional embeddings.
33 | qk_rope_head_dim (int): Dimension for query-key projections with rotary embeddings.
34 | v_head_dim (int): Dimension for value projections.
35 | is_causal (bool): Attention calculation type.
36 | """
37 | # block
38 | gbs: int = 1024
39 | max_batch_size: int = 8
40 | max_seq_len: int = 4096 * 4
41 | seq_len: int = 4096
42 | dtype: Literal["bf16", "fp8"] = "bf16"
43 | vocab_size: int = 129280
44 | dim: int = 7168 # embed
45 | inter_dim: int = 18432
46 | moe_inter_dim: int = 2048
47 | n_layers: int = 61
48 | n_dense_layers: int = 3
49 | n_heads: int = 128
50 | n_mtp_modules: int = 2
51 | # moe
52 | n_routed_experts: int = 256
53 | n_shared_experts: int = 1
54 | n_activated_experts: int = 8
55 | n_expert_groups: int = 8
56 | n_limited_groups: int = 4
57 | score_func: Literal["softmax", "sigmoid"] = "sigmoid"
58 | route_scale: float = 2.5
59 | # mla
60 | q_lora_rank: int = 1536
61 | kv_lora_rank: int = 512
62 | qk_nope_head_dim: int = 128
63 | qk_rope_head_dim: int = 64
64 | v_head_dim: int = 128
65 | is_causal: bool = False
66 |
67 |
68 | class DeepSeekV3Calculation:
69 | def __init__(self,
70 | model_args: ModelArgs,
71 | ):
72 | self.model_args = model_args
73 |
74 | def calcu_embedding_layer(self):
75 | args = self.model_args
76 | embedding_flops = 2 * args.gbs * args.seq_len * args.dim * args.vocab_size
77 | return embedding_flops
78 |
79 | def calcu_mla_flops(self):
80 | # attention flops:
81 | args = self.model_args
82 | gbs = args.gbs
83 | num_heads = args.n_heads
84 | hidden_size = args.dim
85 | qk_head_dim = args.qk_nope_head_dim + args.qk_rope_head_dim
86 | q_down_proj = 2 * args.gbs * args.seq_len * hidden_size * args.q_lora_rank
87 | q_up_proj = 2 * args.gbs * args.seq_len * args.q_lora_rank * num_heads * qk_head_dim
88 | q_linear = q_down_proj + q_up_proj
89 |
90 | kv_down_proj = 2 * gbs * args.seq_len * hidden_size * (args.kv_lora_rank + args.qk_rope_head_dim)
91 | kv_up_proj = 2 * gbs * args.seq_len * args.kv_lora_rank * num_heads * (qk_head_dim + args.v_head_dim)
92 | kv_linear = kv_down_proj + kv_up_proj
93 |
94 | kv_scores = 2 * gbs * args.seq_len ** 2 * num_heads * qk_head_dim
95 | qkv = 2 * gbs * args.seq_len ** 2 * num_heads * args.v_head_dim
96 |
97 | out_linear = 2 * gbs * args.seq_len * args.n_heads * args.v_head_dim * hidden_size
98 | if args.dtype == 'fp8':
99 | q_linear /= 2
100 | kv_linear /= 2
101 | out_linear /= 2
102 | attention_layer_flops = q_linear + kv_linear + kv_scores + qkv + out_linear
103 | return attention_layer_flops
104 |
105 | def calcu_moe_flops(self):
106 | args = self.model_args
107 | hidden_size = args.dim
108 | share = args.n_shared_experts
109 | top_k = args.n_activated_experts
110 | linear_layer_flops = 2 * 3 * args.gbs * args.seq_len * hidden_size * args.moe_inter_dim
111 | route_flops = 2 * args.gbs * args.seq_len * hidden_size * args.n_routed_experts
112 | if args.dtype == 'fp8':
113 | linear_layer_flops /= 2
114 | moe_layer_flops = linear_layer_flops * (share + top_k) + route_flops
115 | return moe_layer_flops
116 |
117 | def calcu_mlp_flops(self):
118 | args = self.model_args
119 | hidden_size = args.dim
120 | linear_layer_flops = 2 * 3 * args.gbs * args.seq_len * hidden_size * args.inter_dim
121 | if args.dtype == 'fp8':
122 | linear_layer_flops /= 2
123 | return linear_layer_flops
124 |
125 | def calcu_main_model(self):
126 | moe_layers = self.model_args.n_layers - self.model_args.n_dense_layers
127 | embedding_flops = self.calcu_embedding_layer()
128 | mla_layer_flops = self.calcu_mla_flops()
129 | moe_layer_flops = self.calcu_moe_flops()
130 | mlp_layer_flops = self.calcu_mlp_flops()
131 |
132 | main_model_flops = 3 * (embedding_flops +
133 | moe_layers * (mla_layer_flops + moe_layer_flops) +
134 | self.model_args.n_dense_layers * (mla_layer_flops + mlp_layer_flops))
135 | return main_model_flops
136 |
137 | def calcu_mtp_model(self):
138 | args = self.model_args
139 | gbs = args.gbs
140 | hidden_size = args.dim
141 | linear_proj = 2 * 3 * gbs * args.seq_len * hidden_size * (hidden_size * 2)
142 | if args.dtype == 'fp8':
143 | linear_proj /= 2
144 |
145 | embedding_flops = self.calcu_embedding_layer()
146 | mla_layer_flops = self.calcu_mla_flops()
147 | moe_layer_flops = self.calcu_moe_flops()
148 | mtp_flops = 3 * (embedding_flops + mla_layer_flops + moe_layer_flops + linear_proj)
149 | return mtp_flops
150 |
151 | def calculate(self, step_time, world_size, gpu_peak_bf16_flops):
152 | main_model_flops = self.calcu_main_model()
153 | mtp_flops = self.calcu_mtp_model()
154 | total_flops = main_model_flops + mtp_flops * self.model_args.n_mtp_modules
155 | mfu = total_flops / (world_size * step_time * (10 ** 12)) / gpu_peak_bf16_flops
156 | return mfu, total_flops
157 |
158 |
159 | if __name__ == "__main__":
160 | args = ModelArgs()
161 | calculation = DeepSeekV3Calculation(args)
162 | mfu, total_flops_bf16 = calculation.calculate(21.00, 512, 354)
163 | print(f"MFU:{mfu}, Total flops:{total_flops_bf16}")
164 |
--------------------------------------------------------------------------------
/images/deepseek_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CalvinXKY/mfu_calculation/764c88a1f00c8fb5bd4473ddd019f23ab5ad4bb7/images/deepseek_architecture.png
--------------------------------------------------------------------------------
/images/gpt_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CalvinXKY/mfu_calculation/764c88a1f00c8fb5bd4473ddd019f23ab5ad4bb7/images/gpt_architecture.png
--------------------------------------------------------------------------------
/images/llama_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CalvinXKY/mfu_calculation/764c88a1f00c8fb5bd4473ddd019f23ab5ad4bb7/images/llama_architecture.png
--------------------------------------------------------------------------------
/images/llama_mlp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CalvinXKY/mfu_calculation/764c88a1f00c8fb5bd4473ddd019f23ab5ad4bb7/images/llama_mlp.png
--------------------------------------------------------------------------------
/images/lm_head_weights.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CalvinXKY/mfu_calculation/764c88a1f00c8fb5bd4473ddd019f23ab5ad4bb7/images/lm_head_weights.png
--------------------------------------------------------------------------------
/images/mla_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CalvinXKY/mfu_calculation/764c88a1f00c8fb5bd4473ddd019f23ab5ad4bb7/images/mla_architecture.png
--------------------------------------------------------------------------------
/images/mla_formulas.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CalvinXKY/mfu_calculation/764c88a1f00c8fb5bd4473ddd019f23ab5ad4bb7/images/mla_formulas.png
--------------------------------------------------------------------------------
/images/mtp_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CalvinXKY/mfu_calculation/764c88a1f00c8fb5bd4473ddd019f23ab5ad4bb7/images/mtp_architecture.png
--------------------------------------------------------------------------------
/images/transformer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CalvinXKY/mfu_calculation/764c88a1f00c8fb5bd4473ddd019f23ab5ad4bb7/images/transformer.png
--------------------------------------------------------------------------------
/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | MFU Calculation
7 |
125 |
126 |
127 |
128 |
132 |
251 |
252 |
396 |
397 |
--------------------------------------------------------------------------------
/mfu_calculation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "toc_visible": true,
8 | "authorship_tag": "ABX9TyOj7ORG8JnLvUk/+4dWEAag",
9 | "include_colab_link": true
10 | },
11 | "kernelspec": {
12 | "name": "python3",
13 | "display_name": "Python 3"
14 | },
15 | "language_info": {
16 | "name": "python"
17 | }
18 | },
19 | "cells": [
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {
23 | "id": "view-in-github",
24 | "colab_type": "text"
25 | },
26 | "source": [
27 | "
"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "source": [
33 | "# Quick MFU calculator\n",
34 | "\n",
35 | "## Hardware configuration"
36 | ],
37 | "metadata": {
38 | "id": "Fwy25fXe0_Il"
39 | }
40 | },
41 | {
42 | "cell_type": "code",
43 | "source": [
44 | "\"\"\"\n",
45 | "Create an MFU calculation for LLMs.\n",
46 | "\"\"\"\n",
47 | "# Hardware setting:\n",
48 | "GPU_FLOPS = 280 # theoretical peak per device, in TFLOPS\n",
49 | "GPU_NUMS = 1024"
50 | ],
51 | "metadata": {
52 | "id": "20fLIwertwv4"
53 | },
54 | "execution_count": 18,
55 | "outputs": []
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "source": [
60 | "## Model parameter configuration"
61 | ],
62 | "metadata": {
63 | "id": "_mxeXAW_0x0d"
64 | }
65 | },
66 | {
67 | "cell_type": "code",
68 | "source": [
69 | "# parameters eg:\n",
70 | "GBS = 1024\n",
71 | "SEQ_LEN = 4096\n",
72 | "HIDDEN_SIZE = 1024\n",
73 | "NUM_HEADS = 8\n",
74 | "D_MODEL = 128\n",
75 | "VOCAB_SIZE = 32768\n",
76 | "NUM_QUERY_GROUPS = 4\n",
77 | "FFN_HIDDEN_SIZE = 2048\n",
78 | "LAYER_NUMS = 100\n",
79 | "CP = 1\n",
80 | "STEP_TIME = 1.5\n",
81 | "\n",
82 | "# For a non-MoE model, set top_k=0, shared_experts=1\n",
83 | "SHARE_EXPERTS = 1\n",
84 | "TOP_K = 8"
85 | ],
86 | "metadata": {
87 | "id": "b32Q6Vg06uqG"
88 | },
89 | "execution_count": 19,
90 | "outputs": []
91 | },
92 | {
93 | "cell_type": "markdown",
94 | "source": [
95 | "## Calculation function definition"
96 | ],
97 | "metadata": {
98 | "id": "FY2vkF1m4XN3"
99 | }
100 | },
101 | {
102 | "cell_type": "code",
103 | "source": [
104 | "import numpy as np\n",
105 | "\n",
106 | "# Simplified version:\n",
107 | "def mfu_calculation(step_time=STEP_TIME,\n",
108 | " gbs=GBS,\n",
109 | " seq_len=SEQ_LEN,\n",
110 | " hidden_size=HIDDEN_SIZE,\n",
111 | " vocab_size=VOCAB_SIZE,\n",
112 | " num_heads=NUM_HEADS,\n",
113 | " d_model=D_MODEL,\n",
114 | " num_query_groups=NUM_QUERY_GROUPS,\n",
115 | " ffn_hidden_size=FFN_HIDDEN_SIZE,\n",
116 | " share=SHARE_EXPERTS,\n",
117 | " top_k=TOP_K,\n",
118 | " layer_nums=LAYER_NUMS,\n",
119 | " context_parallel=CP,\n",
120 | " mlp_with_gate=True):\n",
121 | " embedding_logits_flops = 2 * gbs * seq_len * hidden_size * vocab_size\n",
122 | "\n",
123 | " # attention flops\n",
124 | " q_linear = 2 * gbs * seq_len * hidden_size ** 2\n",
125 | " kv_linear = 2 * 2 * gbs * seq_len * hidden_size * num_query_groups * d_model\n",
126 | " kv_scores = 2 * gbs * seq_len**2 * num_heads * d_model * (context_parallel + 1) / (2 * context_parallel)\n",
127 | " v_projection = 2 * gbs * seq_len**2 * num_heads * d_model * (context_parallel + 1) / (2 * context_parallel)\n",
128 | "\n",
129 | " out_linear = 2 * gbs * seq_len * hidden_size ** 2\n",
130 | " attention_layer_flops = q_linear + kv_linear + kv_scores + v_projection + out_linear\n",
131 | "\n",
132 | " # consider layer norm. (It can be ignored)\n",
133 | " layer_norm = 2 * gbs * seq_len * hidden_size\n",
134 | "\n",
135 | " if mlp_with_gate:\n",
136 | " # llama structure\n",
137 | " mlp_layer_flops = 3 * 2 * gbs * seq_len * hidden_size * ffn_hidden_size\n",
138 | " else:\n",
139 | " mlp_layer_flops = 2 * 2 * gbs * seq_len * hidden_size * ffn_hidden_size\n",
140 | "\n",
141 | " moe_layer_flops = mlp_layer_flops * (share + top_k)\n",
142 | " model_flops = 3 * (embedding_logits_flops + layer_nums * (attention_layer_flops + moe_layer_flops + layer_norm))\n",
143 | "\n",
144 | " mfu = model_flops / (GPU_NUMS * step_time * (10 ** 12)) / GPU_FLOPS\n",
145 | " return mfu"
146 | ],
147 | "metadata": {
148 | "id": "3S4AiF0eA2a7"
149 | },
150 | "execution_count": 20,
151 | "outputs": []
152 | },
153 | {
154 | "cell_type": "markdown",
155 | "source": [
156 | "## Test the calculation"
157 | ],
158 | "metadata": {
159 | "id": "-3TSl-Qp4cQC"
160 | }
161 | },
162 | {
163 | "cell_type": "code",
164 | "source": [
165 | "mfu_calculation(step_time=1.5)"
166 | ],
167 | "metadata": {
168 | "colab": {
169 | "base_uri": "https://localhost:8080/"
170 | },
171 | "id": "ZtVTqvV0TkS6",
172 | "outputId": "b8268c23-553a-4f7b-a6cf-9e84b7bf826c"
173 | },
174 | "execution_count": 21,
175 | "outputs": [
176 | {
177 | "output_type": "execute_result",
178 | "data": {
179 | "text/plain": [
180 | "0.4007877972553143"
181 | ]
182 | },
183 | "metadata": {},
184 | "execution_count": 21
185 | }
186 | ]
187 | },
188 | {
189 | "cell_type": "markdown",
190 | "source": [
191 | "DeepSeek v2/v3 model calculation: MHA is replaced by MLA.\n",
192 | "\n",
193 | "Note that the following are ignored:\n",
194 | "* rotary embeddings / RMSNorm\n",
195 | "* low-precision arithmetic\n",
196 | "* recomputation settings"
197 | ],
198 | "metadata": {
199 | "id": "8visJML7FiR8"
200 | }
201 | },
202 | {
203 | "cell_type": "code",
204 | "source": [
205 | "import numpy as np\n",
206 | "\n",
207 | "# Simplified version:\n",
208 | "def mfu_calculation_deepseek(step_time=STEP_TIME,\n",
209 | " gbs=GBS,\n",
210 | " seq_len=SEQ_LEN,\n",
211 | " hidden_size=HIDDEN_SIZE,\n",
212 | " vocab_size=VOCAB_SIZE,\n",
213 | " num_heads=NUM_HEADS,\n",
214 | " d_model=D_MODEL,\n",
215 | " num_query_groups=NUM_QUERY_GROUPS,\n",
216 | " ffn_hidden_size=FFN_HIDDEN_SIZE,\n",
217 | " share=SHARE_EXPERTS,\n",
218 | " top_k=TOP_K,\n",
219 | " layer_nums=LAYER_NUMS,\n",
220 | " context_parallel=CP,\n",
221 | " q_lora_rank=None,\n",
222 | " kv_lora_rank=None,\n",
223 | " mlp_with_gate=True):\n",
224 | " embedding_flops = gbs * seq_len * hidden_size * vocab_size\n",
225 | "\n",
226 | " # attention flops\n",
227 | " if q_lora_rank is not None:\n",
228 | " q_down_proj = gbs * seq_len * hidden_size * q_lora_rank\n",
229 | " q_up_proj = gbs * seq_len * q_lora_rank * num_heads * d_model\n",
230 | " q_linear = q_down_proj + q_up_proj\n",
231 | " else:\n",
232 | " q_linear = gbs * seq_len * hidden_size ** 2\n",
233 | "\n",
234 | " if kv_lora_rank is not None:\n",
235 | " kv_down_proj = gbs * seq_len * hidden_size * kv_lora_rank\n",
236 | " kv_up_proj = gbs * seq_len * kv_lora_rank * num_heads * d_model * 2\n",
237 | " kv_linear = kv_down_proj + kv_up_proj\n",
238 | " else:\n",
239 | " kv_linear = gbs * seq_len * hidden_size * num_query_groups * d_model * 2\n",
240 | "\n",
241 | " kv_scores = gbs * seq_len**2 * num_heads * d_model * (context_parallel + 1) / (2 * context_parallel)\n",
242 | " v_projection = gbs * seq_len**2 * num_heads * d_model * (context_parallel + 1) / (2 * context_parallel)\n",
243 | "\n",
244 | " out_linear = gbs * seq_len * hidden_size ** 2\n",
245 | " attention_layer_flops = q_linear + kv_linear + kv_scores + v_projection + out_linear\n",
246 | "\n",
247 | " # consider layer norm. (It can be ignored)\n",
248 | " layer_norm = gbs * seq_len * hidden_size\n",
249 | "\n",
250 | " if mlp_with_gate:\n",
251 | " # llama structure\n",
252 | " mlp_layer_flops = 3 * gbs * seq_len * hidden_size * ffn_hidden_size\n",
253 | " else:\n",
254 | " mlp_layer_flops = 2 * gbs * seq_len * hidden_size * ffn_hidden_size\n",
255 | "\n",
256 | " moe_layer_flops = mlp_layer_flops * (share + top_k)\n",
257 | " model_flops = 3 * 2 * (embedding_flops + layer_nums * (attention_layer_flops + moe_layer_flops + layer_norm))\n",
258 | "\n",
259 | " mfu = model_flops / (GPU_NUMS * step_time * (10 ** 12)) / GPU_FLOPS\n",
260 | " return mfu"
261 | ],
262 | "metadata": {
263 | "id": "GOddzrGko9yE"
264 | },
265 | "execution_count": 22,
266 | "outputs": []
267 | },
268 | {
269 | "cell_type": "code",
270 | "source": [
271 | "mfu_calculation_deepseek(step_time=1.5, q_lora_rank=256, kv_lora_rank=128)"
272 | ],
273 | "metadata": {
274 | "id": "d6osexCYq2rd",
275 | "outputId": "f50f6943-e97a-437b-c961-86d2388142fc",
276 | "colab": {
277 | "base_uri": "https://localhost:8080/"
278 | }
279 | },
280 | "execution_count": 23,
281 | "outputs": [
282 | {
283 | "output_type": "execute_result",
284 | "data": {
285 | "text/plain": [
286 | "0.3938851712438857"
287 | ]
288 | },
289 | "metadata": {},
290 | "execution_count": 23
291 | }
292 | ]
293 | }
294 | ]
295 | }
--------------------------------------------------------------------------------
/mfu_detail.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "toc_visible": true,
8 | "authorship_tag": "ABX9TyOWeMvqOpaMLI4O22YsJucr",
9 | "include_colab_link": true
10 | },
11 | "kernelspec": {
12 | "name": "python3",
13 | "display_name": "Python 3"
14 | },
15 | "language_info": {
16 | "name": "python"
17 | }
18 | },
19 | "cells": [
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {
23 | "id": "view-in-github",
24 | "colab_type": "text"
25 | },
26 | "source": [
27 | "
"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "source": [
33 | "This notebook explains how to count the FLOPs of the main layers and modules in LLMs (GPT/Llama/MoE) to help understand the MFU calculation process.\n",
34 | "\n",
35 | "\n",
36 | "\n",
37 | "# FLOPs of basic modules\n",
38 | "## Linear layer\n",
39 | "\n",
40 | "A linear layer computes Y = wX + b, which involves a matrix multiplication and a matrix addition.\n",
41 | "\n",
42 | "FLOPs of matrix multiplication and addition:\n",
43 | "\n",
44 | "**Multiplication**: for the product C = AB, where A is an m×n matrix, B is n×p, and C is m×p, each element Cij requires n multiplications and n-1 additions. There are mp elements in total, so the total FLOPs are:\n",
45 | "\n",
46 | "mp(n + (n-1)) = 2mnp - mp\n",
47 | "\n",
48 | "**Addition/subtraction**: for C = A + B, where A, B, and C are all m×n matrices, each element Cij requires one addition. There are mn elements, so the total FLOPs are mn.\n",
49 | "\n",
50 | "A linear layer involves one matrix multiplication and one matrix addition. Since the input is flattened before the computation, with weight w of shape [m, n], input of shape [1, n], and output of shape [1, m], the FLOPs are\n",
51 | "\n",
52 | "2mn\n",
53 | "\n",
54 | "and without the bias term\n",
55 | "\n",
56 | "2mn - m\n",
57 | "\n",
58 | "In a transformer, a linear layer's input and output usually have the same size, both of shape [batch_size, seq_len, d_model],\n",
59 | "and such a layer is typically created with Linear(hidden_size, hidden_size, bias=False),\n",
60 | "so its FLOPs are:\n",
61 | "\n",
62 | "flops = 2 * batch_size * seq_len * hidden_size * hidden_size\n",
63 | "\n",
64 | "When the input and output sizes differ:\n",
65 | "flops = 2 * batch_size * seq_len * size_1 * size_2"
66 | ],
67 | "metadata": {
68 | "id": "odaFtoySwx8B"
69 | }
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 23,
74 | "metadata": {
75 | "id": "KWjh_Y9fwgfY"
76 | },
77 | "outputs": [],
78 | "source": [
79 | "def calcu_linear_flops(batch_size, seq_len, hidden_size, head=0, d_model=0,bias=False):\n",
80 | " bias_flops = 0 if not bias else batch_size * seq_len * hidden_size\n",
81 | " if head ==0:\n",
82 | " flops = 2 * batch_size * seq_len * hidden_size * hidden_size + bias_flops\n",
83 | " else:\n",
84 | " flops = 2 * batch_size * seq_len * hidden_size * head * d_model + bias_flops\n",
85 | " return flops"
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "source": [
91 | "## Attention module\n",
92 | "\n",
93 | "A typical MHA (multi-head attention) implementation looks like this:\n",
94 | "\n",
95 | "```\n",
96 | "class Attention(nn.Module):\n",
97 | " def __init__(self, input_dim, output_dim):\n",
98 | " super().__init__()\n",
99 | " self.query = nn.Linear(input_dim, output_dim)\n",
100 | " self.key = nn.Linear(input_dim, output_dim)\n",
101 | " self.value = nn.Linear(input_dim, output_dim)\n",
102 | " self.dk = output_dim\n",
103 | "\n",
104 | " # Scaled dot-product attention:\n",
105 | " def self_attention(self, query, key, value, mask):\n",
106 | " # query/key/value: (bs, seq_len, dk)/(bs, heads, seq_len, dk)\n",
107 | " # mask shape = (bs, 1, seq_len)/(bs, 1, 1, seq_len)\n",
108 | " scores = torch.matmul(query, key.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.dk)) # (bs, seq_len, seq_len)/(bs, heads, seq_len, seq_len)\n",
109 | " if mask is not None:\n",
110 | " scores.masked_fill_(mask == torch.tensor(False), float(\"-inf\"))\n",
111 | " # Softmax dim=-1 stands for apply the softmax along the last dimension\n",
112 | " attention_weights = nn.Softmax(dim=-1)(scores) # (bs, heads, seq_len, seq_len)/(bs, seq_len, seq_len)\n",
113 | " attention_qkv = torch.matmul(attention_weights, value) # (bs, seq_len, dk)/(bs, heads, seq_len, dk)\n",
114 | " return attention_qkv\n",
115 | "\n",
116 | " def forward(self, query, key, value, mask):\n",
117 | " # qkv shape: (bs, seq_len, d_model)\n",
118 | " query = self.query(query)\n",
119 | " key = self.key(key)\n",
120 | " value = self.value(value)\n",
121 | " attention_qkv = self.self_attention(query, key, value, mask) # shape: (bs, seq_len, d_model)\n",
122 | " return attention_qkv\n",
123 | "\n",
124 | "class MultiHeadedAttention(Attention):\n",
125 | " def __init__(self, d_model, heads):\n",
126 | " super().__init__(d_model, d_model)\n",
127 | " assert d_model % heads == 0\n",
128 | " self.dk = d_model // heads # head dimension\n",
129 | " self.heads = heads\n",
130 | " self.out_linear = nn.Linear(d_model, d_model)\n",
131 | " self.sqrt_dk = torch.sqrt(torch.tensor(self.dk))\n",
132 | "\n",
133 | " def forward(self, query, key, value, mask):\n",
134 | " batch_size = query.shape[0]\n",
135 | " # qkv shape: (bs, seq_len, dk*heads)\n",
136 | " # dk * heads = d_model\n",
137 | " query = self.query(query).view(batch_size, -1, self.heads, self.dk).transpose(1, 2)\n",
138 | " key = self.key(key).view(batch_size, -1, self.heads, self.dk).transpose(1, 2)\n",
139 | " value = self.value(value).view(batch_size, -1, self.heads, self.dk).transpose(1, 2)\n",
140 | " attention_qkv = self.self_attention(query, key, value, mask) # shape: (bs, heads, seq_len, dk)\n",
141 | " # (bs, heads, seq_len, dk) -> (bs, seq_len, dk*heads)\n",
142 | " reshaped = attention_qkv.transpose(1, 2).reshape(batch_size, -1, self.heads * self.dk)\n",
143 | " representations_batch = self.out_linear(reshaped)\n",
144 | " return representations_batch\n",
145 | "```\n",
146 | "\n",
147 | "主要运算:\n",
148 | "* Q/K/V: 线性映射\n",
149 | "* scores: QK乘法运算\n",
150 | "* attention_qkv: V和attention_weights乘法运算\n",
151 | "* out_linear: 线性度计算\n",
152 | "\n",
153 | "次要运算:\n",
154 | "* softmax计算\n",
155 | "* masked_fill计算\n",
156 | "\n",
157 | "对于主要运算中有个需要考虑点:\n",
158 | "* Attention的变化:query_attention的计算KV的heads数量与Q的heads数量不一致。\n",
159 | "* 序列并行(context parallel/ring attention): 考虑并行度。\n",
160 | "\n",
161 | "次要运算在估算flops时通常可以忽略,这里例出其计算方式:\n",
162 | "\n",
163 | "softmax的flops计算量: 输入的shape:(bs, heads, seq_len, seq_len)\n",
164 | "元素计算涉及指数运算、加法运算、除法运算。计算量:\n",
165 | "\n",
166 | " 3 * bs * heads * seq_len * (seq_len - 1)\n",
167 | "\n",
168 | "maked_fill是一个掩模操作包含:判断操作和赋值操作,假设是需要遍历整个矩阵,每个元素操作一次,而赋值操作仅对需要操作的元素赋值,输入矩阵的大小为[bs, heads, seq_len, seq_len], 操作的个数为X。所以计算量:\n",
169 | "\n",
170 | " bs * heads * seq_len * seq_len + X\n",
171 | "\n",
172 | "由于X操作相对来说较小, 公式简化为:\n",
173 | "\n",
174 | " bs * heads * seq_len * seq_len"
175 | ],
176 | "metadata": {
177 | "id": "zzJMXD-s7MFb"
178 | }
179 | },
180 | {
181 | "cell_type": "code",
182 | "source": [
183 | "def calcu_attention_flops(batch_size, seq_len, heads, d_model, num_query_groups=0, context_parallel=1):\n",
184 | " num_query_groups = num_query_groups if num_query_groups != 0 else heads\n",
185 | " q_linear_flops = calcu_linear_flops(batch_size, seq_len, heads * d_model)\n",
186 | " k_linear_flops = calcu_linear_flops(batch_size, seq_len, heads * d_model, num_query_groups, d_model)\n",
187 | " v_linear_flops = k_linear_flops\n",
188 | "\n",
189 | " kv_scores_flops = 2 * batch_size * seq_len**2 * heads * d_model * (context_parallel + 1) / (2 * context_parallel)\n",
190 | " mask_flops = batch_size * heads * seq_len * seq_len\n",
191 | " softmax_flops = 3 * batch_size * heads * seq_len * (seq_len - 1)\n",
192 | "\n",
193 | " qkv_flops = kv_scores_flops\n",
194 | " out_linear_flops = calcu_linear_flops(batch_size, seq_len, heads * d_model)\n",
195 | " return q_linear_flops + k_linear_flops + v_linear_flops + kv_scores_flops + mask_flops + softmax_flops + qkv_flops + out_linear_flops"
196 | ],
197 | "metadata": {
198 | "id": "5azr-HUe7K_u"
199 | },
200 | "execution_count": 24,
201 | "outputs": []
202 | },
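203 | {
204 | "cell_type": "markdown",
205 | "source": [
206 | "A hypothetical usage example (sizes chosen only for illustration) comparing full MHA with a grouped-query configuration; only the K/V projection terms shrink, while the score and attention-weight matmuls are unchanged:\n",
207 | "\n",
208 | "```\n",
209 | "mha_flops = calcu_attention_flops(1, 4096, 32, 128)                      # 32 Q heads, 32 KV heads\n",
210 | "gqa_flops = calcu_attention_flops(1, 4096, 32, 128, num_query_groups=8)  # 32 Q heads, 8 KV groups\n",
211 | "print(gqa_flops / mha_flops)\n",
212 | "```"
213 | ],
214 | "metadata": {}
215 | },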
203 | {
204 | "cell_type": "markdown",
205 | "source": [
206 | "对于MLA([Multi-head Latent Attention](https://arxiv.org/pdf/2405.04434))结构,计算有所不同:\n",
207 | "\n",
208 | "\n",
209 | "\n",
210 | "主要的计算变化是qkv的linear计算发生了变化,MLA的计算公式如下:\n",
211 | "\n",
212 | "\n",
213 | "\n",
214 | "构建其mfu的计算时,关注linear和attention的部分,flops的调整如下:"
215 | ],
216 | "metadata": {
217 | "id": "sV_sjtJ5iADN"
218 | }
219 | },
220 | {
221 | "cell_type": "code",
222 | "source": [
223 | "def calcu_mla_flops(batch_size, seq_len, heads, d_model, q_lora_rank, kv_lora_rank, context_parallel=1):\n",
224 | " q_down_proj = calcu_linear_flops(batch_size, seq_len, heads * d_model, 1, q_lora_rank)\n",
225 | " q_up_proj = calcu_linear_flops(batch_size, seq_len, q_lora_rank, heads, d_model)\n",
226 | " q_linear_flops = q_down_proj + q_up_proj\n",
227 | " kv_down_proj = calcu_linear_flops(batch_size, seq_len, heads * d_model, 1, kv_lora_rank)\n",
228 | " kv_up_proj =calcu_linear_flops(batch_size, seq_len, kv_lora_rank, heads, d_model) * 2\n",
229 | " kv_linear = kv_down_proj + kv_up_proj\n",
230 | "\n",
231 | " kv_scores_flops = 2 * batch_size * seq_len**2 * heads * d_model * (context_parallel + 1) / (2 * context_parallel)\n",
232 | " mask_flops = batch_size * heads * seq_len * seq_len\n",
233 | " softmax_flops = 3 * batch_size * heads * seq_len * (seq_len - 1)\n",
234 | "\n",
235 | " qkv_flops = kv_scores_flops\n",
236 | " out_linear_flops = calcu_linear_flops(batch_size, seq_len, heads * d_model)\n",
237 | " return q_linear_flops + kv_linear + kv_scores_flops + mask_flops + softmax_flops + qkv_flops + out_linear_flops\n"
238 | ],
239 | "metadata": {
240 | "id": "iowGjQFFhXWh"
241 | },
242 | "execution_count": 25,
243 | "outputs": []
244 | },
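245 | {
246 | "cell_type": "markdown",
247 | "source": [
248 | "A hypothetical usage example (ranks and sizes chosen only for illustration, not DeepSeek's actual configuration). When q_lora_rank and kv_lora_rank are much smaller than heads * d_model, the projection cost drops well below the standard MHA projections, while the score/value matmuls stay the same:\n",
249 | "\n",
250 | "```\n",
251 | "mha_flops = calcu_attention_flops(1, 4096, 16, 128)\n",
252 | "mla_flops = calcu_mla_flops(1, 4096, 16, 128, q_lora_rank=512, kv_lora_rank=256)\n",
253 | "print(mla_flops / mha_flops)\n",
254 | "```"
255 | ],
256 | "metadata": {}
257 | },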
245 | {
246 | "cell_type": "markdown",
247 | "source": [
248 | "## LayerNorm/RMSNorm\n",
249 | "\n",
250 | "Layer_norm的计算内容一般如下:\n",
251 | "\n",
252 | "```\n",
253 | "import numpy as np\n",
254 | "\n",
255 | "def layer_normalization(x, epsilon=1e-8):\n",
256 | " mean = np.mean(x, axis=-1, keepdims=True) # 最后一个维度\n",
257 | " std = np.std(x, axis=-1, keepdims=True)\n",
258 | " normalized_x = (x - mean) / (std + epsilon)\n",
259 | " return normalized_x\n",
260 | "\n",
261 | "```\n",
262 | "假设数据的长度为L\n",
263 | "包含平均值计算、标准差计算、偏移计算;\n",
264 | "* mean计算包含L加法和一次除法: L + 1\n",
265 | "* std计算,每个元素进行一个减法、一个乘法、一个加法。最后进行一个除法和一个乘法操作: 3*L + 2\n",
266 | "* 标准化:每个元素一次减法、一次除法操作: 2*L\n",
267 | "\n",
268 | "忽略单次运算,所以操作计算量:\n",
269 | "\n",
270 | "6 * batch_size * seq_len * hidden_size"
271 | ],
272 | "metadata": {
273 | "id": "wTw0eoK49BQb"
274 | }
275 | },
276 | {
277 | "cell_type": "code",
278 | "source": [
279 | "def calcu_layer_norm_flops(batch_size, seq_len, hidden_size):\n",
280 | " return 6 * batch_size * seq_len * hidden_size"
281 | ],
282 | "metadata": {
283 | "id": "m-FXTKEsUk-G"
284 | },
285 | "execution_count": 26,
286 | "outputs": []
287 | },
288 | {
289 | "cell_type": "markdown",
290 | "source": [
291 | "RMSNorm 常见的代码实现如下:\n",
292 | "\n",
293 | "```\n",
294 | "# 参考Llama定义\n",
295 | "class LlamaRMSNorm(nn.Module):\n",
296 | " def __init__(self, hidden_size, eps=1e-6):\n",
297 | " \"\"\"\n",
298 | " LlamaRMSNorm is equivalent to T5LayerNorm\n",
299 | " \"\"\"\n",
300 | " super().__init__()\n",
301 | " self.weight = nn.Parameter(torch.ones(hidden_size))\n",
302 | " self.variance_epsilon = eps\n",
303 | "\n",
304 | " def forward(self, hidden_states):\n",
305 | " input_dtype = hidden_states.dtype\n",
306 | " hidden_states = hidden_states.to(torch.float32)\n",
307 | " variance = hidden_states.pow(2).mean(-1, keepdim=True)\n",
308 | " hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)\n",
309 | " return self.weight * hidden_states.to(input_dtype)\n",
310 | "```\n",
311 | "\n",
312 | "主要计算内容:\n",
313 | "* 元素二次方、元素求平均(n-1)、一个rsqrt运算、一个求和运算\n",
314 | "* 两个乘法操作\n",
315 | "\n",
316 | "忽略单次运算,flops数等于:\n",
317 | "\n",
318 | " 4 * batch_size * seq_len * hidden_size\n"
319 | ],
320 | "metadata": {
321 | "id": "f7jPf4ankDIY"
322 | }
323 | },
324 | {
325 | "cell_type": "code",
326 | "source": [
327 | "def calcu_rmsnorm_flops(batch_size, seq_len, hidden_size):\n",
328 | " return 4 * batch_size * seq_len * hidden_size"
329 | ],
330 | "metadata": {
331 | "id": "oq9splUCpQ7E"
332 | },
333 | "execution_count": 27,
334 | "outputs": []
335 | },
336 | {
337 | "cell_type": "markdown",
338 | "source": [
339 | "## MLP/FFN层的计算\n",
340 | "\n",
341 | "MLP层的构建常见的方式如下:\n",
342 | "\n",
343 | "```\n",
344 | "class PositionwiseFeedForward(nn.Module):\n",
345 | " def __init__(self, d_model, dff=2048):\n",
346 | " super().__init__()\n",
347 | " self.linear1 = nn.Linear(d_model, dff)\n",
348 | " self.linear2 = nn.Linear(dff, d_model)\n",
349 | " self.relu = nn.ReLU()\n",
350 | "\n",
351 | " def forward(self, representations_batch):\n",
352 | " return self.linear2(self.relu(self.linear1(representations_batch)))\n",
353 | "```\n",
354 | "\n",
355 | "主要包含两个线性层操作和一个Relu计算。\n",
356 | "\n",
357 | "输入/输出: [batch_size, seq_len, hidden_size]\n",
358 | "dff值:ffn_hidden_size\n",
359 | "\n",
360 | "计算量为两次线性运算 + 一个relu操作,其flops操作数量如下:\n",
361 | "2 * batch_size * seq_len * hidden_size * ffn_hidden_size + batch_size * seq_len * ffn_hidden_size\n",
362 | "\n",
363 | "Llama的MLP有些改动,一般的计算包含三次线性运算(gate_proj、up_proj、down_proj, 参看hugging face的LlamaMLP定义)一个silu运算,一个元素乘法运算。\n",
364 | "\n",
365 | "[LlamaMLP定义](https://github.com/huggingface/transformers/blob/v4.48.0/src/transformers/models/llama/modeling_llama.py#L174):\n",
366 | "\n",
367 | "\n",
368 | "\n",
369 | "```\n",
370 | "# L174\n",
371 | "class LlamaMLP(nn.Module):\n",
372 | " def __init__(self, config):\n",
373 | " super().__init__()\n",
374 | " self.config = config\n",
375 | " self.hidden_size = config.hidden_size\n",
376 | " self.intermediate_size = config.intermediate_size\n",
377 | " self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)\n",
378 | " self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)\n",
379 | " self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)\n",
380 | " self.act_fn = ACT2FN[config.hidden_act]\n",
381 | " \n",
382 | " def forward(self, x):\n",
383 | " down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))\n",
384 | " return down_proj\n",
385 | "```\n",
386 | "\n",
387 | "对应的flops计算工作:\n",
388 | "\n",
389 | "3 * batch_size * seq_len * hidden_size * ffn_hidden_size + 2 * batch_size * seq_len * ffn_hidden_size\n",
390 | "\n"
391 | ],
392 | "metadata": {
393 | "id": "qGcCHN9FWAqQ"
394 | }
395 | },
396 | {
397 | "cell_type": "code",
398 | "source": [
399 | "def calcu_mlp_flops(batch_size, seq_len, hidden_size, ffn_hidden_size, use_gate=True):\n",
400 | " \"\"\"\n",
401 | " use_gate=True SwiGLU structure FFN.\n",
402 | " \"\"\"\n",
403 | " if use_gate:\n",
404 | " flops = 3 * 2 * batch_size * seq_len * hidden_size * ffn_hidden_size + 2 * batch_size * seq_len * ffn_hidden_size\n",
405 | " else:\n",
406 | " flops = 2 * 2 * batch_size * seq_len * hidden_size * ffn_hidden_size + batch_size * seq_len * ffn_hidden_size\n",
407 | " return flops"
408 | ],
409 | "metadata": {
410 | "id": "xuKwm6uRWZI5"
411 | },
412 | "execution_count": 28,
413 | "outputs": []
414 | },
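415 | {
416 | "cell_type": "markdown",
417 | "source": [
418 | "A hypothetical usage example (Llama-7B-like sizes, for illustration only) comparing the gated SwiGLU FFN with a classic 4x FFN:\n",
419 | "\n",
420 | "```\n",
421 | "swiglu_flops = calcu_mlp_flops(1, 4096, 4096, 11008, use_gate=True)    # three projections\n",
422 | "classic_flops = calcu_mlp_flops(1, 4096, 4096, 16384, use_gate=False)  # two projections, dff = 4 * hidden\n",
423 | "```"
424 | ],
425 | "metadata": {}
426 | },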
415 | {
416 | "cell_type": "markdown",
417 | "source": [
418 | "## Logits计算\n",
419 | "\n",
420 | "logits计算包含三个运算:\n",
421 | "\n",
422 | "* layernorm\n",
423 | "* linaer,(词表映射)\n",
424 | "* softmax\n",
425 | "\n",
426 | "对应尺寸\n",
427 | "* layernorm/rmsnorm: [batch_size, seq_len, hidden_size]\n",
428 | "* linear: input:[batch_size,seq_len*hidden_size] output: :[batch_size,seq_len*vocab_size]\n",
429 | "* softmax: [batch_size,seq_len*vocab_size]\n",
430 | "\n",
431 | "对应计算量:\n",
432 | "\n",
433 | "6 * batch_size * seq_len * hidden_size\n",
434 | "\n",
435 | "batch_size * seq_len * hidden_size * vocab_size\n",
436 | "\n",
437 | "3 * batch_size * seq_len * (vocab_size - 1)"
438 | ],
439 | "metadata": {
440 | "id": "t6DlpZmofGTU"
441 | }
442 | },
443 | {
444 | "cell_type": "code",
445 | "source": [
446 | "def calcu_logits_flops(batch_size, seq_len, heads, d_model, hidden_size, vocab_size, RMSNorm=True):\n",
447 | " norm_flops = calcu_rmsnorm_flops(batch_size, seq_len, hidden_size) if RMSNorm else \\\n",
448 | " calcu_layer_norm_flops(batch_size, seq_len, hidden_size)\n",
449 | "\n",
450 | " linear_flops = 2 * batch_size * seq_len * hidden_size * vocab_size\n",
451 | "\n",
452 | " softmax_flos = 3 * batch_size * seq_len * (vocab_size - 1)\n",
453 | " return norm_flops + linear_flops + softmax_flos"
454 | ],
455 | "metadata": {
456 | "id": "k5JJMN9RMi3e"
457 | },
458 | "execution_count": 29,
459 | "outputs": []
460 | },
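461 | {
462 | "cell_type": "markdown",
463 | "source": [
464 | "With a large vocabulary the vocabulary projection dominates this term. A hypothetical example (sizes for illustration only):\n",
465 | "\n",
466 | "```\n",
467 | "calcu_logits_flops(1, 4096, 8, 128, hidden_size=4096, vocab_size=32768)\n",
468 | "# linear term: 2 * 4096 * 4096 * 32768 ~ 1.1e12 FLOPs,\n",
469 | "# vs ~6.7e7 for the RMSNorm term and ~4e8 for the softmax term\n",
470 | "```"
471 | ],
472 | "metadata": {}
473 | },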
461 | {
462 | "cell_type": "markdown",
463 | "source": [
464 | "## 位置编码计算\n",
465 | "\n",
466 | "Transformer采用的位置编码PE:包含正弦/余弦运算,对每个位置进行d_model/2正弦,d_model/2余弦,计算量为:\n",
467 | "\n",
468 | "seq_len * d_model\n",
469 | "\n",
470 | "注:如果进行了多头切分, d_model = d_model * heads\n",
471 | "\n",
472 | "如果采用旋转位置编码RoPE:\n",
473 | "\n",
474 | "* 旋转角度计算:d_model\n",
475 | "* 每个位置计算构造旋转矩阵:seq_len * d_model\n",
476 | "* Q,K与旋转矩阵乘法:4 * batch_size * seq_len * d_model"
477 | ],
478 | "metadata": {
479 | "id": "IHGpT-_Mtee2"
480 | }
481 | },
482 | {
483 | "cell_type": "code",
484 | "source": [
485 | "def calcu_position_encoding(batch_size, seq_len, heads, d_model, pe_type=\"rope\"):\n",
486 | " if pe_type == \"rope\":\n",
487 | " return 4 * batch_size * seq_len * d_model * heads\n",
488 | " else:\n",
489 | " return seq_len * d_model * heads"
490 | ],
491 | "metadata": {
492 | "id": "T_BV5Tuu6dHQ"
493 | },
494 | "execution_count": 30,
495 | "outputs": []
496 | },
497 | {
498 | "cell_type": "markdown",
499 | "source": [
500 | "## Router计算\n",
501 | "\n",
502 | "router计算主要是在MoE中应用,其计算一般包括:\n",
503 | "\n",
504 | "```\n",
505 | "self.gate = nn.Linear(config.hidden_size, config.num_experts, bias=False)\n",
506 | "hidden_states = hidden_states.view(-1, hidden_dim)\n",
507 | "# router_logits: (batch * sequence_length, n_experts)\n",
508 | "router_logits = self.gate(hidden_states)\n",
509 | "\n",
510 | "routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)\n",
511 | "routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)\n",
512 | "```\n",
513 | "主要是一个gate线性层计算:\n",
514 | "\n",
515 | "flops = 2 * batch_size * seq_len * hidden_size * num_experts\n"
516 | ],
517 | "metadata": {
518 | "id": "SKnpS9ZhJPhK"
519 | }
520 | },
521 | {
522 | "cell_type": "code",
523 | "source": [
524 | "def calcu_router_flops(batch_size, seq_len, hidden_size, experts):\n",
525 | " return 2 * batch_size * seq_len * hidden_size * experts"
526 | ],
527 | "metadata": {
528 | "id": "4xvLDG7Uk0L9"
529 | },
530 | "execution_count": 31,
531 | "outputs": []
532 | },
533 | {
534 | "cell_type": "markdown",
535 | "source": [
536 | "# 典型训练模型的flops计算\n",
537 | "\n",
538 | "训练估算约定:\n",
539 | "\n",
540 | "1、模型backward计算是forward计算大约两倍, 因为需要计算输入 + 权重的梯度 [参考](https://arxiv.org/pdf/2205.05198)。\n",
541 | "\n",
542 | "2、输入的Embedding层主要完成映射计算, 输入维度[batch_size, seq_len] 输出维度: (bs, seq_len, d_model),其flops计算量可以忽略。 其权重用于LM-head计算时对应的计算量在logits中考虑。\n",
543 | "\n",
544 | "\n",
545 | "\n",
546 | "3、位置编码的计算量相对较小,给与忽略。\n",
547 | "\n",
548 | "\n",
549 | "## GPT结构flops计算\n",
550 | "\n",
551 | "模型涉及计算的主要结构\n",
552 | "\n",
553 | "decoder_layer x N + logtis\n",
554 | "\n",
555 | "其中L是层数,decoder构成:\n",
556 | "\n",
557 | "MHA + FFN + 2 LayerNorm\n",
558 | "\n",
559 | "\n",
560 | "\n"
561 | ],
562 | "metadata": {
563 | "id": "8fQ7-5fyNC-x"
564 | }
565 | },
566 | {
567 | "cell_type": "code",
568 | "source": [
569 | "def caclu_gpt_flops(batch_size, seq_len, heads, d_model, hidden_size, vocab_size, ffn_hidden_size, layer_nums):\n",
570 | " attention_flops = calcu_attention_flops(batch_size, seq_len, heads, d_model, num_query_groups=0, context_parallel=1)\n",
571 | " ffn_flops = calcu_mlp_flops(batch_size, seq_len, hidden_size, ffn_hidden_size, use_gate=False)\n",
572 | " layer_norm_flops = calcu_layer_norm_flops(batch_size, seq_len, hidden_size)\n",
573 | " logits_flops = calcu_logits_flops(batch_size, seq_len, heads, d_model, hidden_size, vocab_size, False)\n",
574 | " return 3 * (logits_flops + (layer_norm_flops * 2 + attention_flops + ffn_flops) * layer_nums)"
575 | ],
576 | "metadata": {
577 | "id": "w71MVb2hHifU"
578 | },
579 | "execution_count": 32,
580 | "outputs": []
581 | },
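582 | {
583 | "cell_type": "markdown",
584 | "source": [
585 | "As a rough cross-check (hypothetical GPT-like configuration, chosen only for illustration), the detailed count can be compared against the common \"training FLOPs ~ 6 * parameters * tokens\" rule of thumb, which counts only the weight matmuls and therefore omits the seq_len**2 attention score/value matmuls and the norm/softmax terms:\n",
586 | "\n",
587 | "```\n",
588 | "hidden, layers, heads, d_model = 4096, 32, 32, 128\n",
589 | "vocab, ffn = 50257, 4 * 4096\n",
590 | "batch, seq = 8, 4096\n",
591 | "\n",
592 | "detailed = caclu_gpt_flops(batch, seq, heads, d_model, hidden, vocab, ffn, layers)\n",
593 | "\n",
594 | "# parameter count ignoring biases and norm weights; the embedding is tied to the LM head\n",
595 | "params = layers * (4 * hidden * hidden + 2 * hidden * ffn) + hidden * vocab\n",
596 | "approx = 6 * params * batch * seq\n",
597 | "print(detailed / approx)\n",
598 | "```"
599 | ],
600 | "metadata": {}
601 | },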
582 | {
583 | "cell_type": "markdown",
584 | "source": [
585 | "## LLAMA结构flops计算\n",
586 | "\n",
587 | "结构:\n",
588 | "\n",
589 | "(GMQA + FFN + RMSNorm) x L + logtis\n",
590 | "\n",
591 | "其中GMQA 是group attention, FFN: Feed ForwardSwiGLU结构。\n",
592 | "\n",
593 | ""
594 | ],
595 | "metadata": {
596 | "id": "BkKN5SASMSqT"
597 | }
598 | },
599 | {
600 | "cell_type": "code",
601 | "source": [
602 | "def caclu_llama_flops(batch_size, seq_len, heads, d_model, hidden_size, vocab_size, ffn_hidden_size, layer_nums, num_query_groups):\n",
603 | " attention_flops = calcu_attention_flops(batch_size, seq_len, heads, d_model, num_query_groups=num_query_groups, context_parallel=1)\n",
604 | " ffn_flops = calcu_mlp_flops(batch_size, seq_len, hidden_size, ffn_hidden_size, use_gate=True)\n",
605 | " layer_norm_flops = calcu_layer_norm_flops(batch_size, seq_len, hidden_size)\n",
606 | " logits_flops = calcu_logits_flops(batch_size, seq_len, heads, d_model, hidden_size, vocab_size)\n",
607 | " return 3 * (logits_flops + (layer_norm_flops * 2 + attention_flops + ffn_flops) * layer_nums)"
608 | ],
609 | "metadata": {
610 | "id": "ssOaoymHN6Ug"
611 | },
612 | "execution_count": 33,
613 | "outputs": []
614 | },
615 | {
616 | "cell_type": "markdown",
617 | "source": [
618 | "## MoE模型flops计算\n",
619 | "\n",
620 | "在llama结构基础上ffn增加topk专家数量系数,计算公式:\n",
621 | "\n",
622 | "(GMQA + FFN * Experts_topk + Router + RMSNorm) x L + logtis\n",
623 | "\n"
624 | ],
625 | "metadata": {
626 | "id": "8jvl60XLLo9E"
627 | }
628 | },
629 | {
630 | "cell_type": "code",
631 | "source": [
632 | "def caclu_moe_flops(batch_size, seq_len, heads, d_model, hidden_size, vocab_size, ffn_hidden_size, layer_nums, num_query_groups, topk, experts):\n",
633 | " attention_flops = calcu_attention_flops(batch_size, seq_len, heads, d_model, num_query_groups=num_query_groups, context_parallel=1)\n",
634 | " ffn_flops = calcu_mlp_flops(batch_size, seq_len, hidden_size, ffn_hidden_size, use_gate=True)\n",
635 | " layer_norm_flops = calcu_layer_norm_flops(batch_size, seq_len, hidden_size)\n",
636 | " logits_flops = calcu_logits_flops(batch_size, seq_len, heads, d_model, hidden_size, vocab_size)\n",
637 | " router_flops = calcu_router_flops(batch_size, seq_len, hidden_size, experts)\n",
638 | " return 3 * (logits_flops + (layer_norm_flops * 2 + attention_flops + ffn_flops * topk + router_flops) * layer_nums)"
639 | ],
640 | "metadata": {
641 | "id": "Xis5osj3PURF"
642 | },
643 | "execution_count": 34,
644 | "outputs": []
645 | },
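646 | {
647 | "cell_type": "markdown",
648 | "source": [
649 | "Note that only the experts actually activated per token enter the FLOPs count, not the full expert pool. A hypothetical comparison (sizes for illustration only) between a dense FFN and the per-layer MoE FFN term:\n",
650 | "\n",
651 | "```\n",
652 | "dense_ffn = calcu_mlp_flops(1, 4096, 4096, 11008, use_gate=True)\n",
653 | "moe_ffn = 8 * calcu_mlp_flops(1, 4096, 4096, 1408, use_gate=True)  # topk=8 routed experts, smaller per-expert FFN\n",
654 | "```"
655 | ],
656 | "metadata": {}
657 | },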
646 | {
647 | "cell_type": "markdown",
648 | "source": [
649 | "MoE如果包括了共享专家(shared experts),上述计算公式中将topk的数量设置为:\n",
650 | "\n",
651 | "topk + shared_experts_nums\n",
652 | "\n"
653 | ],
654 | "metadata": {
655 | "id": "oX3KT77g0bOT"
656 | }
657 | },
658 | {
659 | "cell_type": "code",
660 | "source": [
661 | "def caclu_moe_deepseek_flops(batch_size, seq_len, heads, d_model, hidden_size, vocab_size, ffn_hidden_size, layer_nums, q_lora_rank, kv_lora_rank, topk, shared, experts):\n",
662 | " attention_flops = calcu_mla_flops(batch_size, seq_len, heads, d_model, q_lora_rank, kv_lora_rank, context_parallel=1)\n",
663 | " ffn_flops = calcu_mlp_flops(batch_size, seq_len, hidden_size, ffn_hidden_size, use_gate=True)\n",
664 | " layer_norm_flops = calcu_layer_norm_flops(batch_size, seq_len, hidden_size)\n",
665 | " logits_flops = calcu_logits_flops(batch_size, seq_len, heads, d_model, hidden_size, vocab_size)\n",
666 | " router_flops = calcu_router_flops(batch_size, seq_len, hidden_size, experts)\n",
667 | " return 3 * (logits_flops + (layer_norm_flops * 2 + attention_flops + ffn_flops * (topk + shared) + router_flops) * layer_nums)\n"
668 | ],
669 | "metadata": {
670 | "id": "A4U_VBfOhdd4"
671 | },
672 | "execution_count": 35,
673 | "outputs": []
674 | },
675 | {
676 | "cell_type": "markdown",
677 | "source": [
678 | "# MFU计算\n",
679 | "\n",
680 | "MFU(Model Flops Utilization)计算的公式为:\n",
681 | "\n",
682 | "MFU = 单位时间实际flops/单位时间名义flops\n",
683 | "\n",
684 | "单位时间实际flops = 单步模型计算flops总数/单步迭代时间\n",
685 | "\n",
686 | "MFU = model_flops_sum / iter_time * (device_peak_flops * device_num)\n",
687 | "\n",
688 | "通常device_peak_flops的单位为: TFlops/s"
689 | ],
690 | "metadata": {
691 | "id": "HdfKN3xcPtR3"
692 | }
693 | },
694 | {
695 | "cell_type": "code",
696 | "source": [
697 | "def calcu_moe_mfu(iter_time, batch_size, seq_len, heads, d_model, hidden_size, vocab_size, ffn_hidden_size, layer_nums, num_query_groups, topk, experts, device_nums, device_peak_flops):\n",
698 | " model_flops = caclu_moe_flops(batch_size, seq_len, heads, d_model, hidden_size, vocab_size, ffn_hidden_size, layer_nums, num_query_groups, topk, experts)\n",
699 | " return model_flops / (iter_time * device_peak_flops * device_nums * 10 ** 12)\n"
700 | ],
701 | "metadata": {
702 | "id": "9MSWETZ2RUmS"
703 | },
704 | "execution_count": 36,
705 | "outputs": []
706 | },
707 | {
708 | "cell_type": "code",
709 | "source": [
710 | "def calcu_moe_deepseek_mfu(iter_time, batch_size, seq_len, heads, d_model, hidden_size, vocab_size, ffn_hidden_size, layer_nums, q_lora_rank, kv_lora_rank, topk, shared, experts, device_nums, device_peak_flops):\n",
711 | " model_flops = caclu_moe_deepseek_flops(batch_size, seq_len, heads, d_model, hidden_size, vocab_size, ffn_hidden_size, layer_nums, q_lora_rank, kv_lora_rank, topk, shared, experts)\n",
712 | " return model_flops / (iter_time * device_peak_flops * device_nums * 10 ** 12)"
713 | ],
714 | "metadata": {
715 | "id": "sYFvcEyLNNan"
716 | },
717 | "execution_count": 37,
718 | "outputs": []
719 | },
720 | {
721 | "cell_type": "markdown",
722 | "source": [
723 | "测试:"
724 | ],
725 | "metadata": {
726 | "id": "iCFd5ZZyStTX"
727 | }
728 | },
729 | {
730 | "cell_type": "code",
731 | "source": [
732 | "calcu_moe_mfu(iter_time=1.5,\n",
733 | " batch_size=1024, seq_len=4096, heads=8, d_model=128,\n",
734 | " hidden_size=1024, vocab_size=32768, ffn_hidden_size=2048,\n",
735 | " layer_nums=100, num_query_groups=4, topk=9, experts=100,\n",
736 | " device_nums=1024, device_peak_flops=280)"
737 | ],
738 | "metadata": {
739 | "colab": {
740 | "base_uri": "https://localhost:8080/"
741 | },
742 | "id": "8hGazgz3Sx8A",
743 | "outputId": "a21f9a75-62a1-4027-fa05-6740bef09f4e"
744 | },
745 | "execution_count": 38,
746 | "outputs": [
747 | {
748 | "output_type": "execute_result",
749 | "data": {
750 | "text/plain": [
751 | "0.4019112012361143"
752 | ]
753 | },
754 | "metadata": {},
755 | "execution_count": 38
756 | }
757 | ]
758 | },
759 | {
760 | "cell_type": "code",
761 | "source": [
762 | "calcu_moe_deepseek_mfu(iter_time=1.5,\n",
763 | " batch_size=1024, seq_len=4096, heads=8, d_model=128,\n",
764 | " hidden_size=1024, vocab_size=32768, ffn_hidden_size=2048,\n",
765 | " layer_nums=100, q_lora_rank=128, kv_lora_rank=256, topk=8, shared=1, experts=100,\n",
766 | " device_nums=1024, device_peak_flops=280)"
767 | ],
768 | "metadata": {
769 | "id": "AuhCliC4NMPJ",
770 | "outputId": "a9451bcb-e357-4fb6-cee5-b2df30c4a95f",
771 | "colab": {
772 | "base_uri": "https://localhost:8080/"
773 | }
774 | },
775 | "execution_count": 39,
776 | "outputs": [
777 | {
778 | "output_type": "execute_result",
779 | "data": {
780 | "text/plain": [
781 | "0.3957755336704"
782 | ]
783 | },
784 | "metadata": {},
785 | "execution_count": 39
786 | }
787 | ]
788 | },
789 | {
790 | "cell_type": "markdown",
791 | "source": [
792 | "\n",
793 | "# 参考内容:\n",
794 | "\n",
795 | "https://zhuanlan.zhihu.com/p/691126108\n",
796 | "\n",
797 | "https://github.com/naklecha/llama3-from-scratch/blob/main/README.md\n",
798 | "\n",
799 | "https://arxiv.org/pdf/2205.05198\n",
800 | "\n",
801 | "https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/te_llama/tutorial_accelerate_hf_llama_with_te.html\n",
802 | "\n",
803 | "https://github.com/huggingface/transformers/blob/v4.48.0/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py#L603\n",
804 | "\n",
805 | "https://bbycroft.net/llm"
806 | ],
807 | "metadata": {
808 | "id": "DFnkp-9VWZcF"
809 | }
810 | }
811 | ]
812 | }
--------------------------------------------------------------------------------