├── VERSION ├── llmexport ├── utils │ ├── __init__.py │ ├── spinner.py │ ├── onnx.py │ ├── onnx_rebuilder.py │ ├── custom_op.py │ ├── torch_utils.py │ ├── lora.py │ ├── mnn_utils.py │ ├── gptq.py │ ├── hqq_quantizer.py │ ├── talker.py │ ├── mtp.py │ ├── eagle.py │ ├── audio.py │ ├── smooth_quantizer.py │ ├── token2wav.py │ └── mnn_converter.py ├── __init__.py ├── gguf │ └── gguf_reader.py └── gguf2mnn.py ├── MANIFEST.in ├── .github └── workflows │ └── release.yml ├── setup.py ├── README.md ├── README_en.md └── LICENSE /VERSION: -------------------------------------------------------------------------------- 1 | 0.0.4 2 | -------------------------------------------------------------------------------- /llmexport/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include VERSION 2 | recursive-exclude tests * 3 | -------------------------------------------------------------------------------- /llmexport/__init__.py: -------------------------------------------------------------------------------- 1 | __vesion__ = '0.0.1' 2 | 3 | from .llmexport import export 4 | from .llmexport import main 5 | from .version import __version__ -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: release 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | permissions: 8 | contents: read 9 | 10 | jobs: 11 | pypi-release: 12 | name: upload release to pypi 13 | runs-on: ubuntu-latest 14 | environment: release 15 | permissions: 16 | id-token: write 17 | 18 | steps: 19 | - uses: actions/checkout@v3 20 | - name: Set up Python 21 | uses: actions/setup-python@v3 22 | with: 23 | python-version: "3.x" 24 | 25 | - name: install 26 | run: | 27 | python -m pip install --upgrade pip wheel setuptools 28 | pip install build 29 | 30 | - name: build 31 | run: python -m build 32 | 33 | - name: upload 34 | uses: pypa/gh-action-pypi-publish@release/v1 -------------------------------------------------------------------------------- /llmexport/utils/spinner.py: -------------------------------------------------------------------------------- 1 | import time 2 | import functools 3 | import traceback 4 | from yaspin import yaspin 5 | 6 | RESET = "\033[0m" 7 | GREEN = "\033[32;1m" 8 | YELLOW = "\033[33;4m" 9 | 10 | def spinner_run(text='Processing...', hide=False): 11 | def decorator(func): 12 | @functools.wraps(func) 13 | def wrapper(*args, **kwargs): 14 | with yaspin(text=text, color="cyan") as spinner: 15 | start = time.time() 16 | try: 17 | if hide: spinner.hide() 18 | result = func(*args, **kwargs) 19 | if hide: spinner.show() 20 | except Exception as e: 21 | spinner.fail("💥 Failed") 22 | traceback.print_exc() 23 | exit(1) 24 | end = time.time() 25 | during = f'[{end-start:05.2f} s]'.replace('[0', '[ ') 26 | padding = ' ' * (64 - len(spinner.text) - len(result)) 27 | spinner.text = f'{spinner.text}{YELLOW}{result}{RESET}{padding}{GREEN}{during}{RESET}' 28 | spinner.ok("✅ Done") 29 | return result 30 | return wrapper 31 | return decorator -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, 
setup 2 | 3 | with open("VERSION", "r") as f: 4 | version = f.read().strip() 5 | 6 | with open("llmexport/version.py", "w") as f: 7 | f.write(f'__version__ = "{version}"\n') 8 | 9 | setup( 10 | name="llmexport", 11 | version=version, 12 | description="llmexport: A toolkit to export llm to onnx or mnn.", 13 | long_description=open("README.md", "r", encoding="utf-8").read(), 14 | long_description_content_type="text/markdown", 15 | url="https://github.com/wangzhaode/llm-export", 16 | author="wangzhaode", 17 | author_email="hi@zhaode.wang", 18 | project_urls={ 19 | "Bug Tracker": "https://github.com/wangzhaode/llm-export/issues", 20 | }, 21 | classifiers=[ 22 | "Programming Language :: Python :: 3", 23 | "License :: OSI Approved :: MIT License", 24 | "Intended Audience :: Developers", 25 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 26 | ], 27 | license="MIT", 28 | install_requires=["yaspin", "torch", "numpy", "transformers", "sentencepiece", "onnx", "onnxslim", "onnxruntime", "MNN"], 29 | packages=find_packages(exclude=("tests", "tests.*")), 30 | entry_points={"console_scripts": ["llmexport=llmexport:main"]}, 31 | zip_safe=True, 32 | python_requires=">=3.6", 33 | ) -------------------------------------------------------------------------------- /llmexport/utils/onnx.py: -------------------------------------------------------------------------------- 1 | import os 2 | import onnx 3 | 4 | class OnnxRebuilder: 5 | def __init__(self, onnx_path, weight_ops): 6 | self.weight_ops = weight_ops 7 | self.onnx_model = onnx.load(onnx_path) 8 | self.dst_path = onnx_path 9 | self.onnx_weight_path = f'{onnx_path}.data' 10 | self.onnx_weight_offset = 0 11 | 12 | def make_external(self, name, data, shape): 13 | # write to external weight 14 | length = self.onnx_weight.write(data.tobytes()) 15 | location = os.path.basename(self.onnx_weight_path) 16 | offset = self.onnx_weight_offset 17 | self.onnx_weight_offset += length 18 | tensor = onnx.TensorProto() 19 | tensor.name = name 20 | tensor.data_type = onnx.TensorProto.FLOAT 21 | tensor.dims.extend(shape) 22 | # external info 23 | tensor.data_location = onnx.TensorProto.EXTERNAL 24 | for k, v in { "location": location, "offset": offset, "length": length }.items(): 25 | entry = tensor.external_data.add() 26 | entry.key = k 27 | entry.value = str(v) 28 | self.onnx_model.graph.initializer.append(tensor) 29 | 30 | def build_weight(self, name, has_bias, ic, oc): 31 | assert(name in self.weight_ops) 32 | linear = self.weight_ops[name] 33 | assert(linear.in_features == ic and 34 | linear.out_features == oc and 35 | (linear.bias is not None) == has_bias) 36 | weight_name, bias_name = f'{name}_weight', f'{name}_bias' 37 | weight = linear.weight.data.transpose(1, 0).flatten().float().numpy() 38 | self.make_external(weight_name, weight, [ic, oc]) 39 | if has_bias: 40 | bias = linear.bias.data.flatten().float().numpy() 41 | self.make_external(bias_name, bias, [oc]) 42 | return weight_name, bias_name 43 | 44 | def rebuild(self): 45 | from onnx import helper 46 | new_nodes = [] 47 | self.onnx_weight = open(self.onnx_weight_path, 'wb') 48 | for node in self.onnx_model.graph.node: 49 | if node.op_type == 'FakeLinear': 50 | attributes = {a.name: a for a in node.attribute} 51 | name = attributes.get('name').s.decode('utf-8') 52 | has_bias = attributes.get('has_bias').i 53 | ic = attributes.get('in_features').i 54 | oc = attributes.get('out_features').i 55 | weight, bias = self.build_weight(name, has_bias, ic, oc) 56 | if has_bias: 57 | # fakelinear -> matmul + 
add 58 | middle_tensor = f'{name}_matmul' 59 | new_nodes.append(helper.make_node('MatMul', [node.input[0], weight], [middle_tensor], name)) 60 | new_nodes.append(helper.make_node('Add', [middle_tensor, bias], node.output, f'{name}/Add')) 61 | else: 62 | # fakelinear -> matmul 63 | new_nodes.append(helper.make_node('MatMul', [node.input[0], weight], node.output, name)) 64 | else: 65 | new_nodes.append(node) 66 | self.onnx_weight.close() 67 | del self.onnx_model.graph.node[:] 68 | self.onnx_model.graph.node.extend(new_nodes) 69 | onnx.save(self.onnx_model, self.dst_path) 70 | return self.onnx_weight_path -------------------------------------------------------------------------------- /llmexport/utils/onnx_rebuilder.py: -------------------------------------------------------------------------------- 1 | import os 2 | import onnx 3 | 4 | class OnnxRebuilder: 5 | def __init__(self, onnx_path, weight_ops): 6 | self.weight_ops = weight_ops 7 | self.onnx_model = onnx.load(onnx_path) 8 | self.dst_path = onnx_path 9 | self.onnx_weight_path = f'{onnx_path}.data' 10 | self.onnx_weight_offset = 0 11 | 12 | def make_external(self, name, data, shape): 13 | # write to external weight 14 | length = self.onnx_weight.write(data.tobytes()) 15 | location = os.path.basename(self.onnx_weight_path) 16 | offset = self.onnx_weight_offset 17 | self.onnx_weight_offset += length 18 | tensor = onnx.TensorProto() 19 | tensor.name = name 20 | tensor.data_type = onnx.TensorProto.FLOAT 21 | tensor.dims.extend(shape) 22 | # external info 23 | tensor.data_location = onnx.TensorProto.EXTERNAL 24 | for k, v in { "location": location, "offset": offset, "length": length }.items(): 25 | entry = tensor.external_data.add() 26 | entry.key = k 27 | entry.value = str(v) 28 | self.onnx_model.graph.initializer.append(tensor) 29 | 30 | def build_weight(self, name, has_bias, ic, oc): 31 | assert(name in self.weight_ops) 32 | linear = self.weight_ops[name] 33 | assert(linear.in_features == ic and 34 | linear.out_features == oc and 35 | (linear.bias is not None) == has_bias) 36 | weight_name, bias_name = f'{name}_weight', f'{name}_bias' 37 | weight = linear.weight.data.transpose(1, 0).flatten().float().numpy() 38 | self.make_external(weight_name, weight, [ic, oc]) 39 | if has_bias: 40 | bias = linear.bias.data.flatten().float().numpy() 41 | self.make_external(bias_name, bias, [oc]) 42 | return weight_name, bias_name 43 | 44 | def rebuild(self): 45 | from onnx import helper 46 | new_nodes = [] 47 | self.onnx_weight = open(self.onnx_weight_path, 'wb') 48 | for node in self.onnx_model.graph.node: 49 | if node.op_type == 'FakeLinear': 50 | attributes = {a.name: a for a in node.attribute} 51 | name = attributes.get('name').s.decode('utf-8') 52 | has_bias = attributes.get('has_bias').i 53 | ic = attributes.get('in_features').i 54 | oc = attributes.get('out_features').i 55 | weight, bias = self.build_weight(name, has_bias, ic, oc) 56 | if has_bias: 57 | # fakelinear -> matmul + add 58 | middle_tensor = f'{name}_matmul' 59 | new_nodes.append(helper.make_node('MatMul', [node.input[0], weight], [middle_tensor], name)) 60 | new_nodes.append(helper.make_node('Add', [middle_tensor, bias], node.output, f'{name}/Add')) 61 | else: 62 | # fakelinear -> matmul 63 | new_nodes.append(helper.make_node('MatMul', [node.input[0], weight], node.output, name)) 64 | else: 65 | new_nodes.append(node) 66 | self.onnx_weight.close() 67 | del self.onnx_model.graph.node[:] 68 | self.onnx_model.graph.node.extend(new_nodes) 69 | onnx.save(self.onnx_model, self.dst_path) 70 | 
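        # Every FakeLinear placeholder has now been replaced by a plain MatMul
        # (plus Add when a bias exists), and the real weights were written above as
        # EXTERNAL-data initializers into the sidecar file at self.onnx_weight_path.
        # Typical driving code (a usage sketch, not part of this file) is roughly:
        #   rebuilder = OnnxRebuilder('model/llm.onnx', weight_ops)  # weight_ops: op name -> torch.nn.Linear
        #   data_file = rebuilder.rebuild()  # rewrites llm.onnx in place, returns 'model/llm.onnx.data'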
return self.onnx_weight_path -------------------------------------------------------------------------------- /llmexport/utils/custom_op.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class FakeLinearOp(torch.autograd.Function): 4 | @staticmethod 5 | def symbolic(g, input, in_features, out_features, has_bias, name): 6 | # These become the operator attributes. 7 | kwargs = { 8 | "in_features_i": in_features, 9 | "out_features_i": out_features, 10 | "has_bias_i": has_bias, 11 | "name_s": name 12 | } 13 | from torch.onnx.symbolic_helper import _get_tensor_sizes 14 | out_sizes = _get_tensor_sizes(input)[:-1] + [out_features] 15 | output_type = input.type().with_sizes(out_sizes) 16 | return g.op("LlmExporter::FakeLinear", input, **kwargs).setType(output_type) 17 | 18 | @staticmethod 19 | def forward(ctx, input, in_features, out_features, has_bias, name): 20 | out_shape = list(input.shape)[:-1] + [out_features] 21 | return input.new_zeros(out_shape) 22 | 23 | class FakeLinear(torch.nn.Module): 24 | def __init__(self, in_features, out_features, has_bias, name): 25 | super(FakeLinear, self).__init__() 26 | self.in_features = in_features 27 | self.out_features = out_features 28 | self.has_bias = has_bias 29 | self.name = name 30 | 31 | def forward(self, x): 32 | return FakeLinearOp.apply(x, self.in_features, self.out_features, self.has_bias, self.name) 33 | 34 | class FusedAttentionOp(torch.autograd.Function): 35 | @staticmethod 36 | def symbolic(g, query, key, value, attention_mask, hidden_size, name): 37 | # These become the operator attributes. 38 | kwargs = { 39 | "hidden_size_i": hidden_size, 40 | "name_s": name 41 | } 42 | from torch.onnx.symbolic_helper import _get_tensor_sizes 43 | out_sizes = _get_tensor_sizes(query) 44 | output_type = query.type().with_sizes(out_sizes) 45 | return g.op("LlmExporter::FusedAttention", query, key, value, attention_mask, **kwargs).setType(output_type) 46 | 47 | @staticmethod 48 | def forward(ctx, query, key, value, attention_mask, hidden_size, name): 49 | out_shape = list(query.shape)[:2] + [hidden_size] 50 | return query.new_zeros(out_shape) 51 | 52 | class FusedAttention(torch.nn.Module): 53 | def __init__(self, hidden_size, name): 54 | super(FusedAttention, self).__init__() 55 | self.hidden_size = hidden_size 56 | self.name = name 57 | 58 | def forward(self, query, key, value, attention_mask): 59 | return FusedAttentionOp.apply(query, key, value, attention_mask, self.hidden_size, self.name) 60 | 61 | class MoEOp(torch.autograd.Function): 62 | @staticmethod 63 | def symbolic(g, hidden_states, routing_weights, selected_experts, num_experts, top_k, layer_id): 64 | kwargs = { 65 | "num_experts_i": num_experts, 66 | "top_k_i": top_k, 67 | "layer_id_i": layer_id 68 | } 69 | from torch.onnx.symbolic_helper import _get_tensor_sizes 70 | out_sizes = _get_tensor_sizes(hidden_states) 71 | output_type = hidden_states.type().with_sizes(out_sizes) 72 | return g.op("LlmExporter::MoE", hidden_states, routing_weights, selected_experts, **kwargs).setType(output_type) 73 | 74 | @staticmethod 75 | def forward(ctx, hidden_states, routing_weights, selected_experts, num_experts, top_k, layer_id): 76 | return hidden_states 77 | 78 | class MoE(torch.nn.Module): 79 | def __init__(self, num_experts, top_k, layer_id): 80 | super(MoE, self).__init__() 81 | self.num_experts = num_experts 82 | self.top_k = top_k 83 | self.layer_id = layer_id 84 | 85 | def forward(self, hidden_states, routing_weights, selected_experts): 86 | 
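        # MoEOp only stamps a single LlmExporter::MoE node into the exported graph:
        # routing_weights and selected_experts (the per-token top_k expert indices)
        # become node inputs, num_experts/top_k/layer_id become node attributes, and
        # the hidden states pass through unchanged as a shape-preserving placeholder
        # for the expert computation that the runtime performs.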
return MoEOp.apply(hidden_states, routing_weights, selected_experts, self.num_experts, self.top_k, self.layer_id) -------------------------------------------------------------------------------- /llmexport/utils/torch_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .hqq_quantizer import HQQQuantizer 3 | 4 | def repack_low_bits(x, iNeedBits, block_size): 5 | v = [] 6 | device = x.device 7 | block_number = x.shape[0] 8 | count = block_size * iNeedBits // 8 9 | for i in range(0, count): 10 | v.append(torch.zeros([block_number, 1], dtype=torch.uint8, device=device)) 11 | iOffset = 0 12 | cMask = (1 << iNeedBits) - 1 13 | index = 0 14 | for i in range(0, block_size): 15 | p0 = x[:, i:i+1] 16 | uShift = 8 - iNeedBits - (iOffset % 8) 17 | if uShift < 0: 18 | v[index+iOffset // 8] |= ((p0 & cMask) >> (0 - uShift)) 19 | v[index+(iOffset // 8) + 1] |= ((p0 & cMask) << (8 + uShift)) 20 | else: 21 | v[index+iOffset // 8] |= ((p0 & cMask) << uShift) 22 | iOffset += iNeedBits 23 | if iOffset % 8 == 0: 24 | index += iOffset // 8 25 | iOffset = 0 26 | return torch.cat(v, axis=1) 27 | 28 | def quant(weight, quant_bit, quant_block, symmetric, awq, hqq): 29 | 30 | try: 31 | if torch.cuda.is_available(): 32 | weight = weight.cuda() 33 | if torch.backends.mps.is_available(): 34 | weight = weight.to('mps') 35 | except: 36 | print('Failed to move weight to GPU, fallback to CPU') 37 | 38 | oc, ic = weight.shape 39 | if quant_block == 0: 40 | block_size = ic 41 | else: 42 | block_size = quant_block 43 | while ic % block_size != 0: 44 | block_size /= 2 45 | block_size = int(block_size) 46 | block_num = ic // block_size 47 | 48 | offset = 1 << (quant_bit - 1) 49 | clip_max = offset - 1 50 | 51 | if hqq: 52 | hqq_quantizer = HQQQuantizer(weight, quant_bit, block_size, symmetric, weight.dtype, weight.device) 53 | hqq_quantizer.quant() 54 | if not symmetric: 55 | q_weight = hqq_quantizer.W_q.flatten().to(torch.uint8) 56 | scale = hqq_quantizer.meta['scale'].flatten() 57 | zeros = scale * offset - scale * hqq_quantizer.meta['zero'].flatten() 58 | 59 | alpha = torch.stack([zeros.flatten(), scale.flatten()], axis=-1).flatten() 60 | else: 61 | q_weight = (hqq_quantizer.W_q.flatten() + offset).to(torch.uint8) 62 | scale = hqq_quantizer.meta['scale'].flatten() 63 | alpha = scale.flatten() 64 | else: 65 | weight = weight.reshape(oc, block_num, block_size) 66 | if symmetric: 67 | clip_min = -clip_max 68 | abs_max, _ = torch.max(torch.abs(weight), axis=-1, keepdims=True) 69 | scale = abs_max / clip_max 70 | q_weight = torch.round(weight / scale) 71 | q_weight = (torch.clamp(q_weight.flatten(), clip_min, clip_max) + offset).to(torch.uint8) 72 | alpha = scale.flatten() 73 | 74 | else: 75 | clip_min = -offset 76 | max_val, _ = torch.max(weight, axis=-1, keepdims=True) 77 | min_val, _ = torch.min(weight, axis=-1, keepdims=True) 78 | scale = (max_val - min_val) / (clip_max - clip_min) 79 | 80 | if awq: 81 | q_weight = torch.round(weight / scale) - torch.round(min_val / scale) + clip_min 82 | zeros = (torch.round(min_val / scale) - clip_min) * scale 83 | else: 84 | q_weight = torch.round((weight - min_val) / scale) + clip_min 85 | zeros = min_val - scale * clip_min 86 | q_weight = (torch.clamp(q_weight.flatten(), clip_min, clip_max) + offset).to(torch.uint8) 87 | alpha = torch.stack([zeros.flatten(), scale.flatten()], axis=-1).flatten() 88 | 89 | if quant_bit < 8 and 8 % quant_bit == 0: 90 | group_size = 8 // quant_bit 91 | q_weight = q_weight.reshape(-1, 
group_size) 92 | multipliers = [2 ** (quant_bit * (group_size - 1 - i)) for i in range(group_size)] 93 | multipliers = torch.tensor(multipliers).to(q_weight.device) 94 | q_weight = (q_weight * multipliers).sum(axis=1).to(torch.uint8) 95 | elif quant_bit < 8: 96 | q_weight = repack_low_bits(q_weight.reshape((block_num * oc, block_size)), quant_bit, block_size) 97 | 98 | if q_weight.device is not torch.device('cpu'): 99 | return q_weight.cpu(), alpha.float().cpu() 100 | return q_weight, alpha.float() 101 | 102 | def onnx_export(model, inputs, onnx_model, input_names, output_names, dynamic_axes=None): 103 | torch.onnx.export( 104 | model, inputs, 105 | onnx_model, 106 | input_names=input_names, 107 | output_names=output_names, 108 | dynamic_axes=dynamic_axes, 109 | do_constant_folding=True, 110 | verbose=False, 111 | dynamo=False, 112 | opset_version=15) -------------------------------------------------------------------------------- /llmexport/utils/lora.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from safetensors import safe_open 4 | 5 | class LoRA: 6 | def __init__(self, lora_path, scale = 4.0): 7 | self.lora_A = {} 8 | self.lora_B = {} 9 | self.lora_keys = set() 10 | self.scale = scale 11 | self.load(lora_path) 12 | 13 | def __str__(self): 14 | return str(self.lora_keys) 15 | 16 | def has_lora(self, op_name): 17 | if op_name[0] != '/': 18 | return False 19 | for key in self.lora_keys: 20 | if key in op_name: 21 | return True 22 | return False 23 | 24 | def get_lora(self, tag): 25 | lora_a, lora_b = self.lora_A[tag], self.lora_B[tag] 26 | return lora_a, lora_b 27 | 28 | def load(self, path): 29 | if os.path.isdir(path): 30 | base_dir = path 31 | config = json.load(open(os.path.join(base_dir, 'adapter_config.json'), 'rt')) 32 | lora_alpha = config['lora_alpha'] 33 | r = config['r'] 34 | self.scale = float(lora_alpha) / r 35 | path = os.path.join(base_dir, 'adapter_model.safetensors') 36 | with safe_open(path, framework="pt") as f: 37 | for k in f.keys(): 38 | names = k.split('.') 39 | layer, key, name = names[4], names[6], names[7] 40 | tag = layer + key 41 | tensor = f.get_tensor(k).float() 42 | self.lora_keys.add(key) 43 | if 'lora_A' == name: 44 | self.lora_A[tag] = tensor 45 | else: 46 | self.lora_B[tag] = tensor * self.scale 47 | 48 | def build_conv(self, input_index, output_name, dims, weight): 49 | output_index = len(self.base_model['tensorName']) 50 | oc, ic = dims 51 | bias = [0.0 for i in range(oc)] 52 | op = { 53 | 'type': 'Convolution', 54 | 'name': output_name, 55 | 'inputIndexes': [input_index], 56 | 'outputIndexes': [ output_index ], 57 | 'main_type': 'Convolution2D', 58 | 'main': { 59 | 'common': { 60 | 'dilateX': 1, 'dilateY': 1, 'strideX': 1, 'strideY': 1, 61 | 'kernelX': 1, 'kernelY': 1, 'padX': 0, 'padY': 0, 'group': 1, 62 | 'outputCount': oc, 'relu': False, 'padMode': 'CAFFE', 63 | 'relu6': False, 'inputCount': ic, 'hasOutputShape': False 64 | }, 65 | "weight": weight, 66 | "bias": bias 67 | }, 68 | 'defaultDimentionFormat': 'NHWC' 69 | } 70 | self.new_ops.append(op) 71 | self.base_model['tensorName'].append(output_name) 72 | return output_index 73 | 74 | def build_binary(self, op_type, input_indexes, output_name): 75 | # 0: Add, 2: Mul 76 | output_index = len(self.base_model['tensorName']) 77 | op = { 78 | "type": "BinaryOp", 79 | "name": output_name, 80 | "inputIndexes": input_indexes, 81 | "outputIndexes": [ output_index ], 82 | "main_type": "BinaryOp", 83 | "main": { "opType": op_type, "T": 
"DT_FLOAT", "activationType": 0 }, 84 | "defaultDimentionFormat": "NHWC" 85 | } 86 | self.new_ops.append(op) 87 | self.base_model['tensorName'].append(output_name) 88 | return output_index 89 | 90 | def replace_input(self, origin_idx, new_idx): 91 | for op in self.base_model['oplists']: 92 | if op['type'] == 'ConvertTensor' and origin_idx in op['inputIndexes']: 93 | op['inputIndexes'] = [new_idx] 94 | 95 | def apply_lora(self, op): 96 | names = op['name'].split('/') 97 | tag = names[1].split('.')[1] + names[3] 98 | lora_a, lora_b = self.get_lora(tag) 99 | input_index = op['inputIndexes'][0] 100 | outpt_index = op['outputIndexes'][0] 101 | # lora_B @ lora_A @ x -> lora_B @ (lora_A @ x) 102 | a_out = self.build_conv(input_index, f'{tag}_A', list(lora_a.shape), lora_a.flatten().tolist()) 103 | b_out = self.build_conv(a_out, f'{tag}_B', list(lora_b.shape), lora_b.flatten().tolist()) 104 | n_out = self.build_binary(0, [outpt_index, b_out], f'{tag}_add') 105 | self.replace_input(outpt_index, n_out) 106 | 107 | def apply(self, base_path, out): 108 | self.base_model = json.load(open(base_path, 'rt')) 109 | self.new_ops = [] 110 | for i in range(len(self.base_model['oplists'])): 111 | op = self.base_model['oplists'][i] 112 | self.new_ops.append(op) 113 | if op['type'] == 'Convolution': 114 | if self.has_lora(op['name']): 115 | self.apply_lora(op) 116 | self.base_model['oplists'] = self.new_ops 117 | with open(out, 'w', encoding='utf-8') as file: 118 | json.dump(self.base_model, file, ensure_ascii=False, indent=4) 119 | return out 120 | -------------------------------------------------------------------------------- /llmexport/utils/mnn_utils.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import json 3 | def write_quant_header(file, ic, oc, quant_bit): 4 | dim_num = file.write(b'\x02') 5 | shape_dtype = numpy.int16 6 | if oc > 65535 or ic > 65535: 7 | shape_dtype = numpy.int32 8 | dim_length = file.write(numpy.array([oc, ic]).astype(shape_dtype)) 9 | offset = 1 << (quant_bit - 1) 10 | weight_map = [i for i in range(-offset, offset)] 11 | if len(weight_map) == 256: 12 | weight_map.insert(0, 0) 13 | else: 14 | weight_map.insert(0, len(weight_map)) 15 | map_length = file.write(numpy.array(weight_map, dtype=numpy.int8)) 16 | header_length = dim_num + dim_length + map_length 17 | return header_length, shape_dtype == numpy.int32 18 | 19 | 20 | def repack_low_bits(x, iNeedBits, block_size): 21 | v = [] 22 | block_number = x.shape[0] 23 | count = block_size * iNeedBits // 8 24 | for i in range(0, count): 25 | v.append(numpy.zeros([block_number, 1]).astype(numpy.uint8)) 26 | iOffset = 0 27 | cMask = (1 << iNeedBits) - 1 28 | index = 0 29 | for i in range(0, block_size): 30 | p0 = x[:, i:i+1] 31 | uShift = 8 - iNeedBits - (iOffset % 8) 32 | if uShift < 0: 33 | v[index+iOffset // 8] |= ((p0 & cMask) >> (0 - uShift)) 34 | v[index+(iOffset // 8) + 1] |= ((p0 & cMask) << (8 + uShift)) 35 | else: 36 | v[index+iOffset // 8] |= ((p0 & cMask) << uShift) 37 | iOffset += iNeedBits 38 | if iOffset % 8 == 0: 39 | index += iOffset // 8 40 | iOffset = 0 41 | return numpy.concatenate(v, axis=1) 42 | 43 | class Block: 44 | def __init__(self): 45 | self.conv = [] 46 | self.layernorm = [] 47 | 48 | def load_mnn(filename): 49 | mnn = {} 50 | with open(filename) as f: 51 | mnn = json.load(f) 52 | conv_indexes = [] 53 | layernorm_indexes = [] 54 | blockops = [] 55 | for op in mnn["oplists"]: 56 | if op['type'] == 'LayerNorm': 57 | if 'external' in op['main']: 58 | del 
op['main']['external'] 59 | if 'gamma' in op['main']: 60 | del op['main']['gamma'] 61 | if 'beta' in op['main']: 62 | del op['main']['beta'] 63 | layernorm_indexes.append(len(blockops)) 64 | blockops.append(op) 65 | continue 66 | if op['type'] == 'Convolution': 67 | conv_indexes.append(len(blockops)) 68 | blockops.append(op) 69 | block = None 70 | blockes = [] 71 | conv_order = ['attn_q', 'attn_k', 'attn_v', 'attn_output', 'ffn_gate', 'ffn_up', 'ffn_down'] 72 | blockNumber = len(conv_indexes) // len(conv_order) 73 | print("Layers number: ", blockNumber, ", conv number: ", len(conv_indexes), ", layernorm number:", len(layernorm_indexes)) 74 | block_layernorms = len(layernorm_indexes) // blockNumber 75 | assert(len(layernorm_indexes) == block_layernorms * blockNumber + 1) 76 | for i in range(0, blockNumber): 77 | block = Block() 78 | sta_conv = len(conv_order) * i 79 | for j in range(0, len(conv_order)): 80 | index = conv_indexes[sta_conv + j] 81 | block.conv.append(blockops[index]) 82 | sta_layernorm = block_layernorms * i 83 | for j in range(0, block_layernorms): 84 | index = layernorm_indexes[sta_layernorm + j] 85 | block.layernorm.append(blockops[index]) 86 | blockes.append(block) 87 | # Last layernorm and lm 88 | output_norm = blockops[layernorm_indexes[len(layernorm_indexes)-1]] 89 | lm = blockops[conv_indexes[len(conv_indexes)-1]] 90 | lm['name'] = 'output' 91 | opmap = {} 92 | opmap['output_norm'] = output_norm 93 | convs = [] 94 | for i in range(0, len(blockes)): 95 | _block = blockes[i] 96 | if len(_block.layernorm) == 2: 97 | opmap['blk.%d' %i + '.attn_norm']= _block.layernorm[0] 98 | opmap['blk.%d' %i + '.ffn_norm']= _block.layernorm[1] 99 | elif len(_block.layernorm) == 6: 100 | names = ['attn_norm', 'attn_q_norm', 'attn_k_norm', 'post_attention_norm', 'ffn_norm', 'post_ffw_norm'] 101 | for j in range(0, len(_block.layernorm)): 102 | opmap['blk.%d' %i + '.%s' %names[j]]= _block.layernorm[j] 103 | else: 104 | assert(False) 105 | for j in range(0, 7): 106 | newname = 'blk.%d' %i + '.' 
+ conv_order[j] 107 | _block.conv[j]['name'] = newname 108 | convs.append(_block.conv[j]) 109 | convs.append(lm) 110 | 111 | return mnn, opmap, convs, blockes, block 112 | 113 | 114 | def write_quant_parameters(quant_bit, asymc, mnn_weight_file, ic, oc, weight_main, scalebias, mnn_weight_offset, need_scale_treat = True): 115 | conv = {} 116 | aMin = 0 117 | readType = 0 118 | if asymc: 119 | # Avoid aMin post treat for bias 120 | offset = -(1 << (quant_bit - 1)) 121 | aMin = 1 122 | if need_scale_treat: 123 | scalebias = scalebias.reshape([-1, 2]) 124 | bias = scalebias[:, 0:1] 125 | scale = scalebias[:, 1:2] 126 | bias = bias - offset * scale 127 | scalebias = numpy.concatenate([bias, scale], axis=1).astype(numpy.float32) 128 | readType = 1 129 | header_len, shape_int32 = write_quant_header(mnn_weight_file, ic, oc, quant_bit) 130 | weight_len = mnn_weight_file.write(weight_main.tobytes()) + header_len 131 | alpha_len = mnn_weight_file.write(scalebias.tobytes()) 132 | conv['quanParameter'] = { 133 | "quantScale": 1.0, "scaleIn": 0.0, "scaleOut": 0.0, 134 | "useInt32": False, "has_scaleInt": False, "shapeInt32": shape_int32, 135 | "type": 1, "aMaxOrBits": quant_bit, "aMin": aMin, "readType": readType, "weightSize": 0 136 | } 137 | conv['external'] = [mnn_weight_offset, weight_len, alpha_len, oc * 4, 0] 138 | mnn_weight_offset += (weight_len + alpha_len) 139 | return conv, header_len, mnn_weight_offset -------------------------------------------------------------------------------- /llmexport/utils/gptq.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import torch 4 | from safetensors import safe_open 5 | 6 | class GPTQWeight: 7 | def __init__(self, name): 8 | self.name = name 9 | 10 | def __repr__(self) -> str: 11 | if hasattr(self, 'qweight'): 12 | return f'{self.name}, {self.qweight.shape}, {self.scales.shape}' 13 | return 'None' 14 | 15 | def add(self, name, tensor): 16 | setattr(self, name, tensor) 17 | 18 | def weight(self, idx): 19 | shape = self.qweight.shape 20 | if len(shape) == 2: 21 | ic, oc = shape 22 | self.qweight = self.qweight.reshape(ic//16, 16, oc) 23 | return self.qweight[idx] 24 | 25 | def scale(self, idx): 26 | return self.scales[idx] 27 | 28 | class MNNWeight: 29 | def __init__(self, name, external, weight_elements): 30 | self.name = name 31 | self.external = external 32 | self.quant_bits = 4 33 | if round(weight_elements / external[1]) == 2: 34 | self.quant_bits = 4 35 | self.a_min = -8 36 | else: 37 | self.quant_bits = 8 38 | self.a_min = -128 39 | self.parse_name() 40 | 41 | def __repr__(self) -> str: 42 | return f'{self.layer_id}.{self.op_id}.{self.block_id}, {self.external}' 43 | 44 | def parse_name(self): 45 | parts = self.name.split('/') 46 | if len(parts) > 4: 47 | self.layer_id = parts[1].split('.')[1] 48 | self.op_id = parts[2] + '.' 
+ parts[3] 49 | self.block_id = parts[-1].split('__')[-1] 50 | else: 51 | self.layer_id = -1 52 | self.op_id = parts[2] 53 | self.block_id = parts[-1].split('__')[-1] 54 | 55 | def key(self): 56 | if self.layer_id == -1: return self.op_id 57 | return f'{self.layer_id}.{self.op_id}' 58 | def offset(self): return self.external[0] 59 | def weight_size(self): return self.external[1] 60 | def scale_size(self): return self.external[2] 61 | 62 | class GPTQ: 63 | def __init__(self, gptq_path): 64 | self.load(gptq_path) 65 | 66 | def load(self, path): 67 | for tensor in glob.glob(f'{path}/*.safetensors'): 68 | self.load_safetensor(tensor) 69 | 70 | def prefix(self, name): 71 | splits = name.split('.') 72 | if 'lm_head' in splits[0] and len(splits) == 2: 73 | return splits[0], splits[1] 74 | if len(splits) < 5: 75 | return None, None 76 | pre = f'{splits[2]}.{splits[3]}.{splits[4]}' 77 | suf = splits[-1] 78 | return pre, suf 79 | 80 | def get(self, key : str): 81 | if key in self.weight_dict: 82 | return self.weight_dict[key] 83 | return None 84 | 85 | def load_safetensor(self, tensor): 86 | if not hasattr(self, 'weight_dict'): self.weight_dict = dict() # do not reset the dict, so weights from earlier safetensors shards are kept 87 | with safe_open(tensor, framework="pt") as f: 88 | for k in f.keys(): 89 | p, s = self.prefix(k) 90 | if p is None: continue 91 | if s not in ['qweight', 'scales']: continue 92 | if p not in self.weight_dict: 93 | self.weight_dict[p] = GPTQWeight(p) 94 | self.weight_dict[p].add(s, f.get_tensor(k)) 95 | 96 | @staticmethod 97 | def weight_reorder(qweight, bits=4, group_size=128): 98 | oc = qweight.shape[-1] 99 | wf = torch.tensor(list(range(0, 32, bits)), dtype=torch.int32).unsqueeze(0) 100 | weight = torch.bitwise_right_shift(torch.unsqueeze(qweight, 1).expand(-1, 32 // bits, -1), wf.unsqueeze(-1)).to(torch.int16 if bits == 8 else torch.int8) 101 | torch.bitwise_and(weight, (2 ** bits) - 1, out=weight) 102 | weight = weight.reshape(-1, oc).transpose(1, 0) 103 | if bits == 8: 104 | weight = weight.to(torch.uint8) 105 | return weight 106 | weight = weight.reshape(-1, 2).to(torch.uint8) 107 | weight = weight[:, 0] * 16 + weight[:, 1] 108 | return weight 109 | 110 | def apply(self, graph_path, weight_path): 111 | # parse mnn graph 112 | mnn_weights = [] 113 | mnn_graph = json.load(open(graph_path, 'rt')) 114 | for op in mnn_graph['oplists']: 115 | if op['type'] == 'Convolution': 116 | name = op['name'] 117 | external = op['main']['external'] 118 | weight_elements = op['main']['common']['outputCount'] * op['main']['common']['inputCount'] 119 | mnn_weights.append(MNNWeight(name, external, weight_elements)) 120 | # load mnn weight 121 | external_weight = open(weight_path, 'r+b') 122 | for mnn_weight in mnn_weights: 123 | gptq_weight = self.get(mnn_weight.key()) 124 | if gptq_weight is None: continue 125 | # print(f'write {mnn_weight.key()} ...
', end='') 126 | weight = gptq_weight.qweight 127 | scale = gptq_weight.scales.float().transpose(1, 0) 128 | # write weight data 129 | weight = GPTQ.weight_reorder(weight, mnn_weight.quant_bits) 130 | weight_bytes = weight.numpy().tobytes() 131 | weight_size = mnn_weight.weight_size() 132 | header_len = weight_size - len(weight_bytes) 133 | assert(header_len > 0) 134 | external_weight.seek(mnn_weight.offset() + header_len) 135 | external_weight.write(weight_bytes) 136 | scale_size = mnn_weight.scale_size() 137 | is_asy = scale.numel() * scale.element_size() < scale_size 138 | # write scale data 139 | if is_asy: 140 | # zeros = mnn_weight.a_min * scale 141 | zeros = torch.zeros_like(scale) 142 | scale = torch.stack([zeros, scale], axis=-1) 143 | scale_bytes = scale.numpy().tobytes() 144 | assert(scale_size == len(scale_bytes)) 145 | external_weight.write(scale_bytes) 146 | # print('Done!') 147 | external_weight.close() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LLM-Export 2 | 3 | [![PyPI version](https://badge.fury.io/py/llmexport.svg)](https://badge.fury.io/py/llmexport) 4 | [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/release/python-380/) 5 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 6 | 7 | [English](./README_en.md) | 中文 8 | 9 | 一个高效的大语言模型导出工具,能够将 LLM 模型导出为 ONNX 和 MNN 格式,支持量化优化和多模态模型。 10 | 11 | ## ✨ 主要特性 12 | 13 | - 🚀 **动态形状支持**:优化原始代码,支持动态输入形状 14 | - 🚀 **模型优化**:减少常量部分,提升推理性能 15 | - 🚀 **自动优化**:集成 [OnnxSlim](https://github.com/inisis/OnnxSlim) 优化 ONNX 模型,性能提升约 5% (感谢 [@inisis](https://github.com/inisis)) 16 | - 🚀 **LoRA 支持**:支持 LoRA 权重的合并/分离导出 17 | - 🚀 **量化技术**:支持 AWQ、GPTQ、HQQ等多种量化方法 18 | - 🚀 **EAGLE 支持**:支持 EAGLE 推理加速技术 19 | - 🚀 **多模态支持**:支持文本、图像、音频等多模态模型 20 | - 🚀 **推理框架**:提供 [MNN](https://github.com/wangzhaode/mnn-llm) 和 [ONNX](https://github.com/wangzhaode/onnx-llm) 推理代码 21 | 22 | ## 📜 快速开始 23 | 24 | ### 安装 25 | 26 | ```bash 27 | # 从 PyPI 安装(推荐) 28 | pip install llmexport 29 | 30 | # 从 GitHub 安装最新版本 31 | pip install git+https://github.com/wangzhaode/llm-export@master 32 | 33 | # 本地开发安装 34 | git clone https://github.com/wangzhaode/llm-export 35 | cd llm-export 36 | pip install -e . 37 | ``` 38 | 39 | ### 基本用法 40 | 41 | #### 1. 下载模型 42 | 43 | ```bash 44 | # 使用 Hugging Face CLI 45 | huggingface-cli download Qwen/Qwen2.5-1.5B-Instruct --local-dir Qwen2.5-1.5B-Instruct 46 | 47 | # 或使用 ModelScope(国内用户推荐) 48 | modelscope download Qwen/Qwen2.5-1.5B-Instruct --local_dir Qwen2.5-1.5B-Instruct 49 | ``` 50 | 51 | #### 2. 模型测试 52 | 53 | ```bash 54 | # 文本对话测试 55 | llmexport --path Qwen2.5-1.5B-Instruct --test "你好,请介绍一下你自己" 56 | 57 | # 多模态测试(图像+文本) 58 | llmexport --path Qwen2-VL-2B-Instruct --test "image_url描述一下这张图片" 59 | ``` 60 | 61 | #### 3. 
模型导出 62 | 63 | ```bash 64 | # 导出为 ONNX 格式 65 | llmexport --path Qwen2.5-1.5B-Instruct --export onnx 66 | 67 | # 导出为 MNN 格式(默认 4bit 量化) 68 | llmexport --path Qwen2.5-1.5B-Instruct --export mnn 69 | 70 | # 自定义量化参数 71 | llmexport --path Qwen2.5-1.5B-Instruct --export mnn --quant_bit 8 --quant_block 128 72 | 73 | # 导出 EAGLE 模型 74 | llmexport --path Qwen2.5-1.5B-Instruct --export mnn --eagle_path path/to/eagle 75 | ``` 76 | 77 | ## 🔧 高级功能 78 | 79 | ### 模型导出选项 80 | 81 | - **ONNX 导出**:使用 `--export onnx` 导出为 ONNX 格式 82 | - **MNN 导出**:使用 `--export mnn` 导出为 MNN 格式 83 | - **模型优化**:默认启用 OnnxSlim 优化,使用 `--onnx_slim` 显式启用 84 | - **EAGLE 导出**:使用 `--eagle_path` 导出 EAGLE 加速模型 85 | 86 | ### 量化配置 87 | 88 | - **量化位数**:`--quant_bit 4/8` (默认 4bit) 89 | - **量化块大小**:`--quant_block 64/128` (默认 64) 90 | - **LM Head 量化**:`--lm_quant_bit` 单独设置输出层量化 91 | - **对称量化**:`--sym` 启用对称量化(无零点) 92 | 93 | ### 量化算法支持 94 | 95 | - **AWQ 量化**:`--awq` 启用 AWQ 量化 96 | - **HQQ 量化**:`--hqq` 启用 HQQ 量化 97 | - **GPTQ 量化**:`--gptq_path` 加载 GPTQ 量化模型 98 | - **Smooth 量化**:`--smooth` 启用 Smooth 量化 99 | 100 | ### LoRA 支持 101 | 102 | - **LoRA 合并**:`--lora_path` 指定 LoRA 权重路径 103 | - **LoRA 分离**:`--lora_split` 分离导出 LoRA 权重 104 | 105 | ### 多模态支持 106 | 107 | - **视觉量化**:`--visual_quant_bit`、`--visual_quant_block` 设置视觉模块量化 108 | - **视觉对称**:`--visual_sym` 视觉模块对称量化 109 | 110 | ### 其他选项 111 | 112 | - **详细输出**:`--verbose` 显示详细日志 113 | - **性能评估**:`--ppl` 获取所有 token 的 logits 114 | - **自定义输出**:`--dst_path` 指定输出目录(默认 `./model`) 115 | - **EAGLE 支持**:`--eagle_path` 指定 EAGLE 模型路径 116 | 117 | ## 📎 命令行参数 118 | 119 | ### 基本参数 120 | 121 | | 参数 | 类型 | 说明 | 122 | |------|------|------| 123 | | `--path` | 必需 | 模型路径,支持本地目录或 Hugging Face 模型 ID | 124 | | `--export` | 可选 | 导出格式:`onnx` 或 `mnn` | 125 | | `--test` | 可选 | 测试查询字符串 | 126 | | `--dst_path` | 可选 | 输出目录(默认 `./model`) | 127 | | `--verbose` | 开关 | 显示详细日志 | 128 | 129 | ### 量化参数 130 | 131 | | 参数 | 默认值 | 说明 | 132 | |------|--------|------| 133 | | `--quant_bit` | 4 | 量化位数(4 或 8) | 134 | | `--quant_block` | 64 | 量化块大小(0 表示通道级量化) | 135 | | `--lm_quant_bit` | 同 `quant_bit` | LM Head 层量化位数 | 136 | | `--visual_quant_bit` | 模型相关 | 视觉模块量化位数 | 137 | | `--visual_quant_block` | 模型相关 | 视觉模块量化块大小 | 138 | 139 | ### 量化算法 140 | 141 | | 参数 | 说明 | 142 | |------|------| 143 | | `--awq` | 启用 AWQ 量化 | 144 | | `--hqq` | 启用 HQQ 量化 | 145 | | `--smooth` | 启用 Smooth 量化 | 146 | | `--sym` | 启用对称量化(无零点) | 147 | | `--visual_sym` | 视觉模块对称量化 | 148 | 149 | ### LoRA 支持 150 | 151 | | 参数 | 说明 | 152 | |------|------| 153 | | `--lora_path` | LoRA 权重路径 | 154 | | `--lora_split` | 分离导出 LoRA 权重 | 155 | 156 | ### EAGLE 支持 157 | 158 | | 参数 | 说明 | 159 | |------|------| 160 | | `--eagle_path` | EAGLE 模型路径 | 161 | 162 | ### 其他选项 163 | 164 | | 参数 | 说明 | 165 | |------|------| 166 | | `--tokenizer_path` | 分词器路径(默认使用 `--path`) | 167 | | `--gptq_path` | GPTQ 量化模型路径 | 168 | | `--mnnconvert` | 本地 MNNConvert 路径 | 169 | | `--onnx_slim` | 启用 ONNX-Slim 优化 | 170 | | `--ppl` | 获取所有 token 的 logits | 171 | | `--seperate_embed` | 分离嵌入层以避免量化 | 172 | | `--calib_data` | 校准数据路径 | 173 | 174 | ## 📋 支持模型 175 | 176 | 目前支持以下模型类型: 177 | 178 | ### 文本模型 179 | - **Qwen 系列**:Qwen3、Qwen2.5、Qwen2、Qwen1.5、Qwen-VL 等 180 | - **LLaMA 系列**:Llama-3.2、Llama-3、Llama-2 等 181 | - **ChatGLM 系列**:ChatGLM4、ChatGLM3、ChatGLM2 等 182 | - **Baichuan 系列**:Baichuan2-7B-Chat 等 183 | - **Yi 系列**:Yi-6B-Chat 等 184 | - **其他**:InternLM、DeepSeek、Phi、Gemma、TinyLlama、SmolLM 等 185 | 186 | ### 多模态模型 187 | - **视觉模型**:Qwen2-VL、Qwen2.5-VL、Qwen3-VL、Llama-3.2-Vision、InternVL 等 188 | - **音频模型**:Qwen2-Audio、Qwen2.5-Omni 等 189 | 190 | 
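For the vision-language models above, the visual-module quantization flags documented earlier can be combined with a regular MNN export. An illustrative command (the bit/block values are example settings, not defaults):

```bash
# Example: export a vision-language model with separate visual-module quantization settings
llmexport --path Qwen2-VL-2B-Instruct --export mnn --visual_quant_bit 8 --visual_quant_block 128 --visual_sym
```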
### 嵌入模型 191 | - **文本嵌入**:bge-large-zh、gte-multilingual 等 192 | 193 | ## 💾 模型下载 194 | 195 | 我们提供了已经优化的模型下载: 196 | 197 | - **Hugging Face**:[taobao-mnn](https://huggingface.co/taobao-mnn) 198 | - **ModelScope**:[MNN](https://modelscope.cn/organization/MNN) 199 | 200 | 部分热门模型: 201 | 202 | | 模型 | Hugging Face | ModelScope | 203 | |------|-------------|------------| 204 | | DeepSeek-R1-1.5B-Qwen | [Q4_1](https://huggingface.co/taobao-mnn/DeepSeek-R1-1.5B-Qwen-MNN) | [Q4_1](https://modelscope.cn/models/MNN/DeepSeek-R1-1.5B-Qwen-MNN) | 205 | | Qwen2.5-0.5B-Instruct | [Q4_1](https://huggingface.co/taobao-mnn/Qwen2.5-0.5B-Instruct-MNN) | [Q4_1](https://modelscope.cn/models/MNN/Qwen2.5-0.5B-Instruct-MNN) | 206 | | Qwen2.5-1.5B-Instruct | [Q4_1](https://huggingface.co/taobao-mnn/Qwen2.5-1.5B-Instruct-MNN) | [Q4_1](https://modelscope.cn/models/MNN/Qwen2.5-1.5B-Instruct-MNN) | 207 | | GPT-OSS-20B | [Q4_1](https://huggingface.co/taobao-mnn/gpt-oss-20b-MNN) | [Q4_1](https://modelscope.cn/models/MNN/gpt-oss-20b-MNN) | 208 | | Qwen3-4B-Instruct-2507 | [Q4_1](https://huggingface.co/taobao-mnn/Qwen3-4B-Instruct-2507-MNN) | [Q4_1](https://modelscope.cn/models/MNN/Qwen3-4B-Instruct-2507-MNN) | 209 | 210 | 更多模型请查看完整列表。 211 | 212 | ## 🔗 相关项目 213 | 214 | - **MNN 推理**:[mnn-llm](https://github.com/wangzhaode/mnn-llm) - MNN 框架的 LLM 推理库 215 | - **ONNX 推理**:[onnx-llm](https://github.com/wangzhaode/onnx-llm)、[OnnxLLM](https://github.com/inisis/OnnxLLM) - ONNX 格式推理库 216 | - **模型优化**:[OnnxSlim](https://github.com/inisis/OnnxSlim) - ONNX 模型优化工具 217 | 218 | ## 📄 许可证 219 | 220 | 本项目采用 [Apache 2.0 许可证](./LICENSE)。 -------------------------------------------------------------------------------- /llmexport/utils/hqq_quantizer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | class HQQQuantizer: 6 | def __init__(self, 7 | weight, 8 | bit, 9 | group_size, 10 | sym=False, 11 | compute_dtype: torch.dtype = torch.float32, 12 | device: torch.device = torch.device("cpu"), 13 | quant_config: dict = None): 14 | self.weight = weight 15 | self.bit = bit 16 | self.group_size = group_size 17 | self.sym = sym 18 | self.compute_dtype = compute_dtype 19 | self.device = device 20 | 21 | def quant(self): 22 | self._quantize() 23 | 24 | @torch.inference_mode() 25 | def _quantize( 26 | self, 27 | channel_wise: bool = True, 28 | axis: int = 1, 29 | ) -> tuple: 30 | 31 | if self.group_size is not None: 32 | assert self.weight.numel() % self.group_size == 0, ( 33 | "the total number of weight elements should be divisible by group_size.
shape: " 34 | + str(self.weight.shape) 35 | + ", group_size: " 36 | + str(self.group_size) 37 | ) 38 | 39 | W = self.weight.to(self.compute_dtype).float() 40 | shape = W.shape 41 | 42 | # Reshape for grouping 43 | if (self.group_size is not None) and channel_wise: 44 | W = ( 45 | W.reshape([-1, self.group_size]) 46 | if (axis == 1) 47 | else W.reshape([self.group_size, -1]) 48 | ) 49 | 50 | # Get min/max values 51 | if not channel_wise: 52 | _min, _max = W.min(), W.max() 53 | else: 54 | _min = W.min(axis=axis, keepdim=True)[0] 55 | _max = W.max(axis=axis, keepdim=True)[0] 56 | 57 | if self.sym: 58 | max_v = 2**(self.bit-1) - 1 # 4bit: 7 59 | min_v = -2**(self.bit-1) # 4bit: -8 60 | min_max = [min_v, max_v] # [-8, 7] 61 | 62 | max_abs = torch.max(torch.abs(_min), torch.abs(_max)) 63 | scale = max_v / max_abs 64 | scale = torch.where(max_abs <= 1e-4, torch.full_like(scale, 1.0), scale) 65 | scale = scale.clamp(max=2e4) 66 | zero = None 67 | else: 68 | max_v = round(2**self.bit - 1) # 4bit: 15 69 | min_v = 0 # 4bit: 0 70 | min_max = [min_v, max_v] # [0, 15] 71 | 72 | denom = (_max - _min) 73 | scale = (max_v / denom) 74 | scale = torch.where(denom.abs() <= 1e-4, torch.full_like(scale, 1.0), scale) 75 | scale = scale.clamp(max=2e4) 76 | zero = -_min * scale 77 | zero = torch.round(zero) 78 | 79 | W_q, scale, zero = self._optimize_weights( 80 | W, 81 | scale, 82 | zero, 83 | min_max=min_max, 84 | axis=axis, 85 | ) 86 | #W_q = (W * scale).round_().clamp_(min_max[0], min_max[1]) 87 | # cleanup 88 | del W, _min, _max 89 | 90 | # Store meta-data (we invert the scale for dequantization) 91 | scale = 1.0 / scale 92 | meta = { 93 | "nbits": self.bit, 94 | "group_size": self.group_size, 95 | "shape": shape, 96 | "scale": scale, 97 | "zero": zero, 98 | "axis": axis, 99 | } 100 | 101 | W_q = W_q.to(self.weight.dtype) 102 | 103 | if self.device == torch.device('cuda'): 104 | torch.cuda.empty_cache() 105 | elif self.device == torch.device('mps'): 106 | torch.mps.empty_cache() 107 | 108 | self.W_q = W_q 109 | self.meta = meta 110 | 111 | @torch.inference_mode() 112 | def _optimize_weights( 113 | self, 114 | W: torch.Tensor, 115 | scale: torch.Tensor, 116 | zero: torch.Tensor, 117 | min_max: list, 118 | axis: int = 0, 119 | opt_params: dict = {"lp_norm": 0.7, "beta": 1e1, "kappa": 1.01, "iters": 20}, 120 | verbose: bool = False, 121 | ) -> tuple: 122 | lp_norm, beta, kappa, iters = ( 123 | opt_params["lp_norm"], 124 | opt_params["beta"], 125 | opt_params["kappa"], 126 | opt_params["iters"], 127 | ) 128 | 129 | dtype = torch.float32 130 | W_f = W.to(dtype=dtype, device=self.device) 131 | scale = scale.to(dtype=dtype, device=self.device) 132 | if not self.sym: 133 | zero = zero.to(dtype=dtype, device=self.device) 134 | 135 | best_error = torch.tensor(torch.inf, dtype=torch.float32, device=self.device) 136 | W_q = torch.empty_like(W_f) 137 | W_r = torch.empty_like(W_f) 138 | W_e = torch.empty_like(W_f) 139 | W_prime = torch.empty_like(W_f) if self.sym else None 140 | for i in range(iters): 141 | if not self.sym: 142 | self._optimize_weights_proximal_legacy_step(W_f, scale, zero, min_max, beta, lp_norm, axis, W_q, W_r, W_e) 143 | else: 144 | self._optimize_weights_proximal_scale_only(W_f, scale, min_max, beta, lp_norm, axis, W_q, W_r, W_e, W_prime) 145 | current_error = torch.abs(W_f - W_r).mean().float() 146 | if verbose: 147 | print(i, current_error.cpu()) 148 | 149 | if current_error < best_error: 150 | best_error = current_error 151 | else: 152 | break 153 | 154 | scale = scale.to(W.device) 155 | if not 
self.sym: 156 | zero = zero.to(W.device) 157 | del W_f, W_q, W_r 158 | if self.device.type == 'cuda': 159 | torch.cuda.empty_cache() 160 | elif self.device.type == 'mps': 161 | torch.mps.empty_cache() 162 | if not self.sym: 163 | W_q = torch.round(W * scale + zero).clamp_(min_max[0], min_max[1]) 164 | else: 165 | W_q = torch.round(W * scale).clamp_(min_max[0], min_max[1]) 166 | return W_q, scale, zero 167 | 168 | @torch.inference_mode() 169 | def _optimize_weights_proximal_legacy_step(self, W_f, scale, zero, min_max, beta, lp_norm, axis, W_q, W_r, W_e): 170 | torch.mul(W_f, scale, out=W_q) 171 | torch.add(W_q, zero, out=W_q) 172 | torch.round(W_q, out=W_q).clamp_(min_max[0], min_max[1]) 173 | torch.sub(W_q, zero, out=W_r) 174 | torch.div(W_r, scale, out=W_r) 175 | torch.sub(W_f, W_r, out=W_e) 176 | self._shrink_lp_op(W_e, beta, lp_norm, out=W_e) 177 | torch.sub(W_f, W_e, out=W_r) 178 | torch.mul(W_r, scale, out=W_r) 179 | torch.sub(W_q, W_r, out=W_r) 180 | torch.mean(W_r, axis=axis, keepdim=True, out=zero) 181 | 182 | @torch.inference_mode() 183 | def _optimize_weights_proximal_scale_only(self, W_f, scale, min_max, beta, lp_norm, axis, W_q, W_r, W_e, W_prime, eps=1e-8): 184 | torch.mul(W_f, scale, out=W_q) 185 | torch.round(W_q, out=W_q).clamp_(min_max[0], min_max[1]) 186 | torch.div(W_q, scale, out=W_r) 187 | torch.sub(W_f, W_r, out=W_e) 188 | self._shrink_lp_op(W_e, beta, lp_norm, out=W_e) 189 | torch.sub(W_f, W_e, out=W_prime) 190 | w_prime_dot_w_q = torch.sum(W_prime * W_q, axis=axis, keepdim=True) 191 | w_q_norm_sq = torch.sum(W_q**2, axis=axis, keepdim=True) 192 | torch.add(w_prime_dot_w_q, eps, out=w_prime_dot_w_q) 193 | torch.div(w_q_norm_sq, w_prime_dot_w_q, out=scale) 194 | 195 | # Shrinking operator 196 | @torch.inference_mode() 197 | def _shrink_lp_op(self, x: torch.Tensor, beta: float, lp_norm: float, out: torch.Tensor) -> torch.Tensor: 198 | if lp_norm == 1: 199 | #torch.sign(x) * torch.nn.functional.relu(torch.abs(x) - 1.0 / beta) 200 | torch.abs(x, out=out) 201 | out.sub_(1.0 / beta).clamp_min_(0.0) 202 | out.mul_(torch.sign(x)) 203 | return out 204 | else: 205 | #torch.sign(x) * torch.nn.functional.relu(torch.abs(x) - (1.0 / beta) * torch.pow(torch.abs(x), lp_norm - 1)) 206 | torch.abs(x, out=out) 207 | out.sub_((1.0 / beta) * out.pow(lp_norm - 1)).clamp_min_(0.0) 208 | out.mul_(torch.sign(x)) 209 | return out -------------------------------------------------------------------------------- /llmexport/utils/talker.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | torch.set_printoptions(precision=4, sci_mode=False) 4 | from .model_mapper import ModelMapper 5 | from .transformers import Rotary, Embedding, Decoder 6 | from .token2wav import Qwen2_5OmniToken2Wav 7 | from .spinner import spinner_run 8 | from .torch_utils import onnx_export 9 | 10 | class Talker(torch.nn.Module): 11 | def __init__(self, talker, token2wav, base): 12 | super().__init__() 13 | self.model_type = base.model_type 14 | self.thinker_embed = base.embed 15 | self.args = base.args 16 | self.talker = talker.float() 17 | self.token2wav = Qwen2_5OmniToken2Wav(token2wav, base) 18 | self.config = base.config 19 | self.hidden_size = base.hidden_size 20 | self.llm_config = base.llm_config 21 | self.rope_ratio = 1.0 22 | self.quant_bit = 4 23 | if self.hidden_size <= 2048: 24 | # Qwen2.5-Omni-3B using 8 bit quantization 25 | self.quant_bit = 8 26 | self.init_config() 27 | self.load() 28 | 29 | @staticmethod 30 | def get_talker(model_type): 31 | 
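        # Registry mapping a model_type string to its Talker implementation; only the
        # Qwen2.5-Omni talker is wired up for now, and any other model type falls
        # through to the `return None` below (meaning the model has no speech head).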
audio_models = { 32 | 'qwen2_5_omni': Qwen2_5OmniTalker, 33 | } 34 | if model_type in audio_models: 35 | return audio_models[model_type] 36 | return None 37 | 38 | def init_config(self): 39 | self.llm_config['has_talker'] = True 40 | 41 | def load(self): 42 | raise NotImplementedError 43 | 44 | def add_token_embeds(self, thinker_embeds): 45 | raise NotImplementedError 46 | 47 | def add_hidden_states(self, thinker_hidden_states): 48 | raise NotImplementedError 49 | 50 | def add_generate_ids(self, token_id): 51 | raise NotImplementedError 52 | 53 | def forward(self, inputs_embeds, attention_mask, position_ids, past_key_values = None): 54 | raise NotImplementedError 55 | 56 | def export(self, onnx_path): 57 | raise NotImplementedError 58 | 59 | def export_embed(self): 60 | import ctypes 61 | tensor_data = self.embed.weight.data.bfloat16() 62 | data_ptr = tensor_data.untyped_storage().data_ptr() 63 | buffer = (ctypes.c_byte * (tensor_data.numel() * 2)).from_address(data_ptr) 64 | embedding_file = f'{self.args.dst_path}/talker_embeddings_bf16.bin' 65 | with open(embedding_file, 'wb') as f: 66 | f.write(buffer) 67 | return embedding_file 68 | 69 | class OmniRotary(Rotary): 70 | def __init__(self, model): 71 | super().__init__(model) 72 | self.mrope_section = model.mrope_section 73 | self.theta_sections = self.theta.unsqueeze(0).split(self.mrope_section, dim=-1) 74 | 75 | def forward(self, position_ids): 76 | position_ids = position_ids.float().unsqueeze(-1) 77 | idx_theta = torch.concat([ 78 | position_ids[0] * self.theta_sections[0], 79 | position_ids[1] * self.theta_sections[1], 80 | position_ids[2] * self.theta_sections[2] 81 | ], dim=-1) 82 | rotary_pos_emb = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)]) 83 | rotary_pos_emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1) 84 | rotary_pos_emb = rotary_pos_emb.unsqueeze(3) 85 | return rotary_pos_emb 86 | 87 | class Qwen2_5OmniTalker(Talker): 88 | def __init__(self, talker, token2wav, base): 89 | super().__init__(talker, token2wav, base) 90 | self.input_hidden_size = base.hidden_size 91 | self.seq_len = 0 92 | self.token_len = 0 93 | self.talker_embeds = [] 94 | 95 | def load(self): 96 | # load talker model 97 | self.model_map = { 98 | 'config': { 99 | 'hidden_size': 'hidden_size', 100 | 'head_dim': 'head_dim', 101 | 'num_attention_heads': 'num_attention_heads', 102 | 'num_hidden_layers': 'num_hidden_layers', 103 | 'num_key_value_heads': 'num_key_value_heads', 104 | 'rope_theta': 'rope_theta', 105 | 'rope_scaling': 'rope_scaling' 106 | }, 107 | 'decoder': { 108 | 'self_attn': 'self_attn', 109 | 'mlp': 'mlp', 110 | 'input_layernorm': 'input_layernorm', 111 | 'post_attention_layernorm': 'post_attention_layernorm' 112 | }, 113 | 'attention': { 114 | 'q_proj': 'q_proj', 115 | 'k_proj': 'k_proj', 116 | 'v_proj': 'v_proj', 117 | 'o_proj': 'o_proj' 118 | } 119 | } 120 | ModelMapper.do_map(self, self.talker.config, self.model_map['config']) 121 | self.mrope_section = self.rope_scaling['mrope_section'] 122 | self.embed = self.talker.model.embed_tokens 123 | self.rotary = OmniRotary(self) 124 | # self.rotary = Rotary(self) 125 | self.blocks = [] 126 | for block in self.talker.model.layers: 127 | layer_id = len(self.blocks) 128 | self.blocks.append(Decoder(block, layer_id, self)) 129 | 130 | def forward(self, inputs_embeds, attention_mask, position_ids, past_key_values = None): 131 | hidden_states = self.talker.thinker_to_talker_proj(inputs_embeds) 132 | rotary_pos_emb = self.rotary(position_ids) 133 | presents = [None for i in 
range(self.num_hidden_layers)] 134 | 135 | for i in range(self.num_hidden_layers): 136 | hidden_states, kv = self.blocks[i](hidden_states, rotary_pos_emb, attention_mask, past_key_values[i]) 137 | presents[i] = kv 138 | 139 | hidden_states = hidden_states[:, -1, :] 140 | hidden_states = self.talker.model.norm(hidden_states) 141 | logits = self.talker.codec_head(hidden_states) 142 | presents = torch.stack(presents) 143 | return logits, presents 144 | 145 | def get_position_ids(self) -> torch.Tensor: 146 | if self.token_len: 147 | position_ids = torch.tensor([[self.seq_len - 1]], dtype=torch.int) 148 | else: 149 | position_ids = torch.arange(self.seq_len, dtype=torch.int).unsqueeze(0) 150 | position_ids = torch.stack([position_ids] * 3) 151 | return position_ids 152 | 153 | def get_attention_mask(self) -> torch.Tensor: 154 | if self.token_len: 155 | return torch.zeros([1, 1, 1, self.seq_len], dtype=torch.float32) 156 | return (1 - torch.tril(torch.ones([1, 1, self.seq_len, self.seq_len]))) * torch.finfo(torch.float32).min 157 | 158 | def generate(self): 159 | talker_text_bos_token = 151872 160 | talker_inputs_embeds = torch.cat( 161 | [ 162 | self.talker_embeds[0], 163 | self.thinker_embed(torch.tensor([[talker_text_bos_token]], dtype=torch.long)) + \ 164 | self.embed(torch.LongTensor([self.talker.codec_pad_token])), 165 | self.talker_embeds[1] + self.embed(torch.LongTensor([self.talker.codec_bos_token])), 166 | ], 167 | dim=1, 168 | ) 169 | thinker_reply_part = torch.cat(self.talker_embeds[2:], dim=1) 170 | thinker_reply_part = torch.cat( 171 | [ 172 | thinker_reply_part, 173 | self.thinker_embed( 174 | torch.tensor([[self.talker.text_eos_token]], dtype=torch.long) 175 | ), 176 | self.thinker_embed( 177 | torch.tensor([[self.talker.text_pad_token]], dtype=torch.long) 178 | ), 179 | ], 180 | dim=1, 181 | ) 182 | 183 | _, self.seq_len, _ = talker_inputs_embeds.shape 184 | _, reply_len, _ = thinker_reply_part.shape 185 | past_key_values = [None for i in range(self.num_hidden_layers)] 186 | 187 | inputs_embeds = talker_inputs_embeds.float() 188 | self.token_len = 0 189 | self.stop_ids = [8292, 8294] 190 | token_id = None 191 | tokens = [] 192 | while self.token_len < 256: 193 | attention_mask = self.get_attention_mask() 194 | position_ids = self.get_position_ids() 195 | if self.token_len > 0: 196 | inputs_embeds = self.embed(token_id) 197 | if self.token_len <= reply_len: 198 | inputs_embeds = inputs_embeds + thinker_reply_part[:, self.token_len - 1, :] 199 | else: 200 | inputs_embeds = inputs_embeds + thinker_reply_part[:, -1, :] 201 | logits, past_key_values = self.forward(inputs_embeds=inputs_embeds, 202 | attention_mask=attention_mask, 203 | position_ids=position_ids, 204 | past_key_values=past_key_values) 205 | token_id = torch.argmax(logits) 206 | self.token_len += 1 207 | self.seq_len += 1 208 | tokens.append(int(token_id)) 209 | if int(token_id) in self.stop_ids: 210 | break 211 | talker_generate_codes = torch.tensor(tokens, dtype=torch.long).unsqueeze(0) 212 | # 3. 
Generate wavs from code 213 | wav = self.token2wav.generate(talker_generate_codes,) 214 | import soundfile as sf 215 | sf.write( 216 | "output.wav", 217 | wav.reshape(-1).detach().cpu().numpy(), 218 | samplerate=24000, 219 | ) 220 | 221 | def add_talker_embeds(self, talker_embed): 222 | self.talker_embeds.append(talker_embed) 223 | 224 | @spinner_run(f'export talker to ') 225 | def export(self, onnx_path): 226 | self.export_embed() 227 | self.seq_len = 3 228 | self.token_len = 0 229 | inputs_embeds = torch.randn([1, self.seq_len, self.input_hidden_size]) 230 | posision_ids = self.get_position_ids() 231 | attention_mask = self.get_attention_mask() 232 | past_key_values = torch.zeros([self.num_hidden_layers, 2, 1, 0, self.num_key_value_heads, self.head_dim]) 233 | talker_onnx = f'{onnx_path}/talker.onnx' 234 | onnx_export(self, (inputs_embeds, attention_mask, posision_ids, past_key_values), 235 | talker_onnx, 236 | input_names=['inputs_embeds', 'attention_mask', 'position_ids', 'past_key_values'], 237 | output_names=['logits'], 238 | dynamic_axes={ 239 | "inputs_embeds": { 1: "size" }, 240 | "attention_mask": { 2: "size", 3: "size" }, 241 | "position_ids": { 2: "size" }, 242 | "past_key_values": { 3: "size" } 243 | }) 244 | return talker_onnx -------------------------------------------------------------------------------- /README_en.md: -------------------------------------------------------------------------------- 1 | # LLM-Export 2 | 3 | [![PyPI version](https://badge.fury.io/py/llmexport.svg)](https://badge.fury.io/py/llmexport) 4 | [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/release/python-380/) 5 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 6 | 7 | English | [中文](./README.md) 8 | 9 | An efficient Large Language Model export tool that converts LLM models to ONNX and MNN formats, supporting quantization optimization and multimodal models. 10 | 11 | ## ✨ Key Features 12 | 13 | - 🚀 **Dynamic Shape Support**: Optimized original code with dynamic input shape support 14 | - 🚀 **Model Optimization**: Reduced constant parts for improved inference performance 15 | - 🚀 **Automatic Optimization**: Integrated [OnnxSlim](https://github.com/inisis/OnnxSlim) for ONNX model optimization, ~5% performance improvement (Thanks [@inisis](https://github.com/inisis)) 16 | - 🚀 **LoRA Support**: Support for LoRA weight merging/splitting export 17 | - 🚀 **Quantization Methods**: Support for AWQ, GPTQ, HQQ, and other quantization methods 18 | - 🚀 **Multimodal Support**: Support for text, image, audio, and other multimodal models 19 | - 🚀 **Inference Frameworks**: Provides [MNN](https://github.com/wangzhaode/mnn-llm) and [ONNX](https://github.com/wangzhaode/onnx-llm) inference code 20 | 21 | ## 📖 Quick Start 22 | 23 | ### Installation 24 | 25 | ```bash 26 | # Install from PyPI (Recommended) 27 | pip install llmexport 28 | 29 | # Install latest version from GitHub 30 | pip install git+https://github.com/wangzhaode/llm-export@master 31 | 32 | # Local development installation 33 | git clone https://github.com/wangzhaode/llm-export 34 | cd llm-export 35 | pip install -e . 36 | ``` 37 | 38 | ### Basic Usage 39 | 40 | #### 1. 
Download Model 41 | 42 | ```bash 43 | # Using Hugging Face CLI 44 | huggingface-cli download Qwen/Qwen2.5-1.5B-Instruct --local-dir Qwen2.5-1.5B-Instruct 45 | 46 | # Or using ModelScope (Recommended for users in China) 47 | modelscope download Qwen/Qwen2.5-1.5B-Instruct --local_dir Qwen2.5-1.5B-Instruct 48 | ``` 49 | 50 | #### 2. Model Testing 51 | 52 | ```bash 53 | # Text conversation testing 54 | llmexport --path Qwen2.5-1.5B-Instruct --test "Hello, please introduce yourself" 55 | 56 | # Multimodal testing (Image + Text) 57 | llmexport --path Qwen2-VL-2B-Instruct --test "<img>image_url</img>Describe this image" 58 | ``` 59 | 60 | #### 3. Model Export 61 | 62 | ```bash 63 | # Export to ONNX format 64 | llmexport --path Qwen2.5-1.5B-Instruct --export onnx 65 | 66 | # Export to MNN format (Default 4bit quantization) 67 | llmexport --path Qwen2.5-1.5B-Instruct --export mnn 68 | 69 | # Custom quantization parameters 70 | llmexport --path Qwen2.5-1.5B-Instruct --export mnn --quant_bit 8 --quant_block 128 71 | ``` 72 | 73 | ## 🔧 Advanced Features 74 | 75 | ### Model Export Options 76 | 77 | - **ONNX Export**: Use `--export onnx` to export to ONNX format 78 | - **MNN Export**: Use `--export mnn` to export to MNN format 79 | - **Model Optimization**: OnnxSlim optimization enabled by default, use `--onnx_slim` to explicitly enable 80 | 81 | ### Quantization Configuration 82 | 83 | - **Quantization Bits**: `--quant_bit 4/8` (Default 4bit) 84 | - **Quantization Block Size**: `--quant_block 64/128` (Default 64) 85 | - **LM Head Quantization**: `--lm_quant_bit` separate setting for output layer quantization 86 | - **Symmetric Quantization**: `--sym` enable symmetric quantization (no zero point) 87 | 88 | ### Quantization Algorithm Support 89 | 90 | - **AWQ Quantization**: `--awq` enable AWQ quantization 91 | - **HQQ Quantization**: `--hqq` enable HQQ quantization 92 | - **GPTQ Quantization**: `--gptq_path` load GPTQ quantized model 93 | - **Smooth Quantization**: `--smooth` enable Smooth quantization 94 | 95 | ### LoRA Support 96 | 97 | - **LoRA Merging**: `--lora_path` specify LoRA weight path 98 | - **LoRA Splitting**: `--lora_split` export LoRA weights separately 99 | 100 | ### Multimodal Support 101 | 102 | - **Visual Quantization**: `--visual_quant_bit`, `--visual_quant_block` set visual module quantization 103 | - **Visual Symmetric**: `--visual_sym` visual module symmetric quantization 104 | 105 | ### Other Options 106 | 107 | - **Verbose Output**: `--verbose` show detailed logs 108 | - **Performance Evaluation**: `--ppl` get logits for all tokens 109 | - **Custom Output**: `--dst_path` specify output directory (default `./model`) 110 |
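### Combined Example

The flags above can be combined in a single invocation. The sketch below is illustrative only: the model directory and LoRA path are placeholders, and every flag used here is documented in the parameter tables that follow.

```bash
# Hypothetical combined export: MNN output with AWQ, symmetric 4-bit weights,
# an 8-bit lm_head, and LoRA weights exported as a separate file.
llmexport --path Qwen2.5-1.5B-Instruct \
          --export mnn \
          --awq --sym \
          --quant_bit 4 --quant_block 64 \
          --lm_quant_bit 8 \
          --lora_path ./my-lora-adapter \
          --lora_split \
          --dst_path ./model
```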
111 | ## 📎 Command Line Parameters 112 | 113 | ### Basic Parameters 114 | 115 | | Parameter | Type | Description | 116 | |-----------|------|-------------| 117 | | `--path` | Required | Model path, supports local directory or Hugging Face model ID | 118 | | `--export` | Optional | Export format: `onnx` or `mnn` | 119 | | `--test` | Optional | Test query string | 120 | | `--dst_path` | Optional | Output directory (default `./model`) | 121 | | `--verbose` | Flag | Show detailed logs | 122 | 123 | ### Quantization Parameters 124 | 125 | | Parameter | Default | Description | 126 | |-----------|---------|-------------| 127 | | `--quant_bit` | 4 | Quantization bits (4 or 8) | 128 | | `--quant_block` | 64 | Quantization block size (0 means channel-wise) | 129 | | `--lm_quant_bit` | Same as `quant_bit` | LM Head layer quantization bits | 130 | | `--visual_quant_bit` | Model dependent | Visual module quantization bits | 131 | | `--visual_quant_block` | Model dependent | Visual module quantization block size | 132 | 133 | ### Quantization Algorithms 134 | 135 | | Parameter | Description | 136 | |-----------|-------------| 137 | | `--awq` | Enable AWQ quantization | 138 | | `--hqq` | Enable HQQ quantization | 139 | | `--smooth` | Enable Smooth quantization | 140 | | `--sym` | Enable symmetric quantization (no zero point) | 141 | | `--visual_sym` | Visual module symmetric quantization | 142 | 143 | ### LoRA Support 144 | 145 | | Parameter | Description | 146 | |-----------|-------------| 147 | | `--lora_path` | LoRA weight path | 148 | | `--lora_split` | Export LoRA weights separately | 149 | 150 | ### Other Options 151 | 152 | | Parameter | Description | 153 | |-----------|-------------| 154 | | `--tokenizer_path` | Tokenizer path (default uses `--path`) | 155 | | `--gptq_path` | GPTQ quantized model path | 156 | | `--mnnconvert` | Local MNNConvert path | 157 | | `--onnx_slim` | Enable ONNX-Slim optimization | 158 | | `--ppl` | Get logits for all tokens | 159 | | `--seperate_embed` | Separate embedding layer to avoid quantization | 160 | | `--calib_data` | Calibration data path | 161 | 162 | ## Command Args 163 | ``` 164 | usage: llmexport.py [-h] --path PATH [--type TYPE] [--tokenizer_path TOKENIZER_PATH] [--lora_path LORA_PATH] [--gptq_path GPTQ_PATH] [--dst_path DST_PATH] 165 | [--verbose] [--test TEST] [--export EXPORT] [--onnx_slim] [--quant_bit QUANT_BIT] [--quant_block QUANT_BLOCK] [--lm_quant_bit LM_QUANT_BIT] 166 | [--mnnconvert MNNCONVERT] [--ppl] [--awq] [--sym] [--tie_embed] [--lora_split] 167 | 168 | llm_exporter 169 | 170 | options: 171 | -h, --help show this help message and exit 172 | --path PATH path(`str` or `os.PathLike`): 173 | Can be either: 174 | - A string, the *model id* of a pretrained model like `THUDM/chatglm-6b`. [TODO] 175 | - A path to a *directory* cloned from a repo like `../chatglm-6b`. 176 | --type TYPE type(`str`, *optional*): 177 | The pretrained llm model type. 178 | --tokenizer_path TOKENIZER_PATH 179 | tokenizer path, default is `None`, meaning the `--path` value is used. 180 | --lora_path LORA_PATH 181 | lora path, default is `None`, meaning no LoRA is applied. 182 | --gptq_path GPTQ_PATH 183 | gptq path, default is `None`, meaning no GPTQ is applied. 184 | --dst_path DST_PATH export onnx/mnn model to path, default is `./model`. 185 | --verbose Whether or not to print verbose logs. 186 | --test TEST test model inference with query `TEST`. 187 | --export EXPORT export model to an onnx/mnn model. 188 | --onnx_slim Whether or not to use onnx-slim. 189 | --quant_bit QUANT_BIT 190 | mnn quant bit, 4 or 8, default is 4. 191 | --quant_block QUANT_BLOCK 192 | mnn quant block, default is 0, meaning channel-wise. 193 | --lm_quant_bit LM_QUANT_BIT 194 | mnn lm_head quant bit, 4 or 8, default is `quant_bit`. 195 | --mnnconvert MNNCONVERT 196 | local mnnconvert path; if invalid, pymnn is used. 197 | --ppl Whether or not to get all logits of input tokens. 198 | --awq Whether or not to use awq quant. 199 | --sym Whether or not to use symmetric quant (without zero point), default is False. 200 | --tie_embed Whether or not to use tie_embedding, default is False. 201 | --lora_split Whether or not to export lora split, default is False. 202 | ``` 203 | 204 | ## 📋 Supported Models 205 | 206 | Currently supports the following model types: 207 | 208 | ### Text Models 209 | - **Qwen Series**: Qwen2.5, Qwen2, Qwen1.5, Qwen-VL, etc. 210 | - **LLaMA Series**: Llama-3.2, Llama-3, Llama-2, etc.
211 | - **ChatGLM Series**: ChatGLM4, ChatGLM3, ChatGLM2, etc. 212 | - **Baichuan Series**: Baichuan2-7B-Chat, etc. 213 | - **Yi Series**: Yi-6B-Chat, etc. 214 | - **Others**: InternLM, DeepSeek, Phi, Gemma, TinyLlama, etc. 215 | 216 | ### Multimodal Models 217 | - **Vision Models**: Qwen2-VL, Qwen2.5-VL, Llama-3.2-Vision, InternVL, etc. 218 | - **Audio Models**: Qwen2-Audio, Qwen2.5-Omni, etc. 219 | 220 | ### Embedding Models 221 | - **Text Embedding**: bge-large-zh, gte-multilingual, etc. 222 | 223 | ## 💾 Model Downloads 224 | 225 | We provide optimized model downloads: 226 | 227 | - **Hugging Face**: [taobao-mnn](https://huggingface.co/taobao-mnn) 228 | - **ModelScope**: [MNN](https://modelscope.cn/organization/MNN) 229 | 230 | Popular models: 231 | 232 | | Model | Hugging Face | ModelScope | 233 | |-------|-------------|------------| 234 | | DeepSeek-R1-1.5B-Qwen | [Q4_1](https://huggingface.co/taobao-mnn/DeepSeek-R1-1.5B-Qwen-MNN) | [Q4_1](https://modelscope.cn/models/MNN/DeepSeek-R1-1.5B-Qwen-MNN) | 235 | | Qwen2.5-0.5B-Instruct | [Q4_1](https://huggingface.co/taobao-mnn/Qwen2.5-0.5B-Instruct-MNN) | [Q4_1](https://modelscope.cn/models/MNN/Qwen2.5-0.5B-Instruct-MNN) | 236 | | Qwen2.5-1.5B-Instruct | [Q4_1](https://huggingface.co/taobao-mnn/Qwen2.5-1.5B-Instruct-MNN) | [Q4_1](https://modelscope.cn/models/MNN/Qwen2.5-1.5B-Instruct-MNN) | 237 | | GPT-OSS-20B | [Q4_1](https://huggingface.co/taobao-mnn/gpt-oss-20b-MNN) | [Q4_1](https://modelscope.cn/models/MNN/gpt-oss-20b-MNN) | 238 | | Qwen3-4B-Instruct-2507 | [Q4_1](https://huggingface.co/taobao-mnn/Qwen3-4B-Instruct-2507-MNN) | [Q4_1](https://modelscope.cn/models/MNN/Qwen3-4B-Instruct-2507-MNN) | 239 | 240 | See the complete list for more models. 241 | 242 | ## 🔗 Related Projects 243 | 244 | - **MNN Inference**: [mnn-llm](https://github.com/wangzhaode/mnn-llm) - LLM inference library for MNN framework 245 | - **ONNX Inference**: [onnx-llm](https://github.com/wangzhaode/onnx-llm), [OnnxLLM](https://github.com/inisis/OnnxLLM) - ONNX format inference libraries 246 | - **Model Optimization**: [OnnxSlim](https://github.com/inisis/OnnxSlim) - ONNX model optimization tool 247 | 248 | ## 📄 License 249 | 250 | This project is licensed under the [MIT License](https://opensource.org/licenses/MIT). -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License.
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [2024] [wangzhaode] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /llmexport/utils/mtp.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import numpy as np 5 | from typing import Optional, List, Tuple 6 | 7 | from .transformers import Attention 8 | from .utils.custom_op import FakeLinear 9 | from .utils.spinner import spinner_run 10 | from .torch_utils import onnx_export 11 | 12 | class Mtp(torch.nn.Module): 13 | def __init__(self, mtp, base): 14 | super().__init__() 15 | self.model_type = base.model_type 16 | self.mtp = mtp 17 | self.embed_ = base.embed 18 | self.lm_ = base.lm 19 | 20 | self.config_ = base.config 21 | if not hasattr(base.config, 'head_dim'): 22 | self.config_.head_dim = base.head_dim 23 | self.config_.rotary = base.rotary 24 | self.config_.model_type = base.model_type 25 | self.config_.model_map = base.model_map 26 | self.hidden_size = base.hidden_size 27 | self.past_kv_shape = base.past_kv_shape 28 | self.num_attention_heads = base.num_attention_heads 29 | self.llm_config = base.llm_config 30 | self.load() 31 | self.unloaded_ops = {} 32 | 33 | 34 | @staticmethod 35 | def get_mtp(model_type): 36 | mtps = { 37 | 'mimo': MimoMtp, 38 | 'poi_qwen2_mtp' : PoiQwenMtp, 39 | } 40 | if model_type in mtps: 41 | return mtps[model_type] 42 | return None 43 | 44 | @spinner_run(f'export onnx model to ') 45 | def export(self, onnx_path): 46 | onnx_model = f'{onnx_path}/mtp.onnx' 47 | 48 | # unload linear weight to save export memory 49 | self.unload_param() 50 | 51 | self.seq_len = 3 52 | input_ids = torch.arange(3, dtype=torch.long) 53 | attention_mask = (1 - torch.tril(torch.ones([1, 1, self.seq_len, self.seq_len]))) * torch.finfo(torch.float32).min 54 | position_ids = torch.arange(self.seq_len, dtype=torch.int).unsqueeze(0) 55 | hidden_states = torch.ones([self.seq_len, 1, self.hidden_size], dtype=torch.float) 56 | 57 | # For export onnx, don't need image or audio's embedding 58 | input_embed = self.embed_(input_ids) 59 | past_key_values = torch.zeros(self.past_kv_shape[1:]) 60 | logits_index = torch.tensor([-1], dtype=torch.int32) 61 | # export to onnx 62 | with torch.no_grad(): 63 | onnx_export( 64 | self, (input_embed, hidden_states, attention_mask, position_ids, past_key_values, logits_index), 65 | onnx_model, 66 | input_names=[ 67 | 'input_embed', 'hidden_states', 68 | 'attention_mask', 'position_ids', 69 | 'past_key_values', 'logits_index' 70 | ], 71 | output_names=['logits', 'presents'], 72 | dynamic_axes={ 73 | "input_embed" : { 0: "seq_len" }, 74 | "hidden_states" : { 0: "seq_len" }, 75 | "attention_mask" : { 2: "seq_len", 3: "seq_len" }, 76 | "position_ids" : { 1: "seq_len" }, 77 | "past_key_values" : { 2: "history_len" } 78 | }) 79 | return onnx_model 80 | 81 | def load(self): 82 | raise NotImplementedError 83 | 84 | def forward(self, images): 85 | raise NotImplementedError 86 | 87 | 88 | class MimoMtp(Mtp): 89 | def __init__(self, mtp, base): 90 | super().__init__(mtp, base) 91 | 92 | def load(self): 93 | self.mtp.eval() 94 | self.token_layernorm = getattr(self.mtp[0], 'token_layernorm') 95 | self.hidden_layernorm = getattr(self.mtp[0], 'hidden_layernorm') 96 | self.input_proj = getattr(self.mtp[0], 'input_proj') 97 | self.input_layernorm = getattr(self.mtp[0], 'input_layernorm') 98 | self.self_attn = getattr(self.mtp[0], 'self_attn') 99 | self.post_attention_layernorm = getattr(self.mtp[0], 'post_attention_layernorm') 100 | self.mlp = 
getattr(self.mtp[0], 'mlp') 101 | self.final_layernorm = getattr(self.mtp[0], 'final_layernorm') 102 | self.self_attn = Attention(self.self_attn, 0, self.config_) 103 | 104 | def unload_param(self): 105 | def build_faker(real, name): 106 | faker = FakeLinear(real.in_features, real.out_features, real.bias is not None, name) 107 | self.unloaded_ops[name] = real 108 | return faker 109 | # replace linear with fakelinear to save export memory and time 110 | with torch.no_grad(): 111 | # different kv cache shape in different layers 112 | if isinstance(self.num_attention_heads, list): 113 | self.self_attn.export_fused_attn = True 114 | for name, child in self.self_attn.named_children(): 115 | if isinstance(child, torch.nn.Linear): 116 | setattr(self.self_attn, name, build_faker(child, f'/mtp_layers.0/self_attn/{name}/Linear')) 117 | for name, child in self.mlp.named_children(): 118 | if isinstance(child, torch.nn.Linear): 119 | setattr(self.mlp, name, build_faker(child, f'/mtp_layers.0/mlp/{name}/Linear')) 120 | self.input_proj = build_faker(self.input_proj, f'/mtp/input_proj/Linear') 121 | 122 | def forward(self, 123 | input_embeds: torch.Tensor, 124 | hidden_states: torch.Tensor, 125 | attention_mask: torch.Tensor, 126 | position_ids: torch.Tensor, 127 | past_key_values: Optional[Tuple[torch.Tensor]] = None, 128 | logits_index: int = -1 129 | ): 130 | input_embeds = input_embeds.view(1, -1, self.hidden_size) 131 | hidden_states = hidden_states.view(1, -1, self.hidden_size) 132 | hidden_states = hidden_states[:, 0 : input_embeds.size(1), :] 133 | 134 | input_embeds = self.token_layernorm(input_embeds) 135 | previous_hidden_states = self.hidden_layernorm(hidden_states) 136 | hidden_states = self.input_proj(torch.cat([previous_hidden_states, input_embeds], dim=-1)) 137 | residual = hidden_states 138 | hidden_states = self.input_layernorm(hidden_states) 139 | 140 | rotary_pos_emb = self.config_.rotary(position_ids) 141 | 142 | # Self Attention 143 | hidden_states, present_key_value = self.self_attn( 144 | hidden_states=hidden_states, 145 | rotary_pos_emb=rotary_pos_emb, 146 | attention_mask=attention_mask, 147 | past_key_value=past_key_values, 148 | cross_attention_states=None, 149 | ) 150 | 151 | hidden_states = residual + hidden_states 152 | residual = hidden_states 153 | hidden_states = self.post_attention_layernorm(hidden_states) 154 | hidden_states = self.mlp(hidden_states) 155 | hidden_states = residual + hidden_states 156 | 157 | hidden_states = hidden_states[:, logits_index:, :] 158 | hidden_states = self.final_layernorm(hidden_states) 159 | 160 | logits = self.lm_(hidden_states) 161 | return logits, present_key_value 162 | 163 | class PoiQwenMtp(Mtp): 164 | def __init__(self, mtp, base): 165 | self.num_mtp_layers = 2 166 | super().__init__(mtp, base) 167 | 168 | def load(self): 169 | self.mtp[0].eval() 170 | self.mtp[1].eval() 171 | self.decode_layers = nn.ModuleList([]) 172 | self.hidden_norm = nn.ModuleList([]) 173 | self.last_norm = nn.ModuleList([]) 174 | 175 | with torch.no_grad(): 176 | for i in range(self.num_mtp_layers): 177 | self.decode_layers.append(getattr(self.mtp[i], 'layers')) 178 | self.hidden_norm.append(getattr(self.mtp[i], 'RMSorm_MTP_1')) 179 | self.last_norm.append(getattr(self.mtp[i], 'norm')) 180 | 181 | self.input_layernorm = nn.ModuleList([]) 182 | self.post_attention_layernorm = nn.ModuleList([]) 183 | self.mlp = nn.ModuleList([]) 184 | self.self_attn = nn.ModuleList([]) 185 | 186 | with torch.no_grad(): 187 | for i in range(self.num_mtp_layers): 188 | 
self.input_layernorm.append(getattr(self.decode_layers[i], 'input_layernorm')) 189 | self.ori_attn = getattr(self.decode_layers[i], 'self_attn') 190 | self.post_attention_layernorm.append(getattr(self.decode_layers[i], 'post_attention_layernorm')) 191 | self.mlp.append(getattr(self.decode_layers[i], 'mlp')) 192 | self.self_attn.append(Attention(self.ori_attn, i, self.config_)) 193 | 194 | def unload_param(self): 195 | def build_faker(real, name): 196 | faker = FakeLinear(real.in_features, real.out_features, real.bias is not None, name) 197 | self.unloaded_ops[name] = real 198 | return faker 199 | # replace linear with fakelinear to save export memory and time 200 | with torch.no_grad(): 201 | for i in range(self.num_mtp_layers): 202 | # different kv cache shape in different layers 203 | if isinstance(self.num_attention_heads, list): 204 | self.self_attn[i].export_fused_attn = True 205 | for name, child in self.self_attn[i].named_children(): 206 | if isinstance(child, torch.nn.Linear): 207 | setattr(self.self_attn[i], name, build_faker(child, f'/mtp_layers.{i}/self_attn/{name}/Linear')) 208 | for name, child in self.mlp[i].named_children(): 209 | if isinstance(child, torch.nn.Linear): 210 | setattr(self.mlp[i], name, build_faker(child, f'/mtp_layers.{i}/mlp/{name}/Linear')) 211 | 212 | def forward(self, 213 | input_embeds: torch.Tensor, 214 | hidden_states: torch.Tensor, 215 | attention_mask: torch.Tensor, 216 | position_ids: torch.Tensor, 217 | past_key_values: Optional[Tuple[torch.Tensor]] = None, 218 | logits_index: int = -1 219 | ): 220 | present_key_value = [] 221 | # [1, -1, self.hidden_size] 222 | mtp_hidden_states = [] 223 | 224 | rotary_pos_emb = self.config_.rotary(position_ids) 225 | hidden_states = hidden_states.view(1, -1, self.hidden_size) 226 | hidden_states = hidden_states[:, 0 : input_embeds.size(0), :] 227 | 228 | for i in range(self.num_mtp_layers): 229 | # first norm 230 | hidden_states = self.hidden_norm[i](hidden_states) 231 | 232 | # Decoder Layer 233 | residual = hidden_states 234 | hidden_states = self.input_layernorm[i](hidden_states) 235 | 236 | # Self Attention 237 | hidden_states, kv = self.self_attn[i]( 238 | hidden_states=hidden_states, 239 | rotary_pos_emb=rotary_pos_emb, 240 | attention_mask=attention_mask, 241 | past_key_value=past_key_values, 242 | cross_attention_states=None, 243 | ) 244 | present_key_value.append(kv) 245 | 246 | hidden_states = residual + hidden_states 247 | residual = hidden_states 248 | hidden_states = self.post_attention_layernorm[i](hidden_states) 249 | hidden_states = self.mlp[i](hidden_states) 250 | hidden_states = residual + hidden_states 251 | 252 | # last norm 253 | hidden_states = self.last_norm[i](hidden_states) 254 | 255 | mtp_hidden_states.append(hidden_states) 256 | hidden_states = mtp_hidden_states[i] 257 | 258 | for i in range(self.num_mtp_layers): 259 | mtp_hidden_states[i] = mtp_hidden_states[i][:, logits_index:, :] 260 | 261 | mtp_logits = self.lm_(mtp_hidden_states[0]) 262 | for i in range(self.num_mtp_layers-1): 263 | logits = self.lm_(mtp_hidden_states[i+1]) 264 | mtp_logits = torch.cat([mtp_logits, logits], dim=0) 265 | return mtp_logits, present_key_value -------------------------------------------------------------------------------- /llmexport/utils/eagle.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import numpy as np 5 | import os 6 | from typing import Optional, List, Tuple 7 | from .transformers import 
Attention 8 | from .transformers import RMSNorm 9 | from .transformers import Rotary 10 | from .transformers import Embedding 11 | from .utils.custom_op import FakeLinear 12 | from transformers.configuration_utils import PretrainedConfig 13 | from transformers.activations import ACT2FN 14 | from .utils.spinner import spinner_run 15 | from .torch_utils import onnx_export 16 | 17 | 18 | class Eagle(torch.nn.Module): 19 | def __init__(self, eagle_path, base): 20 | super().__init__() 21 | # load eagle config.json 22 | config_file_path = eagle_path + "/config.json" 23 | self.eagle_config = PretrainedConfig.from_json_file(config_file_path) 24 | 25 | self.model_type = base.model_type 26 | self.eagle_path = eagle_path 27 | 28 | self.config = base.config 29 | if not hasattr(base.config, 'head_dim'): 30 | self.config.head_dim = base.head_dim 31 | 32 | self.rope_theta = 10000 33 | self.rope_ratio = 1.0 34 | self.head_dim = self.config.head_dim 35 | self.config.model_type = base.model_type 36 | self.config.model_map = base.model_map 37 | self.hidden_size = base.hidden_size 38 | if self.eagle_config.hidden_size != self.hidden_size: 39 | raise RuntimeError(f'eagle_config hidden_size not equal: {self.eagle_config.hidden_size}, {self.hidden_size}!') 40 | self.past_kv_shape = base.past_kv_shape 41 | self.num_attention_heads = base.num_attention_heads 42 | self.llm_config = base.llm_config 43 | 44 | self.head_dim = self.config.head_dim 45 | self.num_key_value_heads = self.config.num_key_value_heads 46 | self.num_key_value_groups = self.num_attention_heads // self.num_key_value_heads 47 | # self.config.head_dim = self.head_dim 48 | self.config.rotary = Rotary(self) 49 | # eagle config params 50 | self.padding_idx = self.eagle_config.pad_token_id 51 | self.vocab_size = self.eagle_config.vocab_size 52 | self.draft_vocab_size = self.eagle_config.draft_vocab_size 53 | # embed_tokens api 54 | self.embed_tokens = nn.Embedding(self.vocab_size, self.hidden_size, self.padding_idx) 55 | if not hasattr(self.eagle_config, "target_hidden_size"): 56 | self.embed_tokens.weight = base.embed.embed.weight 57 | 58 | # fc api 59 | if hasattr(self.eagle_config, "target_hidden_size"): 60 | self.fc = nn.Linear(self.eagle_config.target_hidden_size * 3, self.hidden_size, bias=False) 61 | else: 62 | self.fc = nn.Linear(self.hidden_size * 3, self.hidden_size, bias=False) 63 | 64 | self.midlayer = nn.Module() 65 | # midlayer.hidden_norm 66 | self.midlayer.hidden_norm = RMSNorm(self.hidden_size, eps=self.eagle_config.rms_norm_eps) 67 | # midlayer.input_layernorm 68 | self.midlayer.input_layernorm = RMSNorm(self.hidden_size, eps=self.eagle_config.rms_norm_eps) 69 | # midlayer.self_attn 70 | self.midlayer.self_attn = Attention(None, 0, self.config) 71 | self.midlayer.self_attn.q_proj = nn.Linear(self.hidden_size * 2, self.num_attention_heads * self.head_dim, bias=False) 72 | self.midlayer.self_attn.k_proj = nn.Linear(self.hidden_size * 2, self.num_key_value_heads * self.head_dim, bias=False) 73 | self.midlayer.self_attn.v_proj = nn.Linear(self.hidden_size * 2, self.num_key_value_heads * self.head_dim, bias=False) 74 | self.midlayer.self_attn.o_proj = nn.Linear(self.num_attention_heads * self.head_dim, self.hidden_size, bias=False) 75 | # midlayer.post_attention_layernorm 76 | self.midlayer.post_attention_layernorm = RMSNorm(self.hidden_size, eps=self.eagle_config.rms_norm_eps) 77 | # midlayer.mlp 78 | self.midlayer.mlp = nn.Module() 79 | self.intermediate_size = self.eagle_config.intermediate_size 80 | self.midlayer.mlp.gate_proj = 
nn.Linear(self.hidden_size, self.intermediate_size, bias=False) 81 | self.midlayer.mlp.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) 82 | self.midlayer.mlp.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) 83 | self.midlayer.mlp.act_fn = ACT2FN[self.eagle_config.hidden_act] 84 | 85 | # norm api 86 | self.norm = RMSNorm(self.hidden_size, eps=self.eagle_config.rms_norm_eps) 87 | # lm_head api 88 | self.lm_head = nn.Linear(self.hidden_size, self.draft_vocab_size,bias=False) 89 | # logsoftmax api 90 | self.logsoftmax = nn.LogSoftmax(dim=-1) 91 | # d2t 92 | d2t = torch.zeros((self.draft_vocab_size), dtype=torch.int64) 93 | self.register_buffer("d2t", d2t) 94 | 95 | self.load() 96 | self.unloaded_ops = {} 97 | 98 | def unload_param(self): 99 | def build_faker(real, name): 100 | faker = FakeLinear(real.in_features, real.out_features, real.bias is not None, name) 101 | self.unloaded_ops[name] = real 102 | return faker 103 | # replace linear with fakelinear to save export memory and time 104 | with torch.no_grad(): 105 | # different kv cache shape in different layers 106 | if isinstance(self.num_attention_heads, list): 107 | self.midlayer.self_attn.export_fused_attn = True 108 | for name, child in self.midlayer.self_attn.named_children(): 109 | if isinstance(child, torch.nn.Linear): 110 | setattr(self.midlayer.self_attn, name, build_faker(child, f'/eagle_layers.0/self_attn/{name}/Linear')) 111 | for name, child in self.midlayer.mlp.named_children(): 112 | if isinstance(child, torch.nn.Linear): 113 | setattr(self.midlayer.mlp, name, build_faker(child, f'/eagle_layers.0/mlp/{name}/Linear')) 114 | self.lm_head = build_faker(self.lm_head, f'/eagle/lm_head/Linear') 115 | self.fc = build_faker(self.fc, f'/eagle/fc/Linear') 116 | 117 | @staticmethod 118 | def get_eagle(model_type): 119 | eagles = { 120 | 'llama': LlamaEagle, 121 | 'qwen3': LlamaEagle, 122 | } 123 | if model_type in eagles: 124 | return eagles[model_type] 125 | return None 126 | 127 | @spinner_run(f'export onnx model to ') 128 | def export(self, onnx_path): 129 | # save d2t to file 130 | import MNN.expr as expr 131 | torch_d2t = self.d2t.detach().to(torch.int32).contiguous().cpu() 132 | mnn_d2t = expr.const(torch_d2t.data_ptr(), torch_d2t.shape, expr.data_format.NHWC, expr.dtype.int) 133 | mnn_d2t.name = 'd2t' 134 | expr.save([mnn_d2t], f'{onnx_path}/../eagle_d2t.mnn') 135 | 136 | eagle_model = f'{onnx_path}/eagle.onnx' 137 | eagle_fc_model = f'{onnx_path}/eagle_fc.onnx' 138 | # unload linear weight to save export memory 139 | # self.unload_param() 140 | 141 | self.seq_len = 3 142 | input_ids = torch.arange(3, dtype=torch.long) 143 | attention_mask = (1 - torch.tril(torch.ones([1, 1, self.seq_len, self.seq_len]))) * torch.finfo(torch.float32).min 144 | position_ids = torch.arange(self.seq_len, dtype=torch.int).unsqueeze(0) 145 | hidden_states = torch.ones([1, self.seq_len, self.hidden_size], dtype=torch.float) 146 | logits_index = torch.tensor([-1], dtype=torch.int32) 147 | 148 | fc_hidden = torch.ones([1, self.seq_len, self.hidden_size * 3], dtype=torch.float) 149 | 150 | # For export onnx, don't need image or audio's embedding 151 | input_embed = self.embed_tokens(input_ids) 152 | past_key_values = torch.zeros(self.past_kv_shape[1:-1] + [self.head_dim]) 153 | # export to onnx 154 | with torch.no_grad(): 155 | onnx_export(self.fc, (fc_hidden), 156 | eagle_fc_model, 157 | input_names=['fc_hidden'], 158 | output_names=['hidden_states'], 159 | dynamic_axes={ "fc_hidden" : { 1: "seq_len" } 
}) 160 | onnx_export( 161 | self, (input_embed, hidden_states, attention_mask, position_ids, past_key_values, logits_index), 162 | eagle_model, 163 | input_names=[ 164 | 'input_embed', 'hidden_states', 165 | 'attention_mask', 'position_ids', 166 | 'past_key_values', 'logits_index' 167 | ], 168 | output_names=['logits', 'out_hidden_states', 'presents'], 169 | dynamic_axes={ 170 | "input_embed" : { 0: "seq_len" }, 171 | "hidden_states" : { 1: "seq_len" }, 172 | "attention_mask" : { 2: "seq_len", 3: "seq_len" }, 173 | "position_ids" : { 1: "seq_len" }, 174 | "past_key_values" : { 2: "history_len" } 175 | }) 176 | return eagle_model, eagle_fc_model 177 | 178 | def load(self): 179 | raise NotImplementedError 180 | 181 | def forward(self, images): 182 | raise NotImplementedError 183 | 184 | class LlamaEagle(Eagle): 185 | def __init__(self, eagle_path, base): 186 | super().__init__(eagle_path, base) 187 | 188 | def load(self): 189 | safetensors_path = os.path.join(self.eagle_path, "model.safetensors") 190 | bin_path = os.path.join(self.eagle_path, "pytorch_model.bin") 191 | ea_layer_state_dict = None 192 | if os.path.exists(safetensors_path): 193 | from safetensors.torch import load_file 194 | ea_layer_state_dict = load_file(safetensors_path, device="cpu") 195 | elif os.path.exists(bin_path): 196 | ea_layer_state_dict = torch.load(bin_path, map_location="cpu") 197 | else: 198 | raise FileNotFoundError( 199 | f"Eagle path '{self.eagle_path}' not found 'model.safetensors' or 'pytorch_model.bin'." 200 | ) 201 | self.load_state_dict(ea_layer_state_dict, strict=False) 202 | 203 | def forward(self, 204 | input_embeds: torch.Tensor, 205 | hidden_states: torch.Tensor, 206 | attention_mask: torch.Tensor, 207 | position_ids: torch.Tensor, 208 | past_key_values: Optional[Tuple[torch.Tensor]] = None, 209 | logits_index: int = -1 210 | ): 211 | # hidden_states = self.fc(hidden_states) 212 | hidden_states = hidden_states.view(1, -1, self.hidden_size) 213 | input_embeds = input_embeds.view(1, -1, self.hidden_size) 214 | 215 | residual = hidden_states 216 | 217 | input_embeds = self.midlayer.input_layernorm(input_embeds) 218 | previous_hidden_states = self.midlayer.hidden_norm(hidden_states) 219 | hidden_states = torch.cat([input_embeds, previous_hidden_states], dim=-1) 220 | 221 | rotary_pos_emb = self.config.rotary(position_ids) 222 | 223 | # Self Attention 224 | hidden_states, present_key_value = self.midlayer.self_attn( 225 | hidden_states=hidden_states, 226 | rotary_pos_emb=rotary_pos_emb, 227 | attention_mask=attention_mask, 228 | past_key_value=past_key_values, 229 | cross_attention_states=None, 230 | ) 231 | 232 | hidden_states = residual + hidden_states 233 | residual = hidden_states 234 | hidden_states = self.midlayer.post_attention_layernorm(hidden_states) 235 | hidden_states = self.midlayer.mlp.down_proj(self.midlayer.mlp.act_fn(self.midlayer.mlp.gate_proj(hidden_states)) * self.midlayer.mlp.up_proj(hidden_states)) 236 | hidden_states = residual + hidden_states 237 | 238 | hidden_states = hidden_states[:, logits_index:, :] 239 | last_hidden = self.norm(hidden_states) 240 | 241 | logits = self.lm_head(last_hidden) 242 | logits = self.logsoftmax(logits) 243 | return logits, hidden_states, present_key_value 244 | -------------------------------------------------------------------------------- /llmexport/utils/audio.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .transformers import Decoder 3 | from .spinner import spinner_run 4 | from 
.torch_utils import onnx_export 5 | 6 | class Audio(torch.nn.Module): 7 | def __init__(self, audio, base): 8 | super().__init__() 9 | self.model_type = base.model_type 10 | self.audio = audio 11 | self.embed_ = base.embed 12 | self.tokenizer = base.tokenizer 13 | self.config = base.config 14 | self.hidden_size = base.hidden_size 15 | self.llm_config = base.llm_config 16 | self.rope_ratio = 1.0 17 | self.quant_bit = 16 18 | self.init_config() 19 | self.load() 20 | 21 | @staticmethod 22 | def get_audio(model_type): 23 | audio_models = { 24 | 'qwen2_audio': Qwen2Audio, 25 | 'qwen2_5_omni_audio_encoder': Qwen2_5OmniAudio, 26 | } 27 | if model_type in audio_models: 28 | return audio_models[model_type] 29 | return None 30 | 31 | def init_config(self): 32 | self.llm_config['is_audio'] = True 33 | 34 | def load(self): 35 | raise NotImplementedError 36 | 37 | def str_to_ids(self, prompt): 38 | input_ids = self.tokenizer(prompt, return_tensors="pt")['input_ids'] 39 | return input_ids 40 | 41 | def forward(self, images): 42 | raise NotImplementedError 43 | 44 | def embed(self, input_ids, images = None, videos = None): 45 | raise NotImplementedError 46 | 47 | def export(self, onnx_path): 48 | raise NotImplementedError 49 | 50 | class Qwen2Audio(Audio): 51 | def __init__(self, audio, base): 52 | super().__init__(audio, base) 53 | self.audio_embeds = None 54 | self.audio_pad_id = 151646 55 | self.n_fft = 400 56 | self.sampling_rate = 16000 57 | self.hop_length = 160 58 | self.chunk_length = 30 59 | self.feature_size = 128 60 | self.n_samples = self.chunk_length * self.sampling_rate 61 | self.max_length = self.n_samples // self.hop_length 62 | from transformers.audio_utils import mel_filter_bank 63 | self.mel_filters = mel_filter_bank( 64 | num_frequency_bins=1 + self.n_fft // 2, 65 | num_mel_filters=self.feature_size, 66 | min_frequency=0.0, 67 | max_frequency=8000.0, 68 | sampling_rate=self.sampling_rate, 69 | norm="slaney", 70 | mel_scale="slaney", 71 | ) 72 | 73 | def load(self): 74 | # model 75 | self.audio_tower = self.audio.audio_tower 76 | self.multi_modal_projector = self.audio.multi_modal_projector 77 | # config 78 | self.llm_config['is_audio'] = True 79 | 80 | def str_to_ids(self, prompt): 81 | if '<audio>' in prompt: 82 | import re 83 | from io import BytesIO 84 | from urllib.request import urlopen 85 | import librosa 86 | pattern = r'(<audio>.*?</audio>)' 87 | parts = re.split(pattern, prompt) 88 | txt_prompt = '' 89 | for part in parts: 90 | if re.match(pattern, part): 91 | audio_content = re.search(r'<audio>(.*?)</audio>', part).group(1) 92 | if audio_content.startswith('http://') or audio_content.startswith('https://'): 93 | audio_obj = librosa.load(BytesIO(urlopen(audio_content).read()), sr=self.sampling_rate)[0] 94 | else: 95 | # local file 96 | audio_obj = librosa.load(audio_content, sr=self.sampling_rate)[0] 97 | audio_embed_len = self.audio_process(audio_obj) 98 | audio_pad_str = '<|AUDIO|>' * audio_embed_len 99 | txt_prompt += audio_pad_str 100 | else: 101 | txt_prompt += part 102 | else: 103 | txt_prompt = prompt 104 | input_ids = self.tokenizer(txt_prompt, return_tensors="pt")['input_ids'] 105 | return input_ids 106 | 107 | def forward(self, input_features): 108 | input_features = input_features.to(dtype=self.audio_tower.conv1.weight.dtype, device=self.audio_tower.conv1.weight.device) 109 | inputs_embeds = torch.nn.functional.gelu(self.audio_tower.conv1(input_features)) 110 | inputs_embeds = torch.nn.functional.gelu(self.audio_tower.conv2(inputs_embeds)) 111 | inputs_embeds = inputs_embeds.permute(0, 2, 1) 112 | _, seq_len, 
_ = inputs_embeds.shape 113 | embed_pos = self.audio_tower.embed_positions.weight[:seq_len, :] 114 | hidden_states = inputs_embeds + embed_pos 115 | for encoder_layer in self.audio_tower.layers: 116 | hidden_states = encoder_layer(hidden_states, None, None)[0] 117 | hidden_states = hidden_states.permute(0, 2, 1) 118 | hidden_states = self.audio_tower.avg_pooler(hidden_states) 119 | hidden_states = hidden_states.permute(0, 2, 1) 120 | hidden_states = self.audio_tower.layer_norm(hidden_states) 121 | audio_features = self.multi_modal_projector(hidden_states) 122 | return audio_features 123 | 124 | def _torch_extract_fbank_features(self, waveform): 125 | window = torch.hann_window(self.n_fft) 126 | stft = torch.stft(waveform, self.n_fft, self.hop_length, window=window, return_complex=True) 127 | magnitudes = stft[..., :-1].abs() ** 2 128 | mel_filters = torch.from_numpy(self.mel_filters).type(torch.float32) 129 | mel_spec = mel_filters.T @ magnitudes 130 | log_spec = torch.clamp(mel_spec, min=1e-10).log10() 131 | if waveform.dim() == 2: 132 | max_val = log_spec.max(dim=2, keepdim=True)[0].max(dim=1, keepdim=True)[0] 133 | log_spec = torch.maximum(log_spec, max_val - 8.0) 134 | else: 135 | log_spec = torch.maximum(log_spec, log_spec.max() - 8.0) 136 | log_spec = (log_spec + 4.0) / 4.0 137 | return log_spec 138 | 139 | def audio_process(self, audio_obj): 140 | # audio_obj = np.pad(audio_obj, (0, self.n_samples - audio_obj.shape[0])) 141 | waveform = torch.from_numpy(audio_obj).type(torch.float32) 142 | input_features = self._torch_extract_fbank_features(waveform).unsqueeze(0) 143 | audio_embeds = self.forward(input_features) 144 | self.audio_embeds = audio_embeds.permute([1, 0, 2]) 145 | return self.audio_embeds.shape[0] 146 | 147 | def embed(self, input_ids, images = None, videos = None): 148 | input_embeds = self.embed_(input_ids) 149 | if self.audio_embeds is not None: 150 | audio_mask = (input_ids == self.audio_pad_id).squeeze() 151 | input_embeds[audio_mask] = self.audio_embeds.type(input_embeds.dtype) 152 | return input_embeds 153 | 154 | @spinner_run(f'export audio to ') 155 | def export(self, onnx_path): 156 | input_features = torch.randn((1, self.feature_size, self.max_length)) 157 | 158 | model = self.float() 159 | onnx_model = f'{onnx_path}/audio.onnx' 160 | onnx_export(model, (input_features), 161 | onnx_model, 162 | input_names=['input_features'], 163 | output_names=['audio_embeds'], 164 | dynamic_axes={"input_features": { 165 | 2: "size" 166 | }}) 167 | return onnx_model 168 | 169 | class AudioMlp(torch.nn.Module): 170 | def __init__(self, fc1, fc2, act): 171 | super().__init__() 172 | self.fc1 = fc1 173 | self.fc2 = fc2 174 | self.act = act 175 | 176 | def forward(self, hidden_states): 177 | hidden_states = self.fc1(hidden_states) 178 | hidden_states = self.act(hidden_states) 179 | hidden_states = self.fc2(hidden_states) 180 | return hidden_states 181 | 182 | class Qwen2_5OmniAudio(Qwen2Audio): 183 | def __init__(self, audio, base): 184 | super().__init__(audio, base) 185 | self.quant_bit = 4 186 | 187 | def load(self): 188 | # config 189 | config = self.audio.config 190 | self.n_window = config.n_window 191 | self.llm_config['is_audio'] = True 192 | self.llm_config['n_window'] = self.n_window 193 | self.hidden_size = config.d_model 194 | self.num_attention_heads = config.encoder_attention_heads 195 | self.num_key_value_heads = self.num_attention_heads 196 | self.head_dim = self.hidden_size // self.num_attention_heads 197 | self.rotary = None 198 | self.model_map = { 199 | 
'decoder': { 200 | 'self_attn': 'self_attn', 201 | 'input_layernorm': 'self_attn_layer_norm', 202 | 'post_attention_layernorm': 'final_layer_norm' 203 | }, 204 | 'attention': { 205 | 'q_proj': 'q_proj', 206 | 'k_proj': 'k_proj', 207 | 'v_proj': 'v_proj', 208 | 'o_proj': 'out_proj' 209 | } 210 | } 211 | self.blocks = [] 212 | for layer in self.audio.layers: 213 | layer_id = len(self.blocks) 214 | block = Decoder(layer, layer_id, self) 215 | block.mlp = AudioMlp(layer.fc1, layer.fc2, layer.activation_fn) 216 | self.blocks.append(block) 217 | 218 | def forward(self, input_features, attention_mask = None): 219 | input_features = input_features.to(dtype=self.audio.conv1.weight.dtype, device=self.audio.conv1.weight.device) 220 | inputs_embeds = torch.nn.functional.gelu(self.audio.conv1(input_features)) 221 | inputs_embeds = torch.nn.functional.gelu(self.audio.conv2(inputs_embeds)) 222 | inputs_embeds = inputs_embeds.permute(0, 2, 1) 223 | _, seq_len, _ = inputs_embeds.shape 224 | embed_pos = self.audio.positional_embedding.positional_embedding[:seq_len, :] 225 | hidden_states = inputs_embeds + embed_pos 226 | for block in self.blocks: 227 | hidden_states = block(hidden_states, attention_mask=attention_mask)[0] 228 | hidden_states = hidden_states.permute(0, 2, 1) 229 | hidden_states = self.audio.avg_pooler(hidden_states) 230 | hidden_states = hidden_states.permute(0, 2, 1) 231 | hidden_states = self.audio.ln_post(hidden_states) 232 | audio_features = self.audio.proj(hidden_states) 233 | return audio_features 234 | 235 | def audio_process(self, audio_obj): 236 | # audio_obj = np.pad(audio_obj, (0, self.n_samples - audio_obj.shape[0])) 237 | waveform = torch.from_numpy(audio_obj).type(torch.float32) 238 | input_features = self._torch_extract_fbank_features(waveform).unsqueeze(0) 239 | _, _, seq_len = input_features.shape 240 | seq_len = int(seq_len // 2) 241 | cu_seqlens = [i for i in range(0, seq_len, self.n_window)] 242 | if seq_len % self.n_window != 0: 243 | cu_seqlens.append(seq_len) 244 | cu_seqlens = torch.tensor(cu_seqlens) 245 | attention_mask = torch.full( 246 | [1, seq_len, seq_len], torch.finfo(torch.float32).min 247 | ) 248 | for i in range(1, len(cu_seqlens)): 249 | attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = 0 250 | audio_embeds = self.forward(input_features, attention_mask) 251 | self.audio_embeds = audio_embeds.permute([1, 0, 2]) 252 | return self.audio_embeds.shape[0] 253 | 254 | @spinner_run(f'export audio to ') 255 | def export(self, onnx_path): 256 | input_features = torch.randn((1, self.feature_size, self.max_length)) 257 | seq_len = self.max_length // 2 258 | attention_mask = torch.randn([1, seq_len, seq_len]) 259 | model = self.float() 260 | onnx_model = f'{onnx_path}/audio.onnx' 261 | onnx_export(model, (input_features, attention_mask), 262 | onnx_model, 263 | input_names=['input_features', 'attention_mask'], 264 | output_names=['audio_embeds'], 265 | dynamic_axes={"input_features": { 266 | 0: "size" 267 | }, "attention_mask": { 268 | 1: "size", 2: "size" 269 | }}) 270 | return onnx_model -------------------------------------------------------------------------------- /llmexport/gguf/gguf_reader.py: -------------------------------------------------------------------------------- 1 | # 2 | # GGUF file reading/modification support. For API usage information, 3 | # please see the files scripts/ for some fairly simple examples. 
4 | # 5 | from __future__ import annotations 6 | 7 | import logging 8 | import os 9 | from collections import OrderedDict 10 | from typing import Any, Literal, NamedTuple, TypeVar, Union 11 | 12 | import numpy as np 13 | import numpy.typing as npt 14 | 15 | def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType) -> tuple[int, ...]: 16 | block_size, type_size = GGML_QUANT_SIZES[quant_type] 17 | if shape[-1] % block_size != 0: 18 | raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})") 19 | return (*shape[:-1], shape[-1] // block_size * type_size) 20 | 21 | if __name__ == "__main__": 22 | import sys 23 | from pathlib import Path 24 | 25 | # Allow running file in package as a script. 26 | sys.path.insert(0, str(Path(__file__).parent.parent)) 27 | 28 | from gguf.constants import ( 29 | GGML_QUANT_SIZES, 30 | GGUF_DEFAULT_ALIGNMENT, 31 | GGUF_MAGIC, 32 | GGUF_VERSION, 33 | GGMLQuantizationType, 34 | GGUFValueType, 35 | ) 36 | 37 | logger = logging.getLogger(__name__) 38 | 39 | READER_SUPPORTED_VERSIONS = [2, GGUF_VERSION] 40 | 41 | 42 | class ReaderField(NamedTuple): 43 | # Offset to start of this field. 44 | offset: int 45 | 46 | # Name of the field (not necessarily from file data). 47 | name: str 48 | 49 | # Data parts. Some types have multiple components, such as strings 50 | # that consist of a length followed by the string data. 51 | parts: list[npt.NDArray[Any]] = [] 52 | 53 | # Indexes into parts that we can call the actual data. For example 54 | # an array of strings will be populated with indexes to the actual 55 | # string data. 56 | data: list[int] = [-1] 57 | 58 | types: list[GGUFValueType] = [] 59 | 60 | 61 | class ReaderTensor(NamedTuple): 62 | name: str 63 | tensor_type: GGMLQuantizationType 64 | shape: npt.NDArray[np.uint32] 65 | n_elements: int 66 | n_bytes: int 67 | data_offset: int 68 | data: npt.NDArray[Any] 69 | field: ReaderField 70 | 71 | 72 | class GGUFReader: 73 | # I - same as host, S - swapped 74 | byte_order: Literal['I', 'S'] = 'I' 75 | alignment: int = GGUF_DEFAULT_ALIGNMENT 76 | data_offset: int 77 | 78 | # Note: Internal helper, API may change. 79 | gguf_scalar_to_np: dict[GGUFValueType, type[np.generic]] = { 80 | GGUFValueType.UINT8: np.uint8, 81 | GGUFValueType.INT8: np.int8, 82 | GGUFValueType.UINT16: np.uint16, 83 | GGUFValueType.INT16: np.int16, 84 | GGUFValueType.UINT32: np.uint32, 85 | GGUFValueType.INT32: np.int32, 86 | GGUFValueType.FLOAT32: np.float32, 87 | GGUFValueType.UINT64: np.uint64, 88 | GGUFValueType.INT64: np.int64, 89 | GGUFValueType.FLOAT64: np.float64, 90 | GGUFValueType.BOOL: np.bool_, 91 | } 92 | 93 | def __init__(self, path: os.PathLike[str] | str, mode: Literal['r', 'r+', 'c'] = 'r'): 94 | self.data = np.memmap(path, mode = mode) 95 | offs = 0 96 | 97 | # Check for GGUF magic 98 | if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC: 99 | raise ValueError('GGUF magic invalid') 100 | offs += 4 101 | 102 | # Check GGUF version 103 | temp_version = self._get(offs, np.uint32) 104 | if temp_version[0] & 65535 == 0: 105 | # If we get 0 here that means it's (probably) a GGUF file created for 106 | # the opposite byte order of the machine this script is running on. 
107 | self.byte_order = 'S' 108 | temp_version = temp_version.newbyteorder(self.byte_order) 109 | version = temp_version[0] 110 | if version not in READER_SUPPORTED_VERSIONS: 111 | raise ValueError(f'Sorry, file appears to be version {version} which we cannot handle') 112 | self.fields: OrderedDict[str, ReaderField] = OrderedDict() 113 | self.tensors: list[ReaderTensor] = [] 114 | offs += self._push_field(ReaderField(offs, 'GGUF.version', [temp_version], [0], [GGUFValueType.UINT32])) 115 | 116 | # Check tensor count and kv count 117 | temp_counts = self._get(offs, np.uint64, 2) 118 | offs += self._push_field(ReaderField(offs, 'GGUF.tensor_count', [temp_counts[:1]], [0], [GGUFValueType.UINT64])) 119 | offs += self._push_field(ReaderField(offs, 'GGUF.kv_count', [temp_counts[1:]], [0], [GGUFValueType.UINT64])) 120 | tensor_count, kv_count = temp_counts 121 | offs = self._build_fields(offs, kv_count) 122 | 123 | # Build Tensor Info Fields 124 | offs, tensors_fields = self._build_tensor_info(offs, tensor_count) 125 | new_align = self.fields.get('general.alignment') 126 | if new_align is not None: 127 | if new_align.types != [GGUFValueType.UINT32]: 128 | raise ValueError('Bad type for general.alignment field') 129 | self.alignment = new_align.parts[-1][0] 130 | padding = offs % self.alignment 131 | if padding != 0: 132 | offs += self.alignment - padding 133 | self.data_offset = offs 134 | self._build_tensors(offs, tensors_fields) 135 | 136 | _DT = TypeVar('_DT', bound = npt.DTypeLike) 137 | 138 | # Fetch a key/value metadata field by key. 139 | def get_field(self, key: str) -> Union[ReaderField, None]: 140 | return self.fields.get(key, None) 141 | 142 | # Fetch a tensor from the list by index. 143 | def get_tensor(self, idx: int) -> ReaderTensor: 144 | return self.tensors[idx] 145 | 146 | def _get( 147 | self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I', 'S', '<'] = None, 148 | ) -> npt.NDArray[Any]: 149 | count = int(count) 150 | itemsize = int(np.empty([], dtype = dtype).itemsize) 151 | end_offs = offset + itemsize * count 152 | arr = self.data[offset:end_offs].view(dtype=dtype)[:count] 153 | if override_order is None: 154 | return arr 155 | return arr.view(arr.dtype.newbyteorder(override_order)) 156 | 157 | def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int: 158 | if field.name in self.fields: 159 | # TODO: add option to generate error on duplicate keys 160 | # raise KeyError(f'Duplicate {field.name} already in list at offset {field.offset}') 161 | 162 | logger.warning(f'Duplicate key {field.name} at offset {field.offset}') 163 | self.fields[field.name + '_{}'.format(field.offset)] = field 164 | else: 165 | self.fields[field.name] = field 166 | return 0 if skip_sum else sum(int(part.nbytes) for part in field.parts) 167 | 168 | def _get_str(self, offset: int) -> tuple[npt.NDArray[np.uint64], npt.NDArray[np.uint8]]: 169 | slen = self._get(offset, np.uint64) 170 | return slen, self._get(offset + 8, np.uint8, slen[0]) 171 | 172 | def _get_field_parts( 173 | self, orig_offs: int, raw_type: int, 174 | ) -> tuple[int, list[npt.NDArray[Any]], list[int], list[GGUFValueType]]: 175 | offs = orig_offs 176 | types: list[GGUFValueType] = [] 177 | gtype = GGUFValueType(raw_type) 178 | types.append(gtype) 179 | # Handle strings. 
180 | if gtype == GGUFValueType.STRING: 181 | sparts: list[npt.NDArray[Any]] = list(self._get_str(offs)) 182 | size = sum(int(part.nbytes) for part in sparts) 183 | return size, sparts, [1], types 184 | # Check if it's a simple scalar type. 185 | nptype = self.gguf_scalar_to_np.get(gtype) 186 | if nptype is not None: 187 | val = self._get(offs, nptype) 188 | return int(val.nbytes), [val], [0], types 189 | # Handle arrays. 190 | if gtype == GGUFValueType.ARRAY: 191 | raw_itype = self._get(offs, np.uint32) 192 | offs += int(raw_itype.nbytes) 193 | alen = self._get(offs, np.uint64) 194 | offs += int(alen.nbytes) 195 | aparts: list[npt.NDArray[Any]] = [raw_itype, alen] 196 | data_idxs: list[int] = [] 197 | for idx in range(alen[0]): 198 | curr_size, curr_parts, curr_idxs, curr_types = self._get_field_parts(offs, raw_itype[0]) 199 | if idx == 0: 200 | types += curr_types 201 | idxs_offs = len(aparts) 202 | aparts += curr_parts 203 | data_idxs += (idx + idxs_offs for idx in curr_idxs) 204 | offs += curr_size 205 | return offs - orig_offs, aparts, data_idxs, types 206 | # We can't deal with this one. 207 | raise ValueError('Unknown/unhandled field type {gtype}') 208 | 209 | def _get_tensor_info_field(self, orig_offs: int) -> ReaderField: 210 | offs = orig_offs 211 | 212 | # Get Tensor Name 213 | name_len, name_data = self._get_str(offs) 214 | offs += int(name_len.nbytes + name_data.nbytes) 215 | 216 | # Get Tensor Dimensions Count 217 | n_dims = self._get(offs, np.uint32) 218 | offs += int(n_dims.nbytes) 219 | 220 | # Get Tensor Dimension Array 221 | dims = self._get(offs, np.uint64, n_dims[0]) 222 | offs += int(dims.nbytes) 223 | 224 | # Get Tensor Encoding Scheme Type 225 | raw_dtype = self._get(offs, np.uint32) 226 | offs += int(raw_dtype.nbytes) 227 | 228 | # Get Tensor Offset 229 | offset_tensor = self._get(offs, np.uint64) 230 | offs += int(offset_tensor.nbytes) 231 | 232 | return ReaderField( 233 | orig_offs, 234 | str(bytes(name_data), encoding = 'utf-8'), 235 | [name_len, name_data, n_dims, dims, raw_dtype, offset_tensor], 236 | [1, 3, 4, 5], 237 | ) 238 | 239 | def _build_fields(self, offs: int, count: int) -> int: 240 | for _ in range(count): 241 | orig_offs = offs 242 | kv_klen, kv_kdata = self._get_str(offs) 243 | offs += int(kv_klen.nbytes + kv_kdata.nbytes) 244 | raw_kv_type = self._get(offs, np.uint32) 245 | offs += int(raw_kv_type.nbytes) 246 | parts: list[npt.NDArray[Any]] = [kv_klen, kv_kdata, raw_kv_type] 247 | idxs_offs = len(parts) 248 | field_size, field_parts, field_idxs, field_types = self._get_field_parts(offs, raw_kv_type[0]) 249 | parts += field_parts 250 | self._push_field(ReaderField( 251 | orig_offs, 252 | str(bytes(kv_kdata), encoding = 'utf-8'), 253 | parts, 254 | [idx + idxs_offs for idx in field_idxs], 255 | field_types, 256 | ), skip_sum = True) 257 | offs += field_size 258 | return offs 259 | 260 | def _build_tensor_info(self, offs: int, count: int) -> tuple[int, list[ReaderField]]: 261 | tensor_fields = [] 262 | for _ in range(count): 263 | field = self._get_tensor_info_field(offs) 264 | offs += sum(int(part.nbytes) for part in field.parts) 265 | tensor_fields.append(field) 266 | return offs, tensor_fields 267 | 268 | def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None: 269 | tensors = [] 270 | tensor_names = set() # keep track of name to prevent duplicated tensors 271 | for field in fields: 272 | _name_len, name_data, _n_dims, dims, raw_dtype, offset_tensor = field.parts 273 | # check if there's any tensor having same name already in 
the list 274 | tensor_name = str(bytes(name_data), encoding = 'utf-8') 275 | if tensor_name in tensor_names: 276 | raise ValueError(f'Found duplicated tensor with name {tensor_name}') 277 | tensor_names.add(tensor_name) 278 | ggml_type = GGMLQuantizationType(raw_dtype[0]) 279 | n_elems = int(np.prod(dims)) 280 | np_dims = tuple(reversed(dims.tolist())) 281 | block_size, type_size = GGML_QUANT_SIZES[ggml_type] 282 | n_bytes = n_elems * type_size // block_size 283 | data_offs = int(start_offs + offset_tensor[0]) 284 | item_type: npt.DTypeLike 285 | if ggml_type == GGMLQuantizationType.F16: 286 | item_count = n_elems 287 | item_type = np.float16 288 | elif ggml_type == GGMLQuantizationType.F32: 289 | item_count = n_elems 290 | item_type = np.float32 291 | elif ggml_type == GGMLQuantizationType.F64: 292 | item_count = n_elems 293 | item_type = np.float64 294 | elif ggml_type == GGMLQuantizationType.I8: 295 | item_count = n_elems 296 | item_type = np.int8 297 | elif ggml_type == GGMLQuantizationType.I16: 298 | item_count = n_elems 299 | item_type = np.int16 300 | elif ggml_type == GGMLQuantizationType.I32: 301 | item_count = n_elems 302 | item_type = np.int32 303 | elif ggml_type == GGMLQuantizationType.I64: 304 | item_count = n_elems 305 | item_type = np.int64 306 | else: 307 | item_count = n_bytes 308 | item_type = np.uint8 309 | np_dims = quant_shape_to_byte_shape(np_dims, ggml_type) 310 | tensors.append(ReaderTensor( 311 | name = tensor_name, 312 | tensor_type = ggml_type, 313 | shape = dims, 314 | n_elements = n_elems, 315 | n_bytes = n_bytes, 316 | data_offset = data_offs, 317 | data = self._get(data_offs, item_type, item_count).reshape(np_dims), 318 | field = field, 319 | )) 320 | self.tensors = tensors 321 | -------------------------------------------------------------------------------- /llmexport/utils/smooth_quantizer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import logging 3 | import gc 4 | import functools 5 | import json 6 | import inspect 7 | from typing import Dict 8 | from tqdm import tqdm 9 | from collections import defaultdict 10 | #from datasets import load_from_disk 11 | 12 | 13 | logging.basicConfig(level=logging.ERROR) 14 | 15 | class SmoothQuantizer: 16 | def __init__( 17 | self, 18 | model, 19 | n_parallel_calib_samples=None, 20 | max_calib_samples=128, 21 | max_calib_seq_len=512, 22 | alpha=0.5, 23 | act_bit=8 24 | ) -> None: 25 | 26 | self.model = model 27 | self.tokenizer = model.tokenizer 28 | #self.w_bit = model.args.quant_bit 29 | self.act_bit = act_bit 30 | self.group_size = model.args.quant_block 31 | self.alpha = alpha 32 | 33 | self.max_calib_samples = max_calib_samples 34 | self.max_calib_seq_len = max_calib_seq_len 35 | self.split = 'train' 36 | self.calib_data = 'wikitext' if model.args.calib_data is None else model.args.calib_data 37 | self.best_device = SmoothQuantizer.get_best_device() 38 | 39 | self.modules = self.model.blocks 40 | if "cpu" != self.best_device: 41 | for idx in range(len(self.modules)): 42 | SmoothQuantizer.to_device(self.modules[idx], "cpu") 43 | 44 | self.act_scales = [{} for _ in range(len(self.modules))] 45 | self.act_dict = [defaultdict(dict) for _ in range(len(self.modules))] 46 | 47 | self.n_parallel_calib_samples = n_parallel_calib_samples 48 | 49 | self.samples = self.init_quant( 50 | n_samples=self.max_calib_samples, 51 | max_seq_len=self.max_calib_seq_len, 52 | ) 53 | 54 | @staticmethod 55 | def get_calib_dataset( 56 | data, 57 | tokenizer=None, 58 | 
n_samples=128, 59 | max_seq_len=512, 60 | split="train", 61 | ): 62 | if isinstance(data, str): 63 | from datasets import load_dataset 64 | if data == "pileval": 65 | dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation") 66 | elif data == "wikitext": 67 | dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split=split) 68 | #dataset = load_from_disk("./wikitest-2-raw-v1") 69 | else: 70 | dataset = load_dataset(data, split=split) 71 | # dataset = dataset.shuffle(seed=42) 72 | else: 73 | raise NotImplementedError( 74 | "Either pass a string to a huggingface dataset" 75 | "that is preprocessed with one sample of text per element" 76 | ) 77 | 78 | samples = [] 79 | dataset = dataset.shuffle(seed=42) 80 | 81 | for i in range(n_samples): 82 | input_ids = tokenizer( 83 | dataset[i]["text"], return_tensors="pt", max_length=max_seq_len, truncation=True 84 | ).input_ids 85 | samples.append(input_ids) 86 | 87 | return samples 88 | 89 | @staticmethod 90 | def get_best_device(): 91 | if torch.backends.mps.is_available(): 92 | return "mps" 93 | elif torch.cuda.is_available(): 94 | return "cuda:0" 95 | else: 96 | return "cpu" 97 | 98 | @staticmethod 99 | def clear_memory(weight=None): 100 | if weight is not None: 101 | del weight 102 | gc.collect() 103 | torch.cuda.empty_cache() 104 | 105 | 106 | def init_quant(self, n_samples=128, max_seq_len=512): 107 | samples = SmoothQuantizer.get_calib_dataset( 108 | data=self.calib_data, 109 | tokenizer=self.tokenizer, 110 | n_samples=n_samples, 111 | max_seq_len=max_seq_len, 112 | split=self.split 113 | ) 114 | return samples 115 | 116 | def _get_first_input(self, sample): 117 | layer_kwargs = {} 118 | self.model.seq_len = sample.numel() 119 | self.model.context_len = sample.numel() - 2 120 | self.model.token_len = 0 121 | inps = self.model.embedding(sample).to(self.best_device) 122 | position_ids = self.model.get_position_ids() 123 | rotary_pos_emb = self.model.rotary(position_ids) 124 | attention_mask = self.model.get_attention_mask() 125 | layer_kwargs["rotary_pos_emb"] = rotary_pos_emb.to(self.best_device) 126 | layer_kwargs["attention_mask"] = attention_mask.to(self.best_device) 127 | del sample 128 | SmoothQuantizer.clear_memory() 129 | return layer_kwargs, inps 130 | 131 | def _get_max_input(self, idx, layer, named_linears): 132 | 133 | def stat_tensor(name, tensor): 134 | hidden_dim = tensor.shape[-1] 135 | tensor = tensor.view(-1, hidden_dim).abs().detach() 136 | comming_max = torch.max(tensor, dim=0)[0].float().cpu() 137 | if name in self.act_scales[idx]: 138 | self.act_scales[idx][name] = torch.max(self.act_scales[idx][name], comming_max) 139 | else: 140 | self.act_scales[idx][name] = comming_max 141 | 142 | def stat_input_hook(m, x, y, name): 143 | if isinstance(x, tuple): 144 | x = x[0] 145 | stat_tensor(name, x) 146 | handles = [] 147 | for name in named_linears: 148 | handles.append( 149 | named_linears[name].register_forward_hook( 150 | functools.partial(stat_input_hook, name=name) 151 | ) 152 | ) 153 | module_kwargs = self._sanitize_kwargs(self.module_kwargs, layer) 154 | 155 | self.inps = self._module_forward(self.inps, layer, module_kwargs) 156 | for h in handles: 157 | h.remove() 158 | 159 | def _sanitize_kwargs(self, inputs_kwargs, module): 160 | """ 161 | Remove the arguments that are not supported in the module's 162 | forward pass to avoid breaking behaviour between different versions 163 | of transformers. 
164 | 165 | Args: 166 | inputs_kwargs (`dict`): 167 | The input dictionary to pass to the model layer 168 | module (`torch.nn.Module`): 169 | Target module to quantize. 170 | """ 171 | module_signature = inspect.signature(module.forward).parameters 172 | sanitized_kwargs = {} 173 | for k, v in inputs_kwargs.items(): 174 | if k in module_signature: 175 | sanitized_kwargs[k] = v 176 | return sanitized_kwargs 177 | 178 | @torch.no_grad() 179 | def _module_forward( 180 | self, x: torch.Tensor, module: torch.nn.Module, module_kwargs: Dict 181 | ) -> torch.Tensor: 182 | 183 | if self.n_parallel_calib_samples is None: 184 | # runs through all samples at once 185 | # print(module, x, module_kwargs); exit(0) 186 | module_output = module(x, **module_kwargs) 187 | if isinstance(module_output, tuple): 188 | module_output = module_output[0] 189 | else: 190 | # memory efficiently runs through all calibration samples 191 | # but only n_parallel_calib_samples at a time 192 | module_output = [] 193 | partitioned_inputs = torch.split(x, self.n_parallel_calib_samples) 194 | for x_partial in partitioned_inputs: 195 | partial_output = module(x_partial, **module_kwargs) 196 | 197 | if isinstance(partial_output, tuple): 198 | partial_output = partial_output[0] 199 | 200 | module_output.append(partial_output.cpu()) 201 | 202 | module_output = torch.cat(module_output, dim=0) 203 | 204 | return module_output 205 | 206 | @staticmethod 207 | def to_device(module, device): 208 | for child_name, child_module in module.named_children(): 209 | if child_name == 'self_attn': 210 | for sub_name, sub_child in child_module.named_children(): 211 | if sub_name != 'config': 212 | sub_child.to(device) 213 | else: 214 | child_module.to(device) 215 | 216 | @staticmethod 217 | def get_named_linears(module): 218 | linears = {} 219 | for child_name, child_module in module.named_children(): 220 | if child_name == 'self_attn': 221 | for name, mod in child_module.named_children(): 222 | if name != 'config': 223 | if isinstance(mod, torch.nn.Linear): 224 | linears[f"{child_name}.{name}"] = mod 225 | else: 226 | for name, mod in child_module.named_modules(): 227 | if isinstance(mod, torch.nn.Linear): 228 | full_name = f"{child_name}.{name}" if name else child_name 229 | linears[full_name] = mod 230 | 231 | return linears 232 | 233 | @staticmethod 234 | @torch.no_grad() 235 | def smooth_ln_fcs(ln, fcs, act_scales, alpha=0.5): 236 | if not isinstance(fcs, list): 237 | fcs = [fcs] 238 | if not SmoothQuantizer.is_allowed_norms(ln): 239 | raise NotImplementedError( 240 | f"LayerNorm {ln} is not supported for smooth quantization." 
241 | ) 242 | for fc in fcs: 243 | assert isinstance(fc, torch.nn.Linear) 244 | assert ln.weight.numel() == fc.in_features == act_scales.numel() 245 | device, dtype = fcs[0].weight.device, fcs[0].weight.dtype 246 | act_scales = act_scales.to(device=device, dtype=dtype) 247 | weight_scales = torch.cat( 248 | [fc.weight.abs().max(dim=0, keepdim=True)[0] for fc in fcs], dim=0 249 | ) 250 | weight_scales = weight_scales.max(dim=0)[0].clamp(min=1e-5) 251 | scales = ( 252 | (act_scales.pow(alpha) / weight_scales.pow(1 - alpha)) 253 | .clamp(min=1e-5) 254 | .to(device) 255 | .to(dtype) 256 | ) 257 | 258 | if 'GemmaRMSNorm' in str(type(ln)): 259 | ln.weight += 1 260 | ln.weight.div_(scales) 261 | ln.weight -= 1 262 | else: 263 | ln.weight.div_(scales) 264 | 265 | if hasattr(ln, "bias") and ln.bias is not None: 266 | ln.bias.div_(scales) 267 | 268 | for fc in fcs: 269 | fc.weight.mul_(scales.view(1, -1)) 270 | 271 | @staticmethod 272 | def is_allowed_norms(op): 273 | if isinstance(op, torch.nn.LayerNorm): 274 | return True 275 | if any(t in str(type(op)) for t in ['LlamaRMSNorm', 'GemmaRMSNorm', 'CohereLayerNorm']): 276 | return True 277 | if "rmsnorm" in str(op.__class__).lower(): 278 | return True 279 | return False 280 | 281 | def _apply_scale(self, idx, module): 282 | attn_ln = module.input_layernorm 283 | qkv = [ 284 | module.self_attn.q_proj, 285 | module.self_attn.k_proj, 286 | module.self_attn.v_proj, 287 | ] 288 | 289 | qkv_input_scales = self.act_scales[idx]["self_attn.q_proj"] 290 | SmoothQuantizer.smooth_ln_fcs(attn_ln, qkv, qkv_input_scales, self.alpha) 291 | 292 | ffn_ln = module.post_attention_layernorm # feed forward norm 293 | fcs = [module.mlp.gate_proj, module.mlp.up_proj] 294 | ffn_input_scales = self.act_scales[idx]["mlp.gate_proj"] 295 | SmoothQuantizer.smooth_ln_fcs(ffn_ln, fcs, ffn_input_scales, self.alpha) 296 | 297 | @torch.no_grad() 298 | def _get_all_static_scales(self, idx, layer, named_linears): 299 | def stat_io_hook(m, x, y, name): 300 | if isinstance(x, tuple): 301 | x = x[0] 302 | if name not in self.act_dict[idx] or "input" not in self.act_dict[idx][name]: 303 | self.act_dict[idx][name]["input"] = x.detach().abs().max().item() 304 | else: 305 | self.act_dict[idx][name]["input"] = max( 306 | self.act_dict[idx][name]["input"], x.detach().abs().max().item() 307 | ) 308 | if isinstance(y, tuple): 309 | y = y[0] 310 | if name not in self.act_dict[idx] or "output" not in self.act_dict[idx][name]: 311 | self.act_dict[idx][name]["output"] = y.detach().abs().max().item() 312 | else: 313 | self.act_dict[idx][name]["output"] = max( 314 | self.act_dict[idx][name]["output"], y.detach().abs().max().item() 315 | ) 316 | handles = [] 317 | for name in named_linears: 318 | handles.append( 319 | named_linears[name].register_forward_hook( 320 | functools.partial(stat_io_hook, name=name) 321 | ) 322 | ) 323 | module_kwargs = self._sanitize_kwargs(self.module_kwargs, layer) 324 | 325 | self.inps = self._module_forward(self.inps, layer, module_kwargs) 326 | for h in handles: 327 | h.remove() 328 | 329 | @torch.no_grad() 330 | def _extract_static_scales(self): 331 | 332 | print("Extracting static scales...") 333 | 334 | scale = 2 ** (self.act_bit - 1) - 1 335 | 336 | for idx in range(len(self.modules)): 337 | for name, input_output in self.act_dict[idx].items(): 338 | self.act_dict[idx][name]['input'] = input_output['input'] / scale 339 | self.act_dict[idx][name]['output'] = input_output['output'] / scale 340 | 341 | def quantize(self): 342 | 343 | for i in 
tqdm(range(len(self.samples)), desc="collecting data and computing scales..."): 344 | sample = self.samples[i] 345 | if sample.numel() == 0: 346 | continue 347 | self.module_kwargs, self.inps = self._get_first_input(sample) 348 | 349 | for idx in range(len(self.modules)): 350 | SmoothQuantizer.to_device(self.modules[idx], self.best_device) 351 | 352 | if self.module_kwargs.get("position_ids", None) is not None: 353 | self.module_kwargs["position_ids"] = self.module_kwargs["position_ids"].to(self.best_device) 354 | 355 | if self.module_kwargs.get("attention_mask", None) is not None: 356 | self.module_kwargs["attention_mask"] = self.module_kwargs["attention_mask"].to(self.best_device) 357 | 358 | named_linears = SmoothQuantizer.get_named_linears(self.modules[idx]) 359 | 360 | self._get_max_input(idx, self.modules[idx], named_linears) 361 | if "cpu" != self.best_device: 362 | SmoothQuantizer.to_device(self.modules[idx], "cpu") 363 | 364 | for idx in tqdm(range(len(self.modules)), desc="applying scales..."): 365 | self._apply_scale(idx, self.modules[idx]) 366 | 367 | for i in tqdm(range(len(self.samples)), desc="collecting static activation scales..."): 368 | sample = self.samples[i] 369 | if sample.numel() == 0: 370 | continue 371 | self.module_kwargs, self.inps = self._get_first_input(sample) 372 | 373 | for idx in range(len(self.modules)): 374 | SmoothQuantizer.to_device(self.modules[idx], self.best_device) 375 | 376 | if self.module_kwargs.get("position_ids", None) is not None: 377 | self.module_kwargs["position_ids"] = self.module_kwargs["position_ids"].to(self.best_device) 378 | 379 | if self.module_kwargs.get("attention_mask", None) is not None: 380 | self.module_kwargs["attention_mask"] = self.module_kwargs["attention_mask"].to(self.best_device) 381 | 382 | named_linears = SmoothQuantizer.get_named_linears(self.modules[idx]) 383 | 384 | self._get_all_static_scales(idx, self.modules[idx], named_linears) 385 | if "cpu" != self.best_device: 386 | SmoothQuantizer.to_device(self.modules[idx], "cpu") 387 | self._extract_static_scales() 388 | 389 | SmoothQuantizer.clear_memory() 390 | for idx in range(len(self.modules)): 391 | SmoothQuantizer.to_device(self.modules[idx], "cpu") 392 | 393 | 394 | 395 | def apply(self, base_path): 396 | mnn = json.load(open(base_path, 'rt')) 397 | mnn['extraTensorDescribe'] = [] 398 | 399 | max_val = 2 ** (self.act_bit - 1) - 1 400 | min_val = -max_val 401 | data_type = 'DT_INT16' 402 | if self.act_bit <= 8: 403 | data_type = 'DT_INT8' 404 | if self.act_bit > 8 and self.act_bit <= 16: 405 | data_type = 'DT_INT16' 406 | 407 | quant_info_dict = {} 408 | 409 | for op in mnn['oplists']: 410 | if op['type'] == 'Convolution' and 'lm_head' not in op['name']: 411 | name_vec = op['name'].split('/') 412 | layer_idx = int(name_vec[1].split('.')[-1]) 413 | layer_name = name_vec[2] + '.' 
+ name_vec[3] 414 | 415 | tensor_input_index = op['inputIndexes'][0] 416 | tensor_output_index = op['outputIndexes'][0] 417 | 418 | if tensor_input_index not in quant_info_dict: 419 | quant_info_dict[tensor_input_index] = { 420 | 'index': tensor_input_index, 421 | 'quantInfo': { 422 | 'scale': self.act_dict[layer_idx][layer_name]['input'], 423 | 'min': min_val, 424 | 'max': max_val, 425 | "type":data_type 426 | } 427 | } 428 | 429 | if tensor_output_index not in quant_info_dict: 430 | quant_info_dict[tensor_output_index] = { 431 | 'index': tensor_output_index, 432 | 'quantInfo': { 433 | 'scale': self.act_dict[layer_idx][layer_name]['output'], 434 | 'min': min_val, 435 | 'max': max_val, 436 | "type":data_type 437 | } 438 | } 439 | mnn['extraTensorDescribe'] = list(quant_info_dict.values()) 440 | 441 | with open(base_path, 'w', encoding='utf-8') as f: 442 | json.dump(mnn, f, ensure_ascii=False, indent=4) 443 | 444 | return base_path 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 455 | -------------------------------------------------------------------------------- /llmexport/utils/token2wav.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn.functional as F 4 | torch.set_printoptions(precision=4, sci_mode=False) 5 | from .model_mapper import ModelMapper 6 | from .transformers import Rotary, Embedding, Decoder, Attention 7 | from .spinner import spinner_run 8 | from .torch_utils import onnx_export 9 | 10 | class Token2Wav(torch.nn.Module): 11 | def __init__(self,token2wav, base): 12 | super().__init__() 13 | self.args = base.args 14 | self.token2wav = token2wav.float() 15 | self.config = base.config 16 | self.llm_config = base.llm_config 17 | self.rope_ratio = 1.0 18 | self.quant_bit = 8 19 | self.load() 20 | 21 | def load(self): 22 | raise NotImplementedError 23 | 24 | def add_token_embeds(self, thinker_embeds): 25 | raise NotImplementedError 26 | 27 | def add_hidden_states(self, thinker_hidden_states): 28 | raise NotImplementedError 29 | 30 | def add_generate_ids(self, token_id): 31 | raise NotImplementedError 32 | 33 | def forward(self, inputs_embeds, attention_mask, position_ids, past_key_values = None): 34 | raise NotImplementedError 35 | 36 | def export(self, onnx_path): 37 | raise NotImplementedError 38 | 39 | class UpSample1d(torch.nn.Module): 40 | def __init__(self, upsample, channel): 41 | super().__init__() 42 | self.ratio = upsample.ratio 43 | self.stride = upsample.stride 44 | self.pad = upsample.pad 45 | self.pad_left = upsample.pad_left 46 | self.pad_right = upsample.pad_right 47 | self.filter = upsample.filter.expand(channel, -1, -1).clone() 48 | self.channel = channel 49 | 50 | def forward(self, x): 51 | x = F.pad(x, (self.pad, self.pad), mode="replicate") 52 | x = self.ratio * F.conv_transpose1d(x, self.filter, stride=self.stride, groups=self.channel) 53 | x = x[..., self.pad_left : -self.pad_right] 54 | return x 55 | 56 | class DownSample1d(torch.nn.Module): 57 | def __init__(self, downsample, channel): 58 | super().__init__() 59 | self.pad_left = downsample.pad_left 60 | self.pad_right = downsample.pad_right 61 | self.stride = downsample.stride 62 | self.filter = downsample.filter.expand(channel, -1, -1).clone() 63 | self.channel = channel 64 | 65 | def forward(self, x): 66 | x = F.pad(x, (self.pad_left, self.pad_right), mode="replicate") 67 | out = F.conv1d(x, self.filter, stride=self.stride, groups=self.channel) 68 | return out 69 | 70 | class 
TorchActivation1d(torch.nn.Module): 71 | def __init__( 72 | self, 73 | activation 74 | ): 75 | super().__init__() 76 | self.act = activation.act 77 | channel = self.act.in_features 78 | self.upsample = UpSample1d(activation.upsample, channel) 79 | self.downsample = DownSample1d(activation.downsample, channel) 80 | 81 | def forward(self, x): 82 | x = self.upsample(x) 83 | x = self.act(x) 84 | x = self.downsample(x) 85 | return x 86 | 87 | # DiT model code 88 | class ECAPA_TDNN(torch.nn.Module): 89 | def __init__(self, spk_encoder): 90 | super().__init__() 91 | self.blocks = spk_encoder.blocks 92 | self.mfa = spk_encoder.mfa 93 | self.asp = spk_encoder.asp 94 | self.fc = spk_encoder.fc 95 | 96 | def forward(self, x): 97 | # Minimize transpose for efficiency 98 | x = x.transpose(1, 2) 99 | xl = [] 100 | for layer in self.blocks: 101 | x = layer(x) 102 | xl.append(x) 103 | # Multi-layer feature aggregation 104 | x = torch.cat(xl[1:], dim=1) 105 | x = self.mfa(x) 106 | # Attentive Statistical Pooling 107 | x = self.asp(x) 108 | # Final linear transformation 109 | x = self.fc(x) 110 | # x = x.squeeze(-1) # avoid If when export to onnx 111 | x = x.permute(0, 2, 1) 112 | return x 113 | 114 | class DitRotary(Rotary): 115 | def __init__(self): 116 | super().__init__(None) 117 | self.model_type = 'dit' 118 | self.rope_theta = 10000 119 | self.rotary_dim = 64 120 | self.theta = 1.0 / (self.rope_theta ** (torch.arange(0, self.rotary_dim, 2, dtype=torch.float32) / self.rotary_dim)) 121 | 122 | def forward(self, position_ids): 123 | position_ids = position_ids.float().reshape(-1, 1) 124 | idx_theta = position_ids * self.theta 125 | rotary_pos_emb = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)]) 126 | rotary_pos_emb = torch.stack((rotary_pos_emb, rotary_pos_emb), dim=-1) 127 | rotary_pos_emb = rotary_pos_emb.reshape(*rotary_pos_emb.shape[:-2], -1) 128 | rotary_pos_emb = rotary_pos_emb.unsqueeze(2).unsqueeze(1) 129 | return rotary_pos_emb 130 | 131 | @staticmethod 132 | def apply_rotary_pos(x, cos, sin): 133 | def rotate_half(x): 134 | x = x.reshape(*x.shape[:-1], -1, 2) 135 | x1, x2 = x.unbind(dim=-1) 136 | x = torch.stack((-x2, x1), dim=-1) 137 | return x.reshape(*x.shape[:-2], -1) 138 | 139 | x = (x * cos) + (rotate_half(x) * sin) 140 | return x 141 | 142 | import math 143 | class DiTAttention(torch.nn.Module): 144 | def __init__(self, attn): 145 | super().__init__() 146 | self.dim = attn.dim 147 | self.heads = attn.heads 148 | self.inner_dim = attn.inner_dim 149 | self.to_q = attn.to_q 150 | self.to_k = attn.to_k 151 | self.to_v = attn.to_v 152 | self.to_out = attn.to_out 153 | 154 | def forward( 155 | self, 156 | x, 157 | rope=None, 158 | mask=None, 159 | ) -> torch.Tensor: 160 | batch_size = x.shape[0] 161 | 162 | # `sample` projections. 
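        # x is [batch, seq, dim]; each projection below produces
        # [batch, seq, inner_dim], which is then viewed as
        # [batch, seq, heads, head_dim] before RoPE and attention.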
163 | query = self.to_q(x) 164 | key = self.to_k(x) 165 | value = self.to_v(x) 166 | 167 | # attention 168 | inner_dim = key.shape[-1] 169 | head_dim = inner_dim // self.heads 170 | query = query.view(batch_size, -1, self.heads, head_dim) 171 | key = key.view(batch_size, -1, self.heads, head_dim) 172 | value = value.view(batch_size, -1, self.heads, head_dim) 173 | # apply rotary position embedding 174 | # Due to training process, only first head is applied with RoPE, will be fixed at next release 175 | cos, sin = rope[0], rope[1] 176 | first_query = query[:, :, :1, :] 177 | first_key = key[:, :, :1, :] 178 | other_query = query[:, :, 1:, :] 179 | other_key = key[:, :, 1:, :] 180 | first_query = DitRotary.apply_rotary_pos(first_query, cos, sin) 181 | first_key = DitRotary.apply_rotary_pos(first_key, cos, sin) 182 | query = torch.concat([first_query, other_query], dim=2) 183 | key = torch.concat([first_key, other_key], dim=2) 184 | 185 | attention_mask = (~mask) * torch.finfo(torch.float32).min 186 | 187 | query = query.transpose(1, 2) 188 | key = key.permute([0, 2, 3, 1]) 189 | value = value.transpose(1, 2) 190 | attn_weights = torch.matmul(query, key) / math.sqrt(head_dim) 191 | attn_weights = attn_weights + attention_mask 192 | attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) 193 | attn_output = torch.matmul(attn_weights, value) 194 | x = attn_output.transpose(1, 2) 195 | 196 | # mask. e.g. inference got a batch with different target durations, mask out the padding 197 | x = x.reshape(batch_size, -1, self.heads * head_dim) 198 | x = x.to(query.dtype) 199 | 200 | # linear proj 201 | x = self.to_out[0](x) 202 | # dropout 203 | x = self.to_out[1](x) 204 | 205 | return x 206 | 207 | class DiTBlock(torch.nn.Module): 208 | def __init__(self, block): 209 | super().__init__() 210 | self.attn_norm = block.attn_norm 211 | self.attn = DiTAttention(block.attn) 212 | self.attn_ = block.attn 213 | self.look_ahead_block = block.look_ahead_block 214 | self.look_backward_block = block.look_backward_block 215 | self.ff_norm = block.ff_norm 216 | self.ff = block.ff 217 | 218 | def forward(self, x, t, rope=None, block_diff=None): 219 | norm, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.attn_norm(x, emb=t) 220 | 221 | attn_output = self.attn( 222 | x=norm, 223 | rope=rope, 224 | mask=(block_diff >= -float(self.look_backward_block)) & (block_diff <= float(self.look_ahead_block)), 225 | ) 226 | 227 | # process attention output for input x 228 | x = x + gate_msa.unsqueeze(1) * attn_output 229 | 230 | norm = self.ff_norm(x) * (1 + scale_mlp[:, None]) + shift_mlp[:, None] 231 | ff_output = self.ff(norm) 232 | x = x + gate_mlp.unsqueeze(1) * ff_output 233 | 234 | return x 235 | 236 | class DitPreprocess(torch.nn.Module): 237 | def __init__(self, dit): 238 | super().__init__() 239 | self.code_embed = dit.code_embed 240 | self.input_proj = dit.proj_in_other 241 | self.rotary_embed = DitRotary() 242 | self.block_size = 24 243 | 244 | def forward(self, cond, spk, code): 245 | max_duration = code.shape[1] * 2 246 | spk = spk.repeat(1, max_duration, 1) 247 | cond = cond.repeat(1, max_duration, 1) 248 | code_embed = self.code_embed(code) 249 | input_embeds = torch.cat((cond, code_embed, spk), dim=-1) 250 | code_embeds = self.input_proj(input_embeds) 251 | position_ids = torch.arange(max_duration) 252 | rope = self.rotary_embed(position_ids) 253 | 254 | block_indices = position_ids // self.block_size 255 | block_i = block_indices.unsqueeze(1) 256 | block_j = 
block_indices.unsqueeze(0) 257 | block_diff = block_j - block_i 258 | mask = block_diff.reshape(1, 1, max_duration, max_duration) 259 | return code_embeds, rope, mask 260 | 261 | class DitWrapper(torch.nn.Module): 262 | def __init__(self, dit): 263 | super().__init__() 264 | self.dit = dit 265 | self.cfg = False 266 | self.time_embed = dit.time_embed 267 | self.code_embed = dit.text_embed 268 | self.rotary_embed = DitRotary() 269 | self.transformer_blocks = torch.nn.ModuleList() 270 | for i in range(len(dit.transformer_blocks)): 271 | self.transformer_blocks.append(DiTBlock(dit.transformer_blocks[i])) 272 | self._create_block_diff = dit._create_block_diff 273 | self.norm_out = dit.norm_out 274 | self.proj_out = dit.proj_out 275 | proj_in = dit.input_embed.proj 276 | oc, ic = proj_in.weight.shape 277 | x_ic = 80 278 | other_ic = ic - x_ic 279 | self.proj_in_x = torch.nn.Linear(x_ic, oc) 280 | self.proj_in_x.weight.data = proj_in.weight[:, :x_ic] 281 | self.proj_in_x.bias = None 282 | self.proj_in_other = torch.nn.Linear(other_ic, oc) 283 | self.proj_in_other.weight.data = proj_in.weight[:, x_ic:] 284 | self.proj_in_other.bias = proj_in.bias 285 | self.spk_encoder = ECAPA_TDNN(dit.input_embed.spk_encoder) 286 | self.preprocess = DitPreprocess(self) 287 | 288 | def spk_encode(self, spk): 289 | return self.spk_encoder(spk) 290 | 291 | def forward(self, x, code_embeds, rope, mask, time): 292 | t = self.time_embed(time) 293 | hidden = self.proj_in_x(x) + code_embeds 294 | for block in self.transformer_blocks: 295 | hidden = block(hidden, t, rope=rope, block_diff=mask) 296 | hidden = self.norm_out(hidden, t) 297 | output = self.proj_out(hidden) 298 | return output 299 | 300 | # end 301 | 302 | class Qwen2_5OmniToken2Wav(Token2Wav): 303 | def __init__(self, token2wav, base): 304 | super().__init__(token2wav, base) 305 | 306 | def load(self): 307 | self.dit = self.token2wav.code2wav_dit_model 308 | self.bigvgan = self.token2wav.code2wav_bigvgan_model 309 | # some code change for export 310 | self.dit = DitWrapper(self.dit) 311 | # bigvgan.resblocks.activations.up/downsample contain conv weight channel by input 312 | for i in range(len(self.bigvgan.resblocks)): 313 | for j in range(len(self.bigvgan.resblocks[i].activations)): 314 | old_act = self.bigvgan.resblocks[i].activations[j] 315 | self.bigvgan.resblocks[i].activations[j] = TorchActivation1d(old_act) 316 | self.bigvgan.activation_post = TorchActivation1d(self.bigvgan.activation_post) 317 | # spk 318 | path = os.path.join(self.args.path, 'spk_dict.pt') 319 | self.speaker_map = {} 320 | for key, value in torch.load(path).items(): 321 | spk = value["cond"].float() 322 | cond = value['ref_mel'].float() 323 | value.pop("ref_mel", None) 324 | value['spk'] = spk.unsqueeze(1) 325 | value['cond'] =self.dit.spk_encode(cond) 326 | self.speaker_map[key] = value 327 | spk = "Chelsie" 328 | self.speaker_params = self.speaker_map[spk] 329 | 330 | def dit_forward(self, code, initial_noise = None): 331 | spk = self.speaker_params["spk"].float() 332 | cond = self.speaker_params["cond"].float() 333 | max_duration = code.shape[1] * 2 334 | code_embeds, rope, mask = self.dit.preprocess(cond, spk, code) 335 | def func(t, x): 336 | pred = self.dit(x=x, code_embeds=code_embeds, rope=rope, mask=mask, time=torch.tensor([t])) 337 | return pred 338 | 339 | steps = 5 340 | t = torch.linspace(0, 1, steps, dtype=cond.dtype) 341 | t = 1 - torch.cos(torch.pi / 2 * t) 342 | 343 | if initial_noise is None: 344 | torch.manual_seed(42) 345 | y0 = torch.randn([1, max_duration, 
80], dtype=cond.dtype) 346 | else: 347 | y0 = initial_noise.clone() 348 | 349 | for t0, t1 in zip(t[:-1], t[1:]): 350 | dt = t1 - t0 351 | k1 = func(t0, y0) 352 | k2 = func(t0 + dt * 1/3, y0 + dt * k1 * 1/3) 353 | k3 = func(t0 + dt * 2/3, y0 + dt * (k2 - k1 * 2/3)) 354 | k4 = func(t1, y0 + dt * (k1 - k2 + k3)) 355 | dy = (k1 + 3 * (k2 + k3) + k4) * dt * 0.125 356 | y0 += dy 357 | 358 | generated_mel = y0.permute(0, 2, 1) 359 | # print('generated_mel = ', generated_mel, generated_mel.shape) 360 | # print('generated_mel.shape = ', generated_mel.shape) 361 | return generated_mel 362 | 363 | @torch.no_grad() 364 | def generate(self, code): 365 | generated_mel = self.dit_forward(code) 366 | waveform = self.bigvgan(generated_mel) 367 | return waveform 368 | 369 | @torch.no_grad() 370 | def generate_stream(self, code): 371 | # Defeine dit streaming parameters 372 | dit_chunk_size = 48 373 | dit_left_context = 24 374 | dit_right_context = 12 375 | dit_left_padding = 0 376 | dit_right_padding = dit_right_context 377 | dit_start_index = 0 378 | dit_mel_len = 0 379 | 380 | # Define vocoder streaming parameters 381 | vocoder_left_context = 10 382 | vocoder_right_context = 10 383 | vocoder_left_pad = 0 384 | vocoder_right_pad = vocoder_right_context 385 | vocoder_upsample_rate = 240 386 | 387 | torch.manual_seed(42) 388 | initial_noise = torch.randn([1, 30000, 80], dtype=torch.float32) 389 | code_buffer = torch.full((1, 0), 0, dtype=torch.long, device=code.device) 390 | mel_buffer = torch.full((1, 80, 0), 0, dtype=torch.float32, device=code.device) 391 | waveform_buffer = torch.full((0,), 0, dtype=torch.float32) 392 | for next_code in code[0]: 393 | code_buffer = torch.cat([code_buffer, next_code.reshape(1, 1)], dim=1) 394 | if code_buffer.size(1) == dit_left_padding + dit_chunk_size + dit_right_padding: 395 | # dit 396 | generated_mel = self.dit_forward(code_buffer, initial_noise[:, dit_start_index: dit_start_index + code_buffer.size(1) * 2]) 397 | generated_mel = generated_mel[:, :, dit_left_padding * 2: -dit_right_padding * 2] 398 | dit_left_padding = dit_left_context 399 | code_buffer = code_buffer[:, -(dit_left_padding + dit_right_padding):] 400 | dit_mel_len += generated_mel.size(-1) 401 | dit_start_index = dit_mel_len - dit_left_context * 2 402 | # bigvgan 403 | mel_buffer = torch.cat([mel_buffer, generated_mel], dim=-1) 404 | waveform = self.bigvgan(mel_buffer) 405 | waveform = waveform[vocoder_left_pad * vocoder_upsample_rate: -vocoder_right_pad * vocoder_upsample_rate] 406 | waveform_buffer = torch.cat([waveform_buffer, waveform], dim=-1) 407 | vocoder_left_pad = vocoder_left_context 408 | mel_buffer = mel_buffer[:, :, -(vocoder_left_pad + vocoder_right_pad):] 409 | 410 | if code_buffer.size(1) > 0: 411 | generated_mel = self.dit_forward(code_buffer, initial_noise[:, dit_start_index: dit_start_index + code_buffer.size(1) * 2]) 412 | generated_mel = generated_mel[:, :, dit_left_padding * 2:] 413 | mel_buffer = torch.cat([mel_buffer, generated_mel], dim=-1) 414 | waveform = self.bigvgan(mel_buffer) 415 | waveform = waveform[vocoder_left_pad * vocoder_upsample_rate:] 416 | waveform_buffer = torch.cat([waveform_buffer, waveform], dim=-1) 417 | 418 | return waveform_buffer 419 | 420 | def export_spk(self): 421 | import MNN.expr as expr 422 | def torch_to_mnn(x): 423 | return expr.const(x.data_ptr(), x.shape) 424 | var_list = [] 425 | for key, value in self.speaker_map.items(): 426 | for k, v in value.items(): 427 | if type(v) is not torch.Tensor: 428 | v = torch.tensor(v) 429 | mnn_var = 
torch_to_mnn(v.contiguous().float()) 430 | mnn_var.name = f'{key}_{k}' 431 | var_list.append(mnn_var) 432 | expr.save(var_list, f'{self.args.dst_path}/spk_dict.mnn') 433 | 434 | @spinner_run(f'export token2wav.predit to ') 435 | def export_predit(self, onnx_path): 436 | cond = torch.randn([1, 1, 128], dtype=torch.float32) 437 | spk = torch.randn([1, 1, 192], dtype=torch.float32) 438 | code = torch.ones([1, 256], dtype=torch.int32) 439 | onnx_model = f'{onnx_path}/predit.onnx' 440 | onnx_export(self.dit.preprocess, (cond, spk, code), 441 | onnx_model, 442 | input_names=['cond', 'spk', 'code'], 443 | output_names=['code_embeds', 'rope', 'mask'], 444 | dynamic_axes={ 445 | "code": { 1: "size" }, 446 | }) 447 | return onnx_model 448 | 449 | @spinner_run(f'export token2wav.dit to ') 450 | def export_dit(self, onnx_path): 451 | x = torch.randn([1, 512, 80], dtype=torch.float32) 452 | code_embeds = torch.randn([1, 512, 1024], dtype=torch.float32) 453 | rope = torch.randn([2, 1, 512, 1, 64], dtype=torch.float32) 454 | mask = torch.ones([1, 1, 512, 512], dtype=torch.int32) 455 | time = torch.tensor([0.0]) 456 | onnx_model = f'{onnx_path}/dit.onnx' 457 | onnx_export(self.dit, (x, code_embeds, rope, mask, time), 458 | onnx_model, 459 | input_names=['x', 'code_embeds', 'rope', 'mask', 'time'], 460 | output_names=['mel'], 461 | dynamic_axes={ 462 | "x": { 1: "size" }, 463 | "code_embeds": { 1: "size" }, 464 | "rope": { 2: "size" }, 465 | "mask": { 2: "size", 3: "size" }, 466 | }) 467 | return onnx_model 468 | 469 | @spinner_run(f'export token2wav.bigvgan to ') 470 | def export_bigvgan(self, onnx_path): 471 | generated_mel = torch.randn([1, 80, 512], dtype=torch.float32) 472 | onnx_model = f'{onnx_path}/bigvgan.onnx' 473 | onnx_export(self.bigvgan, (generated_mel), 474 | onnx_model, 475 | input_names=['generated_mel'], 476 | output_names=['waveform'], 477 | dynamic_axes={ 478 | "generated_mel": { 2: "size" }, 479 | }) 480 | return onnx_model 481 | 482 | def export(self, onnx_path): 483 | self.export_spk() 484 | predit = self.export_predit(onnx_path) 485 | dit = self.export_dit(onnx_path) 486 | bigvgan = self.export_bigvgan(onnx_path) 487 | return predit, dit, bigvgan -------------------------------------------------------------------------------- /llmexport/utils/mnn_converter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import copy 4 | import json 5 | import torch 6 | import numpy as np 7 | 8 | from .torch_utils import quant as torch_quant 9 | from .torch_utils import onnx_export 10 | from tqdm import tqdm 11 | from .spinner import spinner_run 12 | from .gptq import GPTQ 13 | from .lora import LoRA 14 | 15 | EXPORT_LOG = '.export.log' 16 | 17 | class MNNConveter: 18 | def __init__(self, config, weight_ops = None): 19 | self.weight_ops = weight_ops 20 | self.config = config 21 | self.quant_block = config.args.quant_block 22 | self.quant_bit = config.args.quant_bit 23 | self.lm_quant_bit = config.args.lm_quant_bit 24 | self.symmetric = config.args.sym 25 | self.hqq = config.args.hqq 26 | self.mnn_weight_offset = 0 27 | if os.path.exists(config.args.mnnconvert): 28 | self.mnnconvert = config.args.mnnconvert 29 | else: 30 | self.mnnconvert = None 31 | self.lm_weight = None 32 | 33 | def convert(self, convert_args): 34 | sfd = os.dup(1) 35 | log_fp = open(EXPORT_LOG, "a") 36 | log_fd = log_fp.fileno() 37 | # mnnconvert ... 
> .export.log 38 | os.dup2(log_fd, 1) 39 | try: 40 | sys.argv = convert_args 41 | sys.argc = len(convert_args) 42 | if self.mnnconvert is None: 43 | from MNN.tools import mnnconvert 44 | mnnconvert.main() 45 | else: 46 | convert_args[0] = self.mnnconvert 47 | cmd = ' '.join(convert_args) 48 | message = os.popen(cmd).read() 49 | print(message) 50 | sys.argv = [] 51 | finally: 52 | os.dup2(sfd, 1) 53 | log_fp.close() 54 | 55 | @spinner_run(f'convert onnx model to ') 56 | def onnx2mnn(self, onnx_path, mnn_path, args = [], transformer_fuse = True, group_conv_native = False, weight_sym = False, save_external_data = True): 57 | convert_args = [ 58 | '', 59 | '-f', 60 | 'ONNX', 61 | '--modelFile', 62 | str(onnx_path), 63 | '--MNNModel', 64 | str(mnn_path), 65 | '--allowCustomOp' 66 | ] 67 | if transformer_fuse: 68 | convert_args += ['--transformerFuse'] 69 | if group_conv_native: 70 | convert_args += ['--groupConvNative'] 71 | if weight_sym: 72 | convert_args += ['--weightQuantAsymmetric=0'] 73 | if save_external_data: 74 | convert_args += ['--saveExternalData'] 75 | if self.hqq: 76 | convert_args += ['--hqq'] 77 | convert_args += args 78 | self.convert(convert_args) 79 | return mnn_path 80 | 81 | def mnn2json(self, mnn_path, json_path): 82 | convert_args = [ 83 | '', 84 | '-f', 85 | 'MNN', 86 | '--modelFile', 87 | str(mnn_path), 88 | '--JsonFile', 89 | str(json_path) 90 | ] 91 | self.convert(convert_args) 92 | return json_path 93 | 94 | def json2mnn(self, json_path, mnn_path): 95 | convert_args = [ 96 | '', 97 | '-f', 98 | 'JSON', 99 | '--modelFile', 100 | str(json_path), 101 | '--MNNModel', 102 | str(mnn_path) 103 | ] 104 | self.convert(convert_args) 105 | return mnn_path 106 | 107 | def removeDupOps(self, mnn_path): 108 | convert_args = [ 109 | '', 110 | '-f', 111 | 'MNN', 112 | '--modelFile', 113 | str(mnn_path), 114 | '--MNNModel', 115 | str(mnn_path), 116 | '--optimizeLevel=1' 117 | ] 118 | self.convert(convert_args) 119 | return mnn_path 120 | 121 | def export(self, onnx_path, quant_bit = None, quant_block = None, transformer_fuse = True, group_conv_native = False, weight_sym = None): 122 | self.onnx_model_path = onnx_path 123 | self.mnn_name = os.path.basename(onnx_path).replace('.onnx', '.mnn') 124 | self.mnn_model_path = os.path.join(self.config.args.dst_path, self.mnn_name) 125 | self.mnn_weight_path = f'{self.mnn_model_path}.weight' 126 | if self.weight_ops is None: 127 | if quant_bit is None: 128 | quant_bit = self.quant_bit 129 | if quant_block is None: 130 | quant_block = self.quant_block 131 | if weight_sym is None: 132 | weight_sym = self.symmetric 133 | if quant_bit == 16: 134 | quant_args = ['--fp16'] 135 | else: 136 | quant_args = [ 137 | '--weightQuantBits', 138 | str(quant_bit), 139 | '--weightQuantBlock', 140 | str(quant_block) 141 | ] 142 | if quant_bit == 32: 143 | quant_args = [] 144 | self.onnx2mnn(self.onnx_model_path, self.mnn_model_path, quant_args, transformer_fuse=transformer_fuse, group_conv_native=group_conv_native, weight_sym=weight_sym) 145 | else: 146 | mnn_json = f'{self.mnn_model_path}.json' 147 | self.onnx2mnn(self.onnx_model_path, self.mnn_model_path, transformer_fuse=transformer_fuse, group_conv_native=group_conv_native, weight_sym=weight_sym) 148 | self.mnn2json(self.mnn_model_path, mnn_json) 149 | self.rebuild(mnn_json) 150 | self.json2mnn(mnn_json, self.mnn_model_path) 151 | self.removeDupOps(self.mnn_model_path) 152 | self.mnn2json(self.mnn_model_path, mnn_json) 153 | if self.config.args.gptq_path is not None: 154 | self.apply_gptq(mnn_json) 155 | if 
self.config.args.lora_path is not None and self.config.args.lora_split: 156 | self.export_lora(mnn_json) 157 | if self.config.args.smooth: 158 | self.export_smooth_quant(mnn_json) 159 | 160 | def get_experts_graphs(self, experts): 161 | hidden_states = torch.randn((1, self.config.hidden_size)) 162 | layers_num = len(experts) 163 | expert_num = len(experts[0]) 164 | dummy_expert = experts[0][0] 165 | onnx_model = f'{self.config.onnx_path}/expert.onnx' 166 | onnx_export( 167 | dummy_expert, (hidden_states), 168 | onnx_model, 169 | input_names=['hidden_states'], 170 | output_names=['hidden_states']) 171 | mnn_model = f'{onnx_model}.mnn' 172 | mnn_json = f'{mnn_model}.json' 173 | self.onnx2mnn(onnx_model, mnn_model) 174 | self.mnn2json(mnn_model, mnn_json) 175 | expert_graph = json.load(open(mnn_json, 'rt')) 176 | tensors = expert_graph['tensorName'] 177 | nodes = expert_graph['oplists'] 178 | # get input and output 179 | inputs = [] 180 | outputs = [] 181 | for node in nodes: 182 | if node['type'] == 'Input': 183 | inputs.append(node['outputIndexes'][0]) 184 | for output_name in expert_graph['outputName']: 185 | outputs.append(tensors.index(output_name)) 186 | subgraphs = [] 187 | for i in range(layers_num): 188 | for j in range(expert_num): 189 | ijnodes = copy.deepcopy(nodes) 190 | for op in ijnodes: 191 | if op['type'] == 'Extra': 192 | for attr in op['main']['attr']: 193 | if attr['key'] == 'name': 194 | names = attr['s'].split('/') 195 | names[2] = f'{i}_{j}' 196 | attr['s'] = '/'.join(names) 197 | subgraph = { 198 | 'name': f'/expert/{i}_{j}', 199 | 'inputs': inputs, 200 | 'outputs': outputs, 201 | 'tensors': copy.deepcopy(tensors), 202 | 'nodes': ijnodes 203 | } 204 | subgraphs.append(subgraph) 205 | return subgraphs 206 | 207 | 208 | @spinner_run(f'apply gptq to ') 209 | def apply_gptq(self, mnn_json): 210 | GPTQ(self.config.args.gptq_path).apply(mnn_json, self.mnn_weight_path) 211 | return self.mnn_weight_path 212 | 213 | @spinner_run(f'export split lora to ') 214 | def export_lora(self, mnn_json): 215 | lora_model = os.path.join(self.config.args.dst_path, 'lora.mnn') 216 | lora_json = f'{lora_model}.json' 217 | LoRA(self.config.args.lora_path).apply(mnn_json, lora_json) 218 | self.json2mnn(lora_json, lora_model) 219 | if os.path.exists(lora_json): 220 | os.remove(lora_json) 221 | return lora_model 222 | 223 | @spinner_run(f'export smooth quant scale to ') 224 | def export_smooth_quant(self, mnn_json): 225 | self.config.smooth_quantizer.apply(mnn_json) 226 | self.json2mnn(mnn_json, self.mnn_model_path) 227 | return self.mnn_model_path 228 | 229 | @spinner_run(f'quant model weight to ', True) 230 | def rebuild(self, json_path): 231 | mnn_graph = json.load(open(json_path, 'rt')) 232 | has_experts = len(self.config.experts) > 0 233 | if has_experts: 234 | subgraphs = self.get_experts_graphs(self.config.experts) 235 | mnn_graph['subgraphs'] = subgraphs 236 | new_ops = [] 237 | # Load layernorm weight from external 238 | with open(self.mnn_weight_path, 'rb') as f: 239 | for op in tqdm(mnn_graph['oplists'], 'Load LayerNorm data'): 240 | if op['type'] == 'LayerNorm' and 'external' in op['main']: 241 | external = op['main']['external'] 242 | f.seek(external[0]) 243 | op['main']['gamma'] = np.frombuffer(f.read(external[1]), np.float32).tolist() 244 | op['main']['beta'] = np.frombuffer(f.read(external[2]), np.float32).tolist() 245 | del op['main']['external'] 246 | # Rebuild ops 247 | with open(self.mnn_weight_path, 'wb') as self.mnn_weight: 248 | for op in tqdm(mnn_graph['oplists'], 'Quant 
weights'): 249 | if op['type'] == 'Extra' or op['type'] == 'LayerNorm': 250 | new_ops += self.rebuild_op(op, mnn_graph) 251 | else: 252 | new_ops.append(op) 253 | mnn_graph['oplists'] = new_ops 254 | if has_experts and 'subgraphs' in mnn_graph: 255 | for subgraph in tqdm(mnn_graph['subgraphs'], 'Quant subgraphs weights'): 256 | new_subops = [] 257 | for op in subgraph['nodes']: 258 | if op['type'] == 'Extra' or op['type'] == 'LayerNorm': 259 | new_subops += self.rebuild_op(op, subgraph) 260 | else: 261 | new_subops.append(op) 262 | subgraph['nodes'] = new_subops 263 | with open(json_path, 'w', encoding='utf-8') as file: 264 | json.dump(mnn_graph, file, ensure_ascii=False, indent=4) 265 | return self.mnn_weight_path 266 | 267 | def quant(self, weight, quant_bit, quant_block, symmetric): 268 | q_weight, alpha = torch_quant(weight, quant_bit, quant_block, symmetric, self.config.args.awq, self.config.args.hqq) 269 | return q_weight, alpha 270 | 271 | def write_weight(self, data): 272 | if isinstance(data, torch.Tensor): 273 | data = data.numpy() 274 | if isinstance(data, list): 275 | data = np.array(data).astype(np.float32) 276 | return self.mnn_weight.write(data.tobytes()) 277 | 278 | def write_header(self, ic, oc, quant_bit): 279 | dim_num = self.mnn_weight.write(b'\x02') 280 | shape_dtype = np.int16 281 | if oc > 65535 or ic > 65535: 282 | shape_dtype = np.int32 283 | dim_length = self.write_weight(np.array([oc, ic]).astype(shape_dtype)) 284 | offset = 1 << (quant_bit - 1) 285 | weight_map = [i for i in range(-offset, offset)] 286 | if len(weight_map) == 256: 287 | weight_map.insert(0, 0) 288 | else: 289 | weight_map.insert(0, len(weight_map)) 290 | map_length = self.write_weight(np.array(weight_map, dtype=np.int8)) 291 | header_length = dim_num + dim_length + map_length 292 | return header_length, shape_dtype == np.int32 293 | 294 | def build_weight(self, linear, quant_bit, quant_block, symmetric): 295 | ic, oc = linear.in_features, linear.out_features 296 | if quant_bit == 16: 297 | half_weight = linear.weight.data.flatten().half() 298 | weight_len = self.write_weight(half_weight) 299 | alpha_len, q_min, shape_int32, header_len = 0, 0, False, 0 300 | else: 301 | q_min = 1 302 | assert(quant_bit in (1, 2, 4, 8)) 303 | q_weight, alpha = self.quant(linear.weight.data, quant_bit, quant_block, symmetric) 304 | header_len, shape_int32 = self.write_header(ic, oc, quant_bit) 305 | weight_len = self.write_weight(q_weight) + header_len 306 | alpha_len = self.write_weight(alpha) 307 | if linear.bias is not None: 308 | bias = linear.bias.data.flatten().float() 309 | bias_length = self.write_weight(bias) 310 | else: 311 | bias_length = 0 312 | # bias = np.zeros([oc], dtype=np.float32) 313 | # bias_length = self.write_weight(bias) 314 | external = [self.mnn_weight_offset, weight_len, alpha_len, bias_length, 0] 315 | self.mnn_weight_offset += (weight_len + alpha_len + bias_length) 316 | return external, q_min, shape_int32, header_len 317 | 318 | def build_tensor(self, graph, tensor_name): 319 | tensor_key = 'tensorName' 320 | if tensor_key not in graph and 'tensors' in graph: 321 | tensor_key = 'tensors' 322 | tensor_idx = [len(graph[tensor_key])] 323 | graph[tensor_key].append(tensor_name) 324 | return tensor_idx 325 | 326 | def rebuild_op(self, op, graph): 327 | if "type" in op['main']: 328 | op_type = op['main']['type'] 329 | else: 330 | op_type = op['type'] 331 | if op_type == 'FakeLinear': 332 | return self.rebuild_linear(op, graph) 333 | if op_type == 'FusedAttention': 334 | return 
self.rebuild_attnention(op, graph) 335 | if op_type == "LayerNorm": 336 | return self.rebuild_layernorm(op, graph) 337 | if op_type == 'MoE': 338 | return self.rebuild_moe(op, graph) 339 | return None 340 | 341 | def rebuild_moe(self, op, graph): 342 | moe = copy.deepcopy(op) 343 | moe['main'] = { 'attr': moe['main']['attr'][:3] } 344 | moe['type'] = 'MoE' 345 | return [moe] 346 | 347 | def rebuild_layernorm(self, op, graph): 348 | if "gamma" not in op['main'] or "beta" not in op['main']: 349 | return [op] 350 | attr = op['main'] 351 | gamma = attr['gamma'] 352 | beta = attr['beta'] 353 | gamma_len = self.write_weight(gamma) 354 | beta_len = self.write_weight(beta) 355 | del attr['gamma'] 356 | del attr['beta'] 357 | external = [self.mnn_weight_offset, gamma_len, beta_len] 358 | self.mnn_weight_offset += (gamma_len + beta_len) 359 | attr['external'] = external 360 | layernorm_op = { 361 | "name": op['name'], 362 | "inputIndexes": op['inputIndexes'], 363 | "outputIndexes": op['outputIndexes'], 364 | "type": "LayerNorm", 365 | "main_type": "LayerNorm", 366 | "main": attr, 367 | "defaultDimentionFormat": op['defaultDimentionFormat'] 368 | } 369 | return [layernorm_op] 370 | 371 | def rebuild_attnention(self, op, graph): 372 | attrs = op['main']['attr'] 373 | for attr in attrs: 374 | if attr['key'] == 'name': 375 | name = attr['s'] 376 | origin_input = op['inputIndexes'] 377 | origin_output = op['outputIndexes'] 378 | fused_attention = { 379 | "inputIndexes": origin_input, 380 | "main_type": "AttentionParam", 381 | "main": { "kv_cache": True }, 382 | "name": name, 383 | "outputIndexes": origin_output, 384 | "type": "Attention", 385 | "defaultDimentionFormat": "NHWC" 386 | } 387 | return [fused_attention] 388 | 389 | def rebuild_linear(self, op, graph): 390 | attrs = op['main']['attr'] 391 | for attr in attrs: 392 | if attr['key'] == 'name': 393 | name = attr['s'] 394 | elif attr['key'] == "in_features": 395 | ic = attr["i"] 396 | elif attr['key'] == "out_features": 397 | oc = attr["i"] 398 | elif attr['key'] == "has_bias": 399 | has_bias = attr["i"] 400 | linear = self.weight_ops[name] 401 | assert(linear.in_features == ic and 402 | linear.out_features == oc and 403 | (linear.bias is not None) == has_bias) 404 | 405 | is_lm = 'lm_head' in name 406 | quant_bit = self.lm_quant_bit if is_lm else self.quant_bit 407 | block_size = ic if self.quant_block == 0 else self.quant_block 408 | if is_lm and self.lm_weight is not None: 409 | external, q_min, shape_int32, header_len = self.lm_weight 410 | else: 411 | external, q_min, shape_int32, header_len = self.build_weight(linear, quant_bit, self.quant_block, self.symmetric) 412 | if is_lm and self.lm_weight is None: 413 | self.lm_weight = [external, q_min, shape_int32, header_len] 414 | if is_lm and self.config.tie_word_embeddings: 415 | weight_offset = external[0] + header_len 416 | alpha_offset = external[0] + external[1] 417 | alpha_size = external[2] 418 | self.config.llm_config['tie_embeddings'] = [weight_offset, alpha_offset, alpha_size, quant_bit, self.quant_block] 419 | 420 | origin_input = op['inputIndexes'] 421 | origin_output = op['outputIndexes'] 422 | # build new tensor 423 | pre_reshape_name = f'{name}/pre_reshape' 424 | pre_convert_name = f'{name}/pre_convert' 425 | conv_name = name 426 | post_convert_name = f'{name}/post_convert' 427 | post_reshape_name = f'{name}/post_reshape' 428 | pre_reshape_output = self.build_tensor(graph, pre_reshape_name) 429 | pre_convert_output = self.build_tensor(graph, pre_convert_name) 430 | conv_output = 
self.build_tensor(graph, conv_name) 431 | post_convert_output = self.build_tensor(graph, post_convert_name) 432 | # [batch, seq, hidden_size_i] -[Linear] -> [batch, seq, hidden_size_o] 433 | # [1, seq, hidden_size_i] ->[Reshape]-> [seq, hidden_size_i, 1, 1] 434 | # -[Convert]-[Convolution]-[Convert]-> [Reshape] -> [1, seq, hidden_size_o] 435 | pre_reshape = { 436 | "name": pre_reshape_name, 437 | "type": "Reshape", 438 | "inputIndexes": origin_input, 439 | "outputIndexes": pre_reshape_output, 440 | "main_type": "Reshape", 441 | "main": { 442 | "dims": [-1, ic, 1, 1], 443 | "dimType": "NCHW" 444 | }, 445 | "defaultDimentionFormat": "NHWC" 446 | } 447 | pre_convert = { 448 | "name": pre_convert_name, 449 | "inputIndexes": pre_reshape_output, 450 | "outputIndexes": pre_convert_output, 451 | "type": "ConvertTensor", 452 | "main_type": "TensorConvertInfo", 453 | "main": { 454 | "source": "NCHW", 455 | "dest": "NC4HW4" 456 | }, 457 | "defaultDimentionFormat": "NHWC" 458 | } 459 | 460 | if quant_bit == 16: 461 | quanParameter = { "type": 3 } 462 | else: 463 | if self.symmetric: 464 | aMin = 0 465 | readType = 0 466 | else: 467 | aMin = q_min 468 | readType = oc * (ic // block_size) 469 | 470 | quanParameter = { 471 | "quantScale": 1.0, "scaleIn": 0.0, "scaleOut": 0.0, 472 | "useInt32": False, "has_scaleInt": False, "shapeInt32": shape_int32, 473 | "type": 1, "aMaxOrBits": quant_bit, "aMin": aMin, "readType": readType, "weightSize": 0 474 | } 475 | conv_op = { 476 | "name": conv_name, 477 | "inputIndexes": pre_convert_output, 478 | "outputIndexes": conv_output, 479 | "type": "Convolution", 480 | "main_type": "Convolution2D", 481 | "main": { 482 | 'common': { 483 | 'dilateX': 1, 'dilateY': 1, 'strideX': 1, 'strideY': 1, 484 | 'kernelX': 1, 'kernelY': 1, 'padX': 0, 'padY': 0, 'group': 1, 485 | 'outputCount': oc, 'relu': False, 'padMode': 'CAFFE', 486 | 'relu6': False, 'inputCount': ic, 'hasOutputShape': False 487 | }, 488 | "quanParameter": quanParameter, 489 | "external": external 490 | }, 491 | "defaultDimentionFormat": "NHWC" 492 | } 493 | post_convert = { 494 | "name": post_convert_name, 495 | "inputIndexes": conv_output, 496 | "outputIndexes": post_convert_output, 497 | "type": "ConvertTensor", 498 | "main_type": "TensorConvertInfo", 499 | "main": { 500 | "source": "NC4HW4", 501 | "dest": "NCHW" 502 | }, 503 | "defaultDimentionFormat": "NHWC" 504 | } 505 | post_reshape = { 506 | "name": post_reshape_name, 507 | "type": "Reshape", 508 | "inputIndexes": post_convert_output, 509 | "outputIndexes": origin_output, 510 | "main_type": "Reshape", 511 | "main": { 512 | "dims": [1, -1, oc], 513 | "dimType": "NCHW" 514 | }, 515 | "defaultDimentionFormat": "NHWC" 516 | } 517 | if name.startswith('/expert/'): 518 | post_reshape['main']['dims'] = [-1, oc] 519 | return [pre_reshape, pre_convert, conv_op, post_convert, post_reshape] 520 | -------------------------------------------------------------------------------- /llmexport/gguf2mnn.py: -------------------------------------------------------------------------------- 1 | import os 2 | from gguf import gguf_reader 3 | from gguf import constants 4 | import numpy 5 | import json 6 | import argparse 7 | 8 | from .utils.mnn_utils import * 9 | 10 | class TokenContent: 11 | def __init__(self): 12 | self.token_type = -1 13 | self.spec_ids = [] 14 | self.names = [] 15 | self.stop_ids = [] 16 | self.pre_ids = [] 17 | self.token_num = 0 18 | 19 | def load_token(reader): 20 | content = TokenContent() 21 | model = 
reader.fields['tokenizer.ggml.model'].parts[4].tobytes().decode('utf-8') 22 | field = reader.fields['tokenizer.ggml.token_type'] 23 | valids = [] 24 | for i in range(0, len(field.data)): 25 | p = field.data[i] 26 | if field.parts[p] == 1: 27 | #normal 28 | valids.append(i) 29 | elif field.parts[p] == 3 or field.parts[p] == 4: 30 | valids.append(i) 31 | content.spec_ids.append(i) 32 | tokens = reader.fields['tokenizer.ggml.tokens'] 33 | stopes = ["<|eot_id|>", "<|im_end|>", "<|end|>", "", "<|endoftext|>", "<|eom_id|>", ""] 34 | 35 | for i in valids: 36 | p = tokens.data[i] 37 | tok = tokens.parts[p].tobytes().decode('utf-8') 38 | if tok in stopes: 39 | content.stop_ids.append(i) 40 | content.names.append(tok) 41 | content.token_num = len(content.names) 42 | if model == "gpt2": 43 | # bpe -> HUGGINGFACE 44 | content.token_type = 3 45 | # load merge 46 | merges = reader.fields['tokenizer.ggml.merges'] 47 | for i in range(0, len(merges.data)): 48 | p = merges.data[i] 49 | tok = merges.parts[p].tobytes().decode('utf-8') 50 | content.names.append(tok) 51 | elif model == 'llama': 52 | content.token_type = 1 53 | else: 54 | print("[Error] Not support token type: , you can try download tokenizer.txt from old MNN LLM model", model) 55 | return content 56 | 57 | def write_token_file(filename, token): 58 | with open(filename, 'w') as f: 59 | f.write("430 %d\n" %token.token_type) 60 | f.write("%d " %(len(token.spec_ids)) + '%d 0\n' %(len(token.stop_ids))) 61 | l = "" 62 | for i in token.spec_ids: 63 | l += "%d " %i 64 | for i in token.stop_ids: 65 | l += "%d " %i 66 | l+='\n' 67 | f.write(l) 68 | if token.token_type == 3: 69 | merge_num = len(token.names) - token.token_num 70 | f.write("%d " %token.token_num + "%d\n" %merge_num) 71 | else: 72 | f.write("%d\n" %token.token_num) 73 | for name in token.names: 74 | f.write(name + '\n') 75 | return 76 | 77 | def shuffle_weight_int4(weight_main): 78 | # shuffle weight 79 | block_number = weight_main.shape[0] 80 | half_block_size = weight_main.shape[1] 81 | weight_main_low = weight_main % 16 82 | weight_main_high = weight_main // 16 83 | weight_main = numpy.concatenate([weight_main_low, weight_main_high], axis = 1).reshape([block_number, half_block_size, 2]) 84 | weight_main_low = weight_main[:, :, 1] 85 | weight_main_high = weight_main[:, :, 0] 86 | weight_main = weight_main_low + weight_main_high * 16 87 | return weight_main 88 | 89 | # const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; 90 | # const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; 91 | 92 | # const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; 93 | # const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; 94 | 95 | # y[i*qk + j + 0 ] = x0*d; 96 | # y[i*qk + j + qk/2] = x1*d; 97 | 98 | def shuffle_weight_int5(weight, repack = True): 99 | block_number = weight.shape[0] 100 | qh = weight[:, 0:4] 101 | qs = weight[:, 4:20] 102 | x0 = qs & 0x0F 103 | x1 = qs >> 4 104 | qh = numpy.frombuffer(qh.tobytes(), numpy.uint32).reshape([block_number, 1]) 105 | mask_0 = [] 106 | mask_1 = [] 107 | for i in range(0, 16): 108 | mask_0.append(((qh >> i)<< 4) & 0x10) 109 | mask_1.append(((qh >> (i+12))) & 0x10) 110 | mask_0 = numpy.concatenate(mask_0, axis=1) 111 | mask_1 = numpy.concatenate(mask_1, axis=1) 112 | x0 = x0 + mask_0 113 | x1 = x1 + mask_1 114 | x = numpy.concatenate([x0, x1], axis=1) 115 | if repack: 116 | return repack_low_bits(x, 5, 32) 117 | return x 118 | 119 | def extract_tensor_as_int8(weight): 120 | ic = int(weight.shape[0]) 121 | oc = int(weight.shape[1]) 122 | if weight.tensor_type == 
constants.GGMLQuantizationType.Q6_K: 123 | block_size, type_size = constants.GGML_QUANT_SIZES[weight.tensor_type] 124 | block_number = oc * ic // block_size 125 | weight = weight.data.reshape([oc * ic // block_size, type_size]) 126 | scale_int8 = weight[:, 192:208] 127 | scale_half = weight[:, 208:210] 128 | scale_int8 = numpy.frombuffer(scale_int8.tobytes(), numpy.int8).astype(numpy.float32).reshape([block_number, 16, 1]) 129 | scale_half = numpy.frombuffer(scale_half.tobytes(), numpy.float16).astype(numpy.float32).reshape([block_number, 1, 1]) 130 | weight_scale = scale_half * scale_int8 131 | 132 | # Extract to int8 133 | ql = weight[:, 0:128] 134 | qh = weight[:, 128:192] 135 | 136 | qall = [] 137 | for i in range(256): 138 | qall.append(None) 139 | for nnp in range(0, 2): 140 | for l in range(0, 32): 141 | q1 = ((ql[:, l + 0 + 64 * nnp] & 0xF) | (((qh[:, l + 32*nnp] >> 0) & 3) << 4)) 142 | q2 = ((ql[:, l + 32 + 64 * nnp] & 0xF) | (((qh[:, l + 32*nnp] >> 2) & 3) << 4)) 143 | q3 = ((ql[:, l + 0 + 64 * nnp] >> 4) | (((qh[:, l + 32*nnp] >> 4) & 3) << 4)) 144 | q4 = ((ql[:, l + 32 + 64 * nnp] >> 4) | (((qh[:, l + 32*nnp] >> 6) & 3) << 4)) 145 | qall[l + 0 + 128 * nnp] = q1.reshape([block_number, 1]) 146 | qall[l + 32 + 128 * nnp] = q2.reshape([block_number, 1]) 147 | qall[l + 64 + 128 * nnp] = q3.reshape([block_number, 1]) 148 | qall[l + 96 + 128 * nnp] = q4.reshape([block_number, 1]) 149 | q_raw = numpy.concatenate(qall, axis = 1) 150 | return q_raw, weight_scale, 16, 6 151 | elif weight.tensor_type == constants.GGMLQuantizationType.Q5_0: 152 | block_size, type_size = constants.GGML_QUANT_SIZES[weight.tensor_type] 153 | weight = weight.data.reshape([oc * ic // block_size, type_size]) 154 | # Seperate Scale and Bias 155 | weight_main = weight[:, 2:type_size] 156 | weight_main = shuffle_weight_int5(weight_main, False) 157 | weight_scale = weight[:, 0:2] 158 | weight_scale = numpy.frombuffer(weight_scale.tobytes(), numpy.float16).astype(numpy.float32) 159 | return weight_main, weight_scale, 32, 5 160 | return None 161 | 162 | def write_external_weight(weight, mnn_weight_file, mnn_weight_offset): 163 | ic = int(weight.shape[0]) 164 | oc = int(weight.shape[1]) 165 | bias_length = oc * 4 166 | conv = {} 167 | block_size = 0 168 | block_number = 0 169 | quant_bit = 0 170 | tie_embedding = False 171 | header_len = 0 172 | if weight.tensor_type == constants.GGMLQuantizationType.F16: 173 | # FP16 174 | quan = {} 175 | quan['type'] = 3 176 | conv['quanParameter'] = quan 177 | rawbytes = weight.data.tobytes() 178 | weightlen = mnn_weight_file.write(rawbytes) 179 | external = [mnn_weight_offset, weightlen, 0, bias_length, 0] 180 | conv['external'] = external 181 | mnn_weight_offset += weightlen 182 | tie_embedding = True 183 | quant_bit = 16 184 | elif weight.tensor_type == constants.GGMLQuantizationType.F32: 185 | # FP16 186 | quan = {} 187 | quan['type'] = 3 188 | conv['quanParameter'] = quan 189 | rawbytes = weight.data.astype(numpy.float16).tobytes() 190 | weightlen = mnn_weight_file.write(rawbytes) 191 | external = [mnn_weight_offset, weightlen, 0, bias_length, 0] 192 | conv['external'] = external 193 | mnn_weight_offset += weightlen 194 | elif weight.tensor_type == constants.GGMLQuantizationType.Q4_0: 195 | tie_embedding = True 196 | quant_bit = 4 197 | block_size, type_size = constants.GGML_QUANT_SIZES[weight.tensor_type] 198 | block_number = oc * ic // block_size 199 | weight = weight.data.reshape([block_number, type_size]) 200 | # Seperate Scale and Bias 201 | weight_main = weight[:, 
2:type_size] 202 | weight_scale = weight[:, 0:2] 203 | weight_scale = numpy.frombuffer(weight_scale.tobytes(), numpy.float16).astype(numpy.float32) 204 | 205 | # shuffle weight 206 | weight_main = shuffle_weight_int4(weight_main) 207 | conv, header_len, mnn_weight_offset = write_quant_parameters(quant_bit, False, mnn_weight_file, ic, oc, weight_main, weight_scale, mnn_weight_offset) 208 | elif weight.tensor_type == constants.GGMLQuantizationType.Q4_1: 209 | quant_bit = 4 210 | tie_embedding = True 211 | block_size, type_size = constants.GGML_QUANT_SIZES[weight.tensor_type] 212 | block_number = oc * ic // block_size 213 | weight = weight.data.reshape([oc * ic // block_size, type_size]) 214 | # Seperate Scale and Bias 215 | weight_main = weight[:, 4:type_size] 216 | 217 | # shuffle weight 218 | weight_main = shuffle_weight_int4(weight_main); 219 | 220 | weight_scale = weight[:, 0:2] 221 | weight_bias = weight[:, 2:4] 222 | weight_scale = numpy.frombuffer(weight_scale.tobytes(), numpy.float16).reshape((block_number, 1)) 223 | weight_bias = numpy.frombuffer(weight_bias.tobytes(), numpy.float16).reshape((block_number, 1)) 224 | scalebias = numpy.concatenate((weight_bias, weight_scale), axis=1).astype(numpy.float32) 225 | 226 | conv, header_len, mnn_weight_offset = write_quant_parameters(quant_bit, True, mnn_weight_file, ic, oc, weight_main, scalebias, mnn_weight_offset) 227 | elif weight.tensor_type == constants.GGMLQuantizationType.Q4_K: 228 | quant_bit = 4 229 | tie_embedding = True 230 | block_size, type_size = constants.GGML_QUANT_SIZES[weight.tensor_type] 231 | block_number = oc * ic // block_size 232 | weight = weight.data.reshape([oc * ic // block_size, type_size]) 233 | # Seperate Scale and Bias 234 | d = weight[:, 0:2] 235 | dmin = weight[:, 2:4] 236 | scales = weight[:, 4:16] 237 | weight_main = weight[:, 16:type_size] 238 | 239 | # shuffle weight 240 | weight_main = weight_main.reshape((block_number * 4, 32)) 241 | weight_main = shuffle_weight_int4(weight_main) 242 | 243 | # Compute Scale 244 | d = numpy.frombuffer(d.tobytes(), numpy.float16).reshape((block_number, 1)).astype(numpy.float32) 245 | dmin = numpy.frombuffer(dmin.tobytes(), numpy.float16).reshape((block_number, 1)).astype(numpy.float32) 246 | 247 | def get_scale_min_k4(j, q): 248 | if j < 4: 249 | d = q[:, j] & 63 250 | m = q[:, j + 4] & 63 251 | else: 252 | d = (q[:, j+4] & 0xF) | ((q[:, j-4] >> 6) << 4) 253 | m = (q[:, j+4] >> 4) | ((q[:, j-0] >> 6) << 4) 254 | return d, m 255 | dgroup=[] 256 | mgroup=[] 257 | for j in range(0, 8): 258 | dgroup.append(None) 259 | mgroup.append(None) 260 | for j in range(0, 8): 261 | vd, vm = get_scale_min_k4(j, scales) 262 | vd = vd.reshape((block_number, 1)) 263 | vm = vm.reshape((block_number, 1)) 264 | vd = vd.astype(numpy.float32) * d 265 | vm = vm.astype(numpy.float32) * dmin 266 | dgroup[j] = vd 267 | mgroup[j] = -vm 268 | weight_scale = numpy.concatenate(dgroup, -1).reshape((block_number, 8, 1)) 269 | weight_bias = numpy.concatenate(mgroup, -1).reshape((block_number, 8, 1)) 270 | scalebias = numpy.concatenate((weight_bias, weight_scale), axis=-1).astype(numpy.float32) 271 | 272 | 273 | block_size = 32 274 | 275 | conv, header_len, mnn_weight_offset = write_quant_parameters(quant_bit, True, mnn_weight_file, ic, oc, weight_main, scalebias, mnn_weight_offset) 276 | 277 | elif weight.tensor_type == constants.GGMLQuantizationType.Q8_0: 278 | quant_bit = 8 279 | tie_embedding = True 280 | block_size, type_size = constants.GGML_QUANT_SIZES[weight.tensor_type] 281 | weight = 
weight.data.reshape([oc * ic // block_size, type_size]) 282 | # Separate Scale and Bias 283 | weight_main = weight[:, 2:type_size] 284 | weight_scale = weight[:, 0:2] 285 | weight_scale = numpy.frombuffer(weight_scale.tobytes(), numpy.float16).astype(numpy.float32) 286 | weight_main = numpy.frombuffer(weight_main.tobytes(), numpy.int8).astype(numpy.int16) + 128 287 | weight_main = weight_main.astype(numpy.uint8) 288 | conv, header_len, mnn_weight_offset = write_quant_parameters(quant_bit, False, mnn_weight_file, ic, oc, weight_main, weight_scale, mnn_weight_offset) 289 | 290 | elif weight.tensor_type == constants.GGMLQuantizationType.Q5_0: 291 | tie_embedding = False 292 | block_size, type_size = constants.GGML_QUANT_SIZES[weight.tensor_type] 293 | weight = weight.data.reshape([oc * ic // block_size, type_size]) 294 | # Separate Scale and Bias 295 | weight_main = weight[:, 2:type_size] 296 | weight_main = shuffle_weight_int5(weight_main) 297 | weight_scale = weight[:, 0:2] 298 | weight_scale = numpy.frombuffer(weight_scale.tobytes(), numpy.float16).astype(numpy.float32) 299 | quant_bit = 5 300 | conv, header_len, mnn_weight_offset = write_quant_parameters(quant_bit, False, mnn_weight_file, ic, oc, weight_main, weight_scale, mnn_weight_offset) 301 | 302 | elif weight.tensor_type == constants.GGMLQuantizationType.Q5_1: 303 | tie_embedding = False 304 | block_size, type_size = constants.GGML_QUANT_SIZES[weight.tensor_type] 305 | block_number = oc * ic // block_size 306 | weight = weight.data.reshape([oc * ic // block_size, type_size]) 307 | # Separate Scale and Bias 308 | weight_main = weight[:, 4:type_size] 309 | weight_main = shuffle_weight_int5(weight_main) 310 | weight_scale = weight[:, 0:2] 311 | weight_bias = weight[:, 2:4] 312 | weight_scale = numpy.frombuffer(weight_scale.tobytes(), numpy.float16).reshape((block_number, 1)) 313 | weight_bias = numpy.frombuffer(weight_bias.tobytes(), numpy.float16).reshape((block_number, 1)) 314 | weight_scale = numpy.concatenate((weight_bias, weight_scale), axis=1).astype(numpy.float32) 315 | quant_bit = 5 316 | conv, header_len, mnn_weight_offset = write_quant_parameters(quant_bit, True, mnn_weight_file, ic, oc, weight_main, weight_scale, mnn_weight_offset) 317 | elif weight.tensor_type == constants.GGMLQuantizationType.Q6_K: 318 | block_size, type_size = constants.GGML_QUANT_SIZES[weight.tensor_type] 319 | block_number = oc * ic // block_size 320 | q_raw, weight_scale, block_size, bits = extract_tensor_as_int8(weight) 321 | weight_main = repack_low_bits(q_raw, 6, 256) 322 | quant_bit = 6 323 | conv, header_len, mnn_weight_offset = write_quant_parameters(quant_bit, False, mnn_weight_file, ic, oc, weight_main, weight_scale, mnn_weight_offset) 324 | 325 | else: 326 | print('Unsupported tensor type: ', weight.tensor_type) 327 | print(weight.data.shape, ic, oc) 328 | assert(False) 329 | return mnn_weight_offset, conv, tie_embedding, block_size, quant_bit, header_len 330 | 331 | def convert(args): 332 | gguf = args.gguf 333 | mnn_dir = args.mnn_dir 334 | src_json = os.path.join(mnn_dir, "llm.mnn.json") 335 | dst_json = os.path.join(mnn_dir, "llm.mnn_new.json") 336 | 337 | mnn, opmap, convs, _, __ = load_mnn(src_json) 338 | llm_config = {} 339 | with open(os.path.join(mnn_dir, "llm_config.json")) as f: 340 | llm_config = json.load(f) 341 | 342 | reader = gguf_reader.GGUFReader(gguf) 343 | if args.load_token: 344 | write_token_file(os.path.join(mnn_dir, "tokenizer.txt"), load_token(reader)) 345 | arch = 
reader.fields['general.architecture'].parts[4].tobytes().decode('utf-8') 346 | print("Arch:", arch) 347 | tensormap = {} 348 | for t in reader.tensors: 349 | tensormap[t.name] = t 350 | 351 | mnn_weight_file = open(os.path.join(mnn_dir, "llm.mnn.weight"), "wb") 352 | mnn_weight_offset = 0 353 | if 'tie_embeddings' in llm_config: 354 | del llm_config['tie_embeddings'] 355 | for name in opmap: 356 | op = opmap[name] 357 | print('Load layernorm: ', name) 358 | if op['type'] == 'LayerNorm': 359 | weight_tensor = tensormap[name+'.weight'] 360 | layernorm = op['main'] 361 | layernorm['gamma'] = weight_tensor.data.tolist() 362 | if name+'.bias' in tensormap: 363 | layernorm['beta'] = tensormap[name+'.bias'].data.tolist() 364 | else: 365 | layernorm['beta'] = [0.0] * len(layernorm['gamma']) 366 | continue 367 | for op in convs: 368 | conv = op['main'] 369 | name = op['name'] 370 | if 'quanParameter' in conv: 371 | del conv['quanParameter'] 372 | weight_name = name+'.weight' 373 | weight = None 374 | tie_embedding = False 375 | ichannel = conv['common']['inputCount'] 376 | ochannel = conv['common']['outputCount'] 377 | if name == 'output': 378 | print('hidden size: ', ichannel) 379 | llm_config['hidden_size'] = ichannel 380 | if weight_name in tensormap: 381 | weight = tensormap[weight_name] 382 | elif name == 'output': 383 | weight = tensormap['token_embd.weight'] 384 | tie_embedding = True 385 | else: 386 | print("Error: Can't find weight for " + name) 387 | assert(False) 388 | print('Load Convolution: ', name, ", weight type: ", weight.tensor_type) 389 | if weight.shape[0] != ichannel or weight.shape[1] != ochannel: 390 | print(name, ", weight not match: ", ichannel, ", ", ochannel, " : ", weight.shape, ", reset to ", weight.shape) 391 | ichannel = int(weight.shape[0]) 392 | ochannel = int(weight.shape[1]) 393 | conv['common']['inputCount'] = ichannel 394 | conv['common']['outputCount'] = ochannel 395 | # Change post reshape for convolution 396 | outputIndex = op['outputIndexes'][0] 397 | for subop in mnn["oplists"]: 398 | if 'inputIndexes' not in subop: 399 | continue 400 | if subop['inputIndexes'][0] == outputIndex and subop['type'] == 'ConvertTensor': 401 | outputIndex = subop['outputIndexes'][0] 402 | break 403 | for subop in mnn["oplists"]: 404 | if 'inputIndexes' not in subop: 405 | continue 406 | if subop['inputIndexes'][0] == outputIndex and subop['type'] == 'Reshape': 407 | subop['main']['dims'][2] = ochannel 408 | break 409 | mnn_weight_offset, conv_new, can_tie_embedding, block_size, quant_bit, header_len = write_external_weight(weight, mnn_weight_file, mnn_weight_offset) 410 | if not can_tie_embedding: 411 | tie_embedding = False 412 | conv['quanParameter'] = conv_new['quanParameter'] 413 | conv['external'] = conv_new['external'] 414 | 415 | bias = None 416 | bias_name = name + '.bias' 417 | if bias_name in tensormap: 418 | if tensormap[bias_name].tensor_type > 1: 419 | print('Error: Bias is quant: ', tensormap[bias_name].tensor_type) 420 | assert(False) 421 | bias = tensormap[bias_name].data.astype(numpy.float32) 422 | else: 423 | bias = numpy.zeros(ochannel).astype(numpy.float32) 424 | mnn_weight_offset += mnn_weight_file.write(bias.tobytes()) 425 | if tie_embedding: 426 | external = conv['external'] 427 | weight_offset = external[0] + header_len 428 | alpha_offset = external[0] + external[1] 429 | alpha_size = external[2] 430 | llm_config['tie_embeddings'] = [weight_offset, alpha_offset, alpha_size, quant_bit, 32] 431 | embedding_file = os.path.join(mnn_dir, "embeddings_bf16.bin") 
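# Note: 'tie_embeddings' records [weight_offset, alpha_offset, alpha_size, quant_bit, block_size],
# letting the runtime read the shared lm_head/token_embd weight directly from llm.mnn.weight;
# when it is absent, token_embd.weight is exported below, either dequantized to bf16
# (embeddings_bf16.bin) or appended to llm.mnn.weight in quantized form.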
432 | 433 | embeding_in_weight = True 434 | if 'tie_embeddings' not in llm_config: 435 | # Need write embedding 436 | weight = tensormap['token_embd.weight'] 437 | print("Embedding type: ", weight.tensor_type) 438 | if weight.tensor_type <= 1: 439 | embeding_in_weight = False 440 | print("Write ", embedding_file) 441 | weight = weight.data.astype(numpy.float32) 442 | weight = numpy.frombuffer(weight.tobytes(), numpy.uint32) >> 16 443 | weight = weight.astype(numpy.uint16) 444 | with open(embedding_file, 'wb') as f: 445 | f.write(weight.tobytes()) 446 | elif weight.tensor_type == constants.GGMLQuantizationType.Q8_0 or weight.tensor_type == constants.GGMLQuantizationType.Q4_0 or weight.tensor_type == constants.GGMLQuantizationType.Q4_1: 447 | mnn_weight_offset, conv, can_tie_embedding, block_size, quant_bit, header_len = write_external_weight(weight, mnn_weight_file, mnn_weight_offset) 448 | external = conv['external'] 449 | weight_offset = external[0] + header_len 450 | alpha_offset = external[0] + external[1] 451 | alpha_size = external[2] 452 | llm_config['tie_embeddings'] = [weight_offset, alpha_offset, alpha_size, quant_bit, block_size] 453 | elif weight.tensor_type == constants.GGMLQuantizationType.Q6_K or weight.tensor_type == constants.GGMLQuantizationType.Q5_0: 454 | q_raw, weight_scale, block_size, bits = extract_tensor_as_int8(weight) 455 | # embeding_in_weight = False 456 | ic = int(weight.shape[0]) 457 | oc = int(weight.shape[1]) 458 | offset = (1 << (bits - 1)) 459 | q_raw = repack_low_bits(q_raw, 8, q_raw.shape[1]) 460 | q_raw = q_raw + (128-offset) 461 | quant_bit = 8 462 | conv, header_len, mnn_weight_offset = write_quant_parameters(quant_bit, False, mnn_weight_file, ic, oc, q_raw, weight_scale, mnn_weight_offset) 463 | external = conv['external'] 464 | weight_offset = external[0] + header_len 465 | alpha_offset = external[0] + external[1] 466 | alpha_size = external[2] 467 | llm_config['tie_embeddings'] = [weight_offset, alpha_offset, alpha_size, quant_bit, block_size] 468 | else: 469 | assert(False) 470 | 471 | if embeding_in_weight: 472 | if os.path.exists(embedding_file): 473 | os.remove(embedding_file) 474 | 475 | mnn_weight_file.close() 476 | with open(dst_json, 'w') as f: 477 | f.write(json.dumps(mnn, indent=4)) 478 | with open(os.path.join(mnn_dir, "llm_config.json"), 'w') as f: 479 | f.write(json.dumps(llm_config, indent=4)) 480 | 481 | convert_args = [ 482 | '', 483 | '-f', 484 | 'JSON', 485 | '--modelFile', 486 | dst_json, 487 | '--MNNModel', 488 | os.path.join(mnn_dir, 'llm.mnn'), 489 | ] 490 | 491 | print(convert_args) 492 | from MNN.tools import mnnconvert 493 | mnnconvert.convert(convert_args) 494 | os.remove(dst_json) 495 | 496 | if __name__ == '__main__': 497 | parser = argparse.ArgumentParser(description='gguf2mnn', formatter_class=argparse.RawTextHelpFormatter) 498 | parser.add_argument('--gguf', type=str, required=True,help='src gguf model') 499 | parser.add_argument('--mnn_dir', type=str, required=True,help='mnn llm dir') 500 | parser.add_argument('--load_token', type=bool, default = False, help='Override tokenizer.txt from gguf') 501 | args = parser.parse_args() 502 | import time 503 | sta = time.time() 504 | convert(args) 505 | fin = time.time() 506 | print("Cost time ", fin - sta, " s") 507 | --------------------------------------------------------------------------------
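A minimal usage sketch for gguf2mnn.py (paths are placeholders; assumes the llmexport package above and the gguf reader package are installed). The script expects an MNN export directory that already contains llm.mnn.json and llm_config.json, and rewrites llm.mnn, llm.mnn.weight and llm_config.json from the GGUF tensors. It can be driven through its argparse CLI (--gguf, --mnn_dir, --load_token) or called directly:

import argparse
from llmexport import gguf2mnn   # assumes the llmexport package from this repository is installed

# Placeholder paths: a source GGUF file and an MNN export directory produced by llmexport.
args = argparse.Namespace(gguf="./model.gguf", mnn_dir="./mnn_model", load_token=False)
gguf2mnn.convert(args)           # writes llm.mnn.weight, llm.mnn and an updated llm_config.json into mnn_dir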