├── LICENSE
├── README.md
├── .gitignore
├── convert_diffusion_to_gguf.py
└── custom_quants.py


/LICENSE:
--------------------------------------------------------------------------------
1 | It's public domain, whatever, use it as you want :)
2 | 
3 | Best,
4 | Xuan-Son Nguyen
5 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Convert diffusion (Flux, SD, etc) safetensors to GGUF
 2 | 
 3 | **THIS IS A WIP**
 4 | 
 5 | Example usage:
 6 | 
 7 | ```sh
 8 | # prepare
 9 | pip install gguf torch safetensors
10 | 
11 | # example model from https://huggingface.co/black-forest-labs/FLUX.1-dev/tree/main
12 | python convert_diffusion_to_gguf.py ../models/FLUX.1-dev/flux1-dev.safetensors --arch flux --outtype Q4_0
13 | # output file: model-Q4_0.gguf
14 | 
15 | # to view help: python convert_diffusion_to_gguf.py -h
16 | ```
17 | 
18 | Note: `Q2_K` is not yet supported
19 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # UV
 98 | #   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #uv.lock
102 | 
103 | # poetry
104 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
106 | #   commonly ignored for libraries.
107 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108 | #poetry.lock
109 | #poetry.toml
110 | 
111 | # pdm
112 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113 | #pdm.lock
114 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
115 | #   in version control.
116 | #   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
117 | .pdm.toml
118 | .pdm-python
119 | .pdm-build/
120 | 
121 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
122 | __pypackages__/
123 | 
124 | # Celery stuff
125 | celerybeat-schedule
126 | celerybeat.pid
127 | 
128 | # SageMath parsed files
129 | *.sage.py
130 | 
131 | # Environments
132 | .env
133 | .venv
134 | env/
135 | venv/
136 | ENV/
137 | env.bak/
138 | venv.bak/
139 | 
140 | # Spyder project settings
141 | .spyderproject
142 | .spyproject
143 | 
144 | # Rope project settings
145 | .ropeproject
146 | 
147 | # mkdocs documentation
148 | /site
149 | 
150 | # mypy
151 | .mypy_cache/
152 | .dmypy.json
153 | dmypy.json
154 | 
155 | # Pyre type checker
156 | .pyre/
157 | 
158 | # pytype static type analyzer
159 | .pytype/
160 | 
161 | # Cython debug symbols
162 | cython_debug/
163 | 
164 | # PyCharm
165 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
166 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
167 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
168 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
169 | #.idea/
170 | 
171 | # Abstra
172 | # Abstra is an AI-powered process automation framework.
173 | # Ignore directories containing user credentials, local state, and settings.
174 | # Learn more at https://abstra.io/docs
175 | .abstra/
176 | 
177 | # Visual Studio Code
178 | #  Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore 
179 | #  that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
180 | #  and can be added to the global gitignore or merged into this file. However, if you prefer, 
181 | #  you could uncomment the following to ignore the entire vscode folder
182 | # .vscode/
183 | 
184 | # Ruff stuff:
185 | .ruff_cache/
186 | 
187 | # PyPI configuration file
188 | .pypirc
189 | 
190 | *.gguf
191 | *.safetensors
192 | 
193 | tmp
194 | 


--------------------------------------------------------------------------------
/convert_diffusion_to_gguf.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | from __future__ import annotations
  5 | 
  6 | import ast
  7 | import logging
  8 | import argparse
  9 | import contextlib
 10 | import json
 11 | import safetensors.torch
 12 | import os
 13 | import re
 14 | import sys
 15 | from enum import IntEnum
 16 | from pathlib import Path
 17 | from hashlib import sha256
 18 | from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
 19 | from itertools import chain
 20 | from torch import Tensor
 21 | 
 22 | import math
 23 | import numpy as np
 24 | import torch
 25 | import gguf
 26 | import ctypes
 27 | 
 28 | 
 29 | # TODO: add more:
 30 | SUPPORTED_ARCHS = ["flux", "sd3", "ltxv", "hyvid", "wan", "hidream"]
 31 | 
 32 | logger = logging.getLogger(__name__)
 33 | 
 34 | class QuantConfig():
 35 |     ftype: gguf.LlamaFileType
 36 |     qtype: gguf.GGMLQuantizationType
 37 | 
 38 |     def __init__(self, ftype: gguf.LlamaFileType, qtype: gguf.GGMLQuantizationType):
 39 |         self.ftype = ftype
 40 |         self.qtype = qtype
 41 | 
 42 | 
 43 | qconfig_map: dict[str, QuantConfig] = {
 44 |     "F16": QuantConfig(gguf.LlamaFileType.MOSTLY_F16, gguf.GGMLQuantizationType.F16),
 45 |     "BF16": QuantConfig(gguf.LlamaFileType.MOSTLY_BF16, gguf.GGMLQuantizationType.BF16),
 46 |     "Q8_0": QuantConfig(gguf.LlamaFileType.MOSTLY_Q8_0, gguf.GGMLQuantizationType.Q8_0),
 47 |     "Q6_K": QuantConfig(gguf.LlamaFileType.MOSTLY_Q6_K, gguf.GGMLQuantizationType.Q6_K),
 48 |     "Q5_K_S": QuantConfig(gguf.LlamaFileType.MOSTLY_Q5_K_S, gguf.GGMLQuantizationType.Q5_K),
 49 |     "Q5_1": QuantConfig(gguf.LlamaFileType.MOSTLY_Q5_1, gguf.GGMLQuantizationType.Q5_1),
 50 |     "Q5_0": QuantConfig(gguf.LlamaFileType.MOSTLY_Q5_0, gguf.GGMLQuantizationType.Q5_0),
 51 |     "Q4_K_S": QuantConfig(gguf.LlamaFileType.MOSTLY_Q4_K_S, gguf.GGMLQuantizationType.Q4_K),
 52 |     "Q4_1": QuantConfig(gguf.LlamaFileType.MOSTLY_Q4_1, gguf.GGMLQuantizationType.Q4_1),
 53 |     "Q4_0": QuantConfig(gguf.LlamaFileType.MOSTLY_Q4_0, gguf.GGMLQuantizationType.Q4_0),
 54 |     "Q3_K_S": QuantConfig(gguf.LlamaFileType.MOSTLY_Q3_K_S, gguf.GGMLQuantizationType.Q3_K),
 55 |     #"Q2_S": QuantConfig(gguf.LlamaFileType.MOSTLY_Q2_K, gguf.GGMLQuantizationType.Q2_K), # not yet supported in python
 56 | }
 57 | 
 58 | 
 59 | # tree of lazy tensors
 60 | class LazyTorchTensor(gguf.LazyBase):
 61 |     _tensor_type = torch.Tensor
 62 |     # to keep the type-checker happy
 63 |     dtype: torch.dtype
 64 |     shape: torch.Size
 65 | 
 66 |     # only used when converting a torch.Tensor to a np.ndarray
 67 |     _dtype_map: dict[torch.dtype, type] = {
 68 |         torch.float16: np.float16,
 69 |         torch.float32: np.float32,
 70 |     }
 71 | 
 72 |     # used for safetensors slices
 73 |     # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046
 74 |     # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734
 75 |     _dtype_str_map: dict[str, torch.dtype] = {
 76 |         "F64": torch.float64,
 77 |         "F32": torch.float32,
 78 |         "BF16": torch.bfloat16,
 79 |         "F16": torch.float16,
 80 |         # "U64": torch.uint64,
 81 |         "I64": torch.int64,
 82 |         # "U32": torch.uint32,
 83 |         "I32": torch.int32,
 84 |         # "U16": torch.uint16,
 85 |         "I16": torch.int16,
 86 |         "U8": torch.uint8,
 87 |         "I8": torch.int8,
 88 |         "BOOL": torch.bool,
 89 |         "F8_E4M3": torch.float8_e4m3fn,
 90 |         "F8_E5M2": torch.float8_e5m2,
 91 |     }
 92 | 
 93 |     def numpy(self) -> gguf.LazyNumpyTensor:
 94 |         dtype = self._dtype_map[self.dtype]
 95 |         return gguf.LazyNumpyTensor(
 96 |             meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
 97 |             args=(self,),
 98 |             func=(lambda s: s.numpy())
 99 |         )
100 | 
101 |     @classmethod
102 |     def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor:
103 |         return torch.empty(size=shape, dtype=dtype, device="meta")
104 | 
105 |     @classmethod
106 |     def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
107 |         dtype = cls._dtype_str_map[st_slice.get_dtype()]
108 |         shape: tuple[int, ...] = tuple(st_slice.get_shape())
109 |         lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:])
110 |         return cast(torch.Tensor, lazy)
111 | 
112 |     @classmethod
113 |     def __torch_function__(cls, func, types, args=(), kwargs=None):
114 |         del types  # unused
115 | 
116 |         if kwargs is None:
117 |             kwargs = {}
118 | 
119 |         if func is torch.Tensor.numpy:
120 |             return args[0].numpy()
121 | 
122 |         return cls._wrap_fn(func)(*args, **kwargs)
123 | 
124 | 
class Converter:
    """Queues every tensor of one safetensors file on a GGUF writer and writes the result."""

    path_safetensors: Path
    endianess: gguf.GGUFEndian
    outtype: QuantConfig
    outfile: Path
    gguf_writer: gguf.GGUFWriter

    def __init__(self, arch: str, path_safetensors: Path, endianess: gguf.GGUFEndian, outtype: QuantConfig, outfile: Path):
        """Create the writer and register all tensors found in `path_safetensors`.

        Nothing is written to `outfile` here; call write() for that.
        """
        self.path_safetensors = path_safetensors
        self.endianess = endianess
        self.outtype = outtype
        self.outfile = outfile

        writer = gguf.GGUFWriter(path=None, arch=arch, endianess=self.endianess)
        self.gguf_writer = writer
        writer.add_file_type(self.outtype.ftype)
        writer.add_type("diffusion") # for HF hub to detect the type correctly

        # load tensors and process
        from safetensors import safe_open
        ctx = cast(ContextManager[Any], safe_open(path_safetensors, framework="pt", device="cpu"))
        with ctx as model_part:
            for name in model_part.keys():
                lazy_tensor = LazyTorchTensor.from_safetensors_slice(model_part.get_slice(name))
                self.process_tensor(name, lazy_tensor)


    def process_tensor(self, name: str, data_torch: LazyTorchTensor) -> None:
        """Convert one tensor to its target type and add it to the writer.

        1-D tensors always target F32; all others target the configured
        quantization type. When quantization raises QuantError the tensor
        falls back to F16.
        """
        current_dtype = data_torch.dtype
        if len(data_torch.shape) == 1:
            target_dtype = gguf.GGMLQuantizationType.F32
        else:
            target_dtype = self.outtype.qtype

        # only float16/float32 are converted to numpy directly; everything else goes through float32
        if current_dtype not in (torch.float16, torch.float32):
            data_torch = data_torch.to(torch.float32)

        data = data_torch.numpy()

        # NOTE(review): this compares a torch dtype with a GGML quantization type,
        # so it looks like it is always true -- confirm the intent.
        if current_dtype != target_dtype:
            from custom_quants import quantize as custom_quantize, QuantError
            try:
                data = custom_quantize(data, target_dtype)
            except QuantError as e:
                logger.warning("%s, %s", e, "falling back to F16")
                target_dtype = gguf.GGMLQuantizationType.F16
                data = custom_quantize(data, target_dtype)

        # reverse shape to make it similar to the internal ggml dimension order
        if data.dtype == np.uint8:
            shape = gguf.quant_shape_from_byte_shape(data.shape, target_dtype)
        else:
            shape = data.shape
        shape_str = "{" + ", ".join(map(str, reversed(shape))) + "}"
        label = f"{name},".ljust(32)
        logger.info(f"{label} {current_dtype} --> {target_dtype.name}, shape = {shape_str}")

        # add tensor to gguf
        self.gguf_writer.add_tensor(name, data, raw_dtype=target_dtype)

    def write(self) -> None:
        """Write header, key/value data and tensors to `outfile`, then close the writer."""
        writer = self.gguf_writer
        writer.write_header_to_file(path=self.outfile)
        writer.write_kv_data_to_file()
        writer.write_tensors_to_file(progress=True)
        writer.close()
184 | 
# https://github.com/bghira/SimpleTuner/blob/cea2457ab063f6dedb9e697830ae68a96be90641/helpers/training/save_hooks.py#L64
def _merge_sharded_checkpoints(folder: Path):
    """Load all shards named by the folder's shard index into a single dict.

    Reads `diffusion_pytorch_model.safetensors.index.json` from `folder`, opens
    every distinct shard file listed in its "weight_map" and collects the
    tensors whose keys appear in that map.

    Raises:
        KeyError: the index has no "weight_map" entry.
        FileNotFoundError: a shard file named in the index does not exist.
    """
    index_path = folder / "diffusion_pytorch_model.safetensors.index.json"
    with open(index_path, "r") as index_file:
        weight_map = json.load(index_file).get("weight_map", None)
    if weight_map is None:
        raise KeyError("'weight_map' key not found in the shard index file.")

    merged_state_dict = {}

    # the set ensures each shard file is opened once, however many tensors map to it
    for file_name in set(weight_map.values()):
        part_file_path = folder / file_name
        if not os.path.exists(part_file_path):
            raise FileNotFoundError(f"Part file {file_name} not found.")

        with safetensors.safe_open(part_file_path, framework="pt", device="cpu") as shard:
            merged_state_dict.update(
                (tensor_key, shard.get_tensor(tensor_key))
                for tensor_key in shard.keys()
                if tensor_key in weight_map
            )

    return merged_state_dict
209 | 
210 | 
def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
    """Parse and validate the command line.

    Args:
        argv: argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Returns:
        The parsed namespace with ``model``, ``arch``, ``outtype``,
        ``outfile``, ``bigendian`` and ``verbose``.

    Exits through ``parser.error`` when ``model`` or ``--arch`` is missing.
    """
    parser = argparse.ArgumentParser(
        description="Convert a flux model to GGUF")
    parser.add_argument(
        "--outfile", type=Path, default=Path("model-{ftype}.gguf"),
        help="path to write to; default: 'model-{ftype}.gguf' ; note: {ftype} will be replaced by the outtype",
    )
    parser.add_argument(
        "--outtype", type=str, choices=qconfig_map.keys(), default="F16",
        help="output quantization scheme",
    )
    # `choices` makes argparse reject unknown architectures itself, so no
    # separate membership check is needed after parsing.
    parser.add_argument(
        "--arch", type=str, choices=SUPPORTED_ARCHS,
        help="output model architecture",
    )
    parser.add_argument(
        "--bigendian", action="store_true",
        help="model is executed on big endian machine",
    )
    parser.add_argument(
        "model", type=Path,
        help="safetensors model file, or a directory containing safetensors file(s)",
        nargs="?",
    )
    parser.add_argument(
        "--verbose", action="store_true",
        help="increase output verbosity",
    )

    args = parser.parse_args(argv)
    if args.model is None:
        parser.error("the following arguments are required: model")
    if args.arch is None:
        parser.error("the following arguments are required: --arch")
    return args
248 | 
def _resolve_model_path(model: Path) -> Path:
    """Return the single safetensors file to convert for `model`.

    A non-directory path is returned unchanged. For a directory: one
    safetensors file is used as-is; several are treated as a sharded
    checkpoint, merged and saved to `merged_state_dict.safetensors` in the
    current working directory. Exits with status 1 when the directory holds
    no safetensors file, or several without a shard index.
    """
    if not model.is_dir():
        return model

    logging.info("Supplied a directory.")
    files = list(model.glob('*.safetensors'))
    if not files:
        logging.error("No safetensors files found.")
        sys.exit(1)
    if len(files) == 1:
        logging.info(f"Assinging {files[0]} to `args.model`")
        return files[0]

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    index_name = "diffusion_pytorch_model.safetensors.index.json"
    if not (model / index_name).is_file():
        logging.error(f"Found {len(files)} safetensors files in {model} but no {index_name} shard index.")
        sys.exit(1)

    merged_state_dict = _merge_sharded_checkpoints(model)
    filepath = "merged_state_dict.safetensors"
    safetensors.torch.save_file(merged_state_dict, filepath)
    logging.info(f"Serialized merged state dict to {filepath}")
    return Path(filepath)


def main() -> None:
    """Command line entry point: validate the paths, then convert and write the GGUF file.

    Exits with status 1 when the model path does not exist, cannot be resolved
    to a `.safetensors` file, or the output file does not end in `.gguf`.
    """
    args = parse_args()

    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    if not args.model.is_dir() and not args.model.is_file():
        logging.error(f"Model path {args.model} does not exist.")
        sys.exit(1)

    args.model = _resolve_model_path(args.model)

    if args.model.suffix != ".safetensors":
        logging.error(f"Model path {args.model} is not a safetensors file.")
        sys.exit(1)

    if args.outfile.suffix != ".gguf":
        logging.error("Output file must have .gguf extension.")
        sys.exit(1)

    qconfig = qconfig_map[args.outtype]
    # Substitute only the documented {ftype} placeholder; str.format would
    # raise on any other brace that happens to be part of the path.
    outfile = Path(str(args.outfile).replace("{ftype}", args.outtype.upper()))

    logger.info(f"Converting model in {args.model} to {outfile} with quantization {args.outtype}")
    converter = Converter(
        arch=args.arch,
        path_safetensors=args.model,
        endianess=gguf.GGUFEndian.BIG if args.bigendian else gguf.GGUFEndian.LITTLE,
        outtype=qconfig,
        outfile=outfile
    )
    converter.write()
    logger.info(f"Conversion complete. Output written to {outfile}, architecture: {args.arch}, quantization: {qconfig.qtype.name}")

if __name__ == "__main__":
    main()
304 | 


--------------------------------------------------------------------------------
/custom_quants.py:
--------------------------------------------------------------------------------
   1 | from __future__ import annotations
   2 | from abc import ABC, abstractmethod
   3 | from typing import Any, Callable, Sequence
   4 | from math import log2, ceil
   5 | 
   6 | from numpy.typing import DTypeLike
   7 | 
   8 | from gguf.constants import GGML_QUANT_SIZES, GGMLQuantizationType, QK_K
   9 | from gguf.lazy import LazyNumpyTensor
  10 | 
  11 | import numpy as np
  12 | 
  13 | 
  14 | def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType) -> tuple[int, ...]:
  15 |     block_size, type_size = GGML_QUANT_SIZES[quant_type]
  16 |     if shape[-1] % block_size != 0:
  17 |         raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})")
  18 |     return (*shape[:-1], shape[-1] // block_size * type_size)
  19 | 
  20 | 
  21 | def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType) -> tuple[int, ...]:
  22 |     block_size, type_size = GGML_QUANT_SIZES[quant_type]
  23 |     if shape[-1] % type_size != 0:
  24 |         raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})")
  25 |     return (*shape[:-1], shape[-1] // type_size * block_size)
  26 | 
  27 | 
  28 | # This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time
  29 | def _apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray:
  30 |     rows = arr.reshape((-1, arr.shape[-1]))
  31 |     osize = 1
  32 |     for dim in oshape:
  33 |         osize *= dim
  34 |     out = np.empty(shape=osize, dtype=otype)
  35 |     # compute over groups of 16 rows (arbitrary, but seems good for performance)
  36 |     n_groups = (rows.shape[0] // 16) or 1
  37 |     np.concatenate([func(group).ravel() for group in np.array_split(rows, n_groups)], axis=0, out=out)
  38 |     return out.reshape(oshape)
  39 | 
  40 | 
  41 | # round away from zero
  42 | # ref: https://stackoverflow.com/a/59143326/22827863
  43 | def np_roundf(n: np.ndarray) -> np.ndarray:
  44 |     a = abs(n)
  45 |     floored = np.floor(a)
  46 |     b = floored + np.floor(2 * (a - floored))
  47 |     return np.sign(n) * b
  48 | 
  49 | 
  50 | class QuantError(Exception): ...
  51 | 
  52 | 
  53 | _type_traits: dict[GGMLQuantizationType, type[__Quant]] = {}
  54 | 
  55 | 
  56 | def quantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
  57 |     if qtype == GGMLQuantizationType.F32:
  58 |         return data.astype(np.float32, copy=False)
  59 |     elif qtype == GGMLQuantizationType.F16:
  60 |         return data.astype(np.float16, copy=False)
  61 |     elif (q := _type_traits.get(qtype)) is not None:
  62 |         return q.quantize(data)
  63 |     else:
  64 |         raise NotImplementedError(f"Quantization for {qtype.name} is not yet implemented")
  65 | 
  66 | 
  67 | def dequantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
  68 |     if qtype == GGMLQuantizationType.F32:
  69 |         return data.view(np.float32)
  70 |     elif qtype == GGMLQuantizationType.F16:
  71 |         return data.view(np.float16).astype(np.float32)
  72 |     elif (q := _type_traits.get(qtype)) is not None:
  73 |         return q.dequantize(data)
  74 |     else:
  75 |         raise NotImplementedError(f"Dequantization for {qtype.name} is not yet implemented")
  76 | 
  77 | 
  78 | class __Quant(ABC):
  79 |     qtype: GGMLQuantizationType
  80 |     block_size: int
  81 |     type_size: int
  82 | 
  83 |     grid: np.ndarray[Any, np.dtype[np.float32]] | None = None
  84 |     grid_shape: tuple[int, int] = (0, 0)
  85 |     grid_map: tuple[int | float, ...] = ()
  86 |     grid_hex: bytes | None = None
  87 | 
  88 |     def __init__(self):
  89 |         return TypeError("Quant conversion classes can't have instances")
  90 | 
  91 |     def __init_subclass__(cls, qtype: GGMLQuantizationType) -> None:
  92 |         cls.qtype = qtype
  93 |         cls.block_size, cls.type_size = GGML_QUANT_SIZES[qtype]
  94 |         cls.__quantize_lazy = LazyNumpyTensor._wrap_fn(
  95 |             cls.__quantize_array,
  96 |             meta_noop=(np.uint8, cls.__shape_to_bytes)
  97 |         )
  98 |         cls.__dequantize_lazy = LazyNumpyTensor._wrap_fn(
  99 |             cls.__dequantize_array,
 100 |             meta_noop=(np.float32, cls.__shape_from_bytes)
 101 |         )
 102 |         assert qtype not in _type_traits
 103 |         _type_traits[qtype] = cls
 104 | 
 105 |     @classmethod
 106 |     def init_grid(cls):
 107 |         if cls.grid is not None or cls.grid_hex is None:
 108 |             return
 109 | 
 110 |         bits_per_elem = ceil(log2(len(cls.grid_map)))
 111 |         assert bits_per_elem != 0, cls.qtype.name
 112 |         elems_per_byte = 8 // bits_per_elem
 113 | 
 114 |         grid = np.frombuffer(cls.grid_hex, dtype=np.uint8)
 115 |         # decode hexadecimal chars from grid
 116 |         grid = grid.reshape((-1, 2))
 117 |         grid = (np.where(grid > 0x40, grid + 9, grid) & 0x0F) << np.array([4, 0], dtype=np.uint8).reshape((1, 2))
 118 |         grid = grid[..., 0] | grid[..., 1]
 119 |         # unpack the grid values
 120 |         grid = grid.reshape((-1, 1)) >> np.array([i for i in range(0, 8, 8 // elems_per_byte)], dtype=np.uint8).reshape((1, elems_per_byte))
 121 |         grid = (grid & ((1 << bits_per_elem) - 1)).reshape((-1, 1))
 122 |         grid_map = np.array(cls.grid_map, dtype=np.float32).reshape((1, -1))
 123 |         grid = np.take_along_axis(grid_map, grid, axis=-1)
 124 |         cls.grid = grid.reshape((1, 1, *cls.grid_shape))
 125 | 
 126 |     @classmethod
 127 |     @abstractmethod
 128 |     def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
 129 |         raise NotImplementedError
 130 | 
 131 |     @classmethod
 132 |     @abstractmethod
 133 |     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
 134 |         raise NotImplementedError
 135 | 
 136 |     @classmethod
 137 |     def quantize_rows(cls, rows: np.ndarray) -> np.ndarray:
 138 |         rows = rows.astype(np.float32, copy=False)
 139 |         shape = rows.shape
 140 |         n_blocks = rows.size // cls.block_size
 141 |         blocks = rows.reshape((n_blocks, cls.block_size))
 142 |         blocks = cls.quantize_blocks(blocks)
 143 |         assert blocks.dtype == np.uint8
 144 |         assert blocks.shape[-1] == cls.type_size
 145 |         return blocks.reshape(cls.__shape_to_bytes(shape))
 146 | 
 147 |     @classmethod
 148 |     def dequantize_rows(cls, rows: np.ndarray) -> np.ndarray:
 149 |         rows = rows.view(np.uint8)
 150 |         shape = rows.shape
 151 |         n_blocks = rows.size // cls.type_size
 152 |         blocks = rows.reshape((n_blocks, cls.type_size))
 153 |         blocks = cls.dequantize_blocks(blocks)
 154 |         assert blocks.dtype == np.float32
 155 |         assert blocks.shape[-1] == cls.block_size
 156 |         return blocks.reshape(cls.__shape_from_bytes(shape))
 157 | 
 158 |     @classmethod
 159 |     def __shape_to_bytes(cls, shape: Sequence[int]):
 160 |         return quant_shape_to_byte_shape(shape, cls.qtype)
 161 | 
 162 |     @classmethod
 163 |     def __shape_from_bytes(cls, shape: Sequence[int]):
 164 |         return quant_shape_from_byte_shape(shape, cls.qtype)
 165 | 
 166 |     @classmethod
 167 |     def __quantize_array(cls, array: np.ndarray) -> np.ndarray:
 168 |         return _apply_over_grouped_rows(cls.quantize_rows, arr=array, otype=np.uint8, oshape=cls.__shape_to_bytes(array.shape))
 169 | 
 170 |     @classmethod
 171 |     def __dequantize_array(cls, array: np.ndarray) -> np.ndarray:
 172 |         cls.init_grid()
 173 |         return _apply_over_grouped_rows(cls.dequantize_rows, arr=array, otype=np.float32, oshape=cls.__shape_from_bytes(array.shape))
 174 | 
 175 |     @classmethod
 176 |     def __quantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any:
 177 |         pass
 178 | 
 179 |     @classmethod
 180 |     def __dequantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any:
 181 |         pass
 182 | 
 183 |     @classmethod
 184 |     def can_quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> bool:
 185 |         return tensor.shape[-1] % cls.block_size == 0
 186 | 
 187 |     @classmethod
 188 |     def quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray:
 189 |         if not cls.can_quantize(tensor):
 190 |             raise QuantError(f"Can't quantize tensor with shape {tensor.shape} to {cls.qtype.name}")
 191 |         if isinstance(tensor, LazyNumpyTensor):
 192 |             return cls.__quantize_lazy(tensor)
 193 |         else:
 194 |             return cls.__quantize_array(tensor)
 195 | 
 196 |     @classmethod
 197 |     def dequantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray:
 198 |         if isinstance(tensor, LazyNumpyTensor):
 199 |             return cls.__dequantize_lazy(tensor)
 200 |         else:
 201 |             return cls.__dequantize_array(tensor)
 202 | 
 203 | 
 204 | class BF16(__Quant, qtype=GGMLQuantizationType.BF16):
 205 |     @classmethod
 206 |     # same as ggml_compute_fp32_to_bf16 in ggml-impl.h
 207 |     def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
 208 |         n = blocks.view(np.uint32)
 209 |         # force nan to quiet
 210 |         n = np.where((n & 0x7fffffff) > 0x7f800000, (n & np.uint32(0xffff0000)) | np.uint32(64 << 16), n)
 211 |         # round to nearest even
 212 |         n = (np.uint64(n) + (0x7fff + ((n >> 16) & 1))) >> 16
 213 |         return n.astype(np.uint16).view(np.uint8)
 214 | 
 215 |     @classmethod
 216 |     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
 217 |         return (blocks.view(np.int16).astype(np.int32) << 16).view(np.float32)
 218 | 
 219 | 
 220 | class Q4_0(__Quant, qtype=GGMLQuantizationType.Q4_0):
 221 |     @classmethod
 222 |     def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
 223 |         n_blocks = blocks.shape[0]
 224 | 
 225 |         imax = abs(blocks).argmax(axis=-1, keepdims=True)
 226 |         max = np.take_along_axis(blocks, imax, axis=-1)
 227 | 
 228 |         d = max / -8
 229 |         with np.errstate(divide="ignore"):
 230 |             id = np.where(d == 0, 0, 1 / d)
 231 |         # FIXME: Q4_0's reference rounding is cursed and depends on FMA
 232 |         qs = np.trunc((np.float64(blocks) * np.float64(id)) + np.float64(8.5), dtype=np.float32).astype(np.uint8).clip(0, 15)
 233 | 
 234 |         qs = qs.reshape((n_blocks, 2, cls.block_size // 2))
 235 |         qs = qs[..., 0, :] | (qs[..., 1, :] << np.uint8(4))
 236 | 
 237 |         d = d.astype(np.float16).view(np.uint8)
 238 | 
 239 |         return np.concatenate([d, qs], axis=-1)
 240 | 
 241 |     @classmethod
 242 |     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
 243 |         n_blocks = blocks.shape[0]
 244 | 
 245 |         d, qs = np.hsplit(blocks, [2])
 246 | 
 247 |         d = d.view(np.float16).astype(np.float32)
 248 | 
 249 |         qs = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
 250 |         qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1)).astype(np.int8) - np.int8(8)
 251 | 
 252 |         return (d * qs.astype(np.float32))
 253 | 
 254 | 
 255 | class Q4_1(__Quant, qtype=GGMLQuantizationType.Q4_1):
 256 |     @classmethod
 257 |     def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
 258 |         n_blocks = blocks.shape[0]
 259 | 
 260 |         max = blocks.max(axis=-1, keepdims=True)
 261 |         min = blocks.min(axis=-1, keepdims=True)
 262 | 
 263 |         d = (max - min) / 15
 264 |         with np.errstate(divide="ignore"):
 265 |             id = np.where(d == 0, 0, 1 / d)
 266 |         qs = np.trunc((blocks - min) * id + np.float32(0.5), dtype=np.float32).astype(np.uint8).clip(0, 15)
 267 | 
 268 |         qs = qs.reshape((n_blocks, 2, cls.block_size // 2))
 269 |         qs = qs[..., 0, :] | (qs[..., 1, :] << np.uint8(4))
 270 | 
 271 |         d = d.astype(np.float16).view(np.uint8)
 272 |         m = min.astype(np.float16).view(np.uint8)
 273 | 
 274 |         return np.concatenate([d, m, qs], axis=-1)
 275 | 
 276 |     @classmethod
 277 |     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
 278 |         n_blocks = blocks.shape[0]
 279 | 
 280 |         d, rest = np.hsplit(blocks, [2])
 281 |         m, qs = np.hsplit(rest, [2])
 282 | 
 283 |         d = d.view(np.float16).astype(np.float32)
 284 |         m = m.view(np.float16).astype(np.float32)
 285 | 
 286 |         qs = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
 287 |         qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1)).astype(np.float32)
 288 | 
 289 |         return (d * qs) + m
 290 | 
 291 | 
 292 | class Q5_0(__Quant, qtype=GGMLQuantizationType.Q5_0):
 293 |     @classmethod
 294 |     def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
 295 |         n_blocks = blocks.shape[0]
 296 | 
 297 |         imax = abs(blocks).argmax(axis=-1, keepdims=True)
 298 |         max = np.take_along_axis(blocks, imax, axis=-1)
 299 | 
 300 |         d = max / -16
 301 |         with np.errstate(divide="ignore"):
 302 |             id = np.where(d == 0, 0, 1 / d)
 303 |         # FIXME: Q5_0's reference rounding is cursed and depends on FMA
 304 |         q = np.trunc((np.float64(blocks) * np.float64(id)) + np.float64(16.5), dtype=np.float32).astype(np.uint8).clip(0, 31)
 305 | 
 306 |         qs = q.reshape((n_blocks, 2, cls.block_size // 2))
 307 |         qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4))
 308 | 
 309 |         qh = np.packbits(q.reshape((n_blocks, 1, 32)) >> np.uint8(4), axis=-1, bitorder="little").reshape(n_blocks, 4)
 310 | 
 311 |         d = d.astype(np.float16).view(np.uint8)
 312 | 
 313 |         return np.concatenate([d, qh, qs], axis=-1)
 314 | 
 315 |     @classmethod
 316 |     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
 317 |         n_blocks = blocks.shape[0]
 318 | 
 319 |         d, rest = np.hsplit(blocks, [2])
 320 |         qh, qs = np.hsplit(rest, [4])
 321 | 
 322 |         d = d.view(np.float16).astype(np.float32)
 323 |         qh = qh.view(np.uint32)
 324 | 
 325 |         qh = qh.reshape((n_blocks, 1)) >> np.array([i for i in range(32)], dtype=np.uint32).reshape((1, 32))
 326 |         ql = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
 327 |         qh = (qh & np.uint32(0x01)).astype(np.uint8)
 328 |         ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1))
 329 | 
 330 |         qs = (ql | (qh << np.uint8(4))).astype(np.int8) - np.int8(16)
 331 | 
 332 |         return (d * qs.astype(np.float32))
 333 | 
 334 | 
 335 | class Q5_1(__Quant, qtype=GGMLQuantizationType.Q5_1):
 336 |     @classmethod
 337 |     def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
 338 |         n_blocks = blocks.shape[0]
 339 | 
 340 |         max = blocks.max(axis=-1, keepdims=True)
 341 |         min = blocks.min(axis=-1, keepdims=True)
 342 | 
 343 |         d = (max - min) / 31
 344 |         with np.errstate(divide="ignore"):
 345 |             id = np.where(d == 0, 0, 1 / d)
 346 |         q = np.trunc((blocks - min) * id + np.float32(0.5), dtype=np.float32).astype(np.uint8).clip(0, 31)
 347 | 
 348 |         qs = q.reshape((n_blocks, 2, cls.block_size // 2))
 349 |         qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4))
 350 | 
 351 |         qh = np.packbits(q.reshape((n_blocks, 1, 32)) >> np.uint8(4), axis=-1, bitorder="little").reshape(n_blocks, 4)
 352 | 
 353 |         d = d.astype(np.float16).view(np.uint8)
 354 |         m = min.astype(np.float16).view(np.uint8)
 355 | 
 356 |         return np.concatenate([d, m, qh, qs], axis=-1)
 357 | 
 358 |     @classmethod
 359 |     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
 360 |         n_blocks = blocks.shape[0]
 361 | 
 362 |         d, rest = np.hsplit(blocks, [2])
 363 |         m, rest = np.hsplit(rest, [2])
 364 |         qh, qs = np.hsplit(rest, [4])
 365 | 
 366 |         d = d.view(np.float16).astype(np.float32)
 367 |         m = m.view(np.float16).astype(np.float32)
 368 |         qh = qh.view(np.uint32)
 369 | 
 370 |         qh = qh.reshape((n_blocks, 1)) >> np.array([i for i in range(32)], dtype=np.uint32).reshape((1, 32))
 371 |         ql = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
 372 |         qh = (qh & np.uint32(0x01)).astype(np.uint8)
 373 |         ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1))
 374 | 
 375 |         qs = (ql | (qh << np.uint8(4))).astype(np.float32)
 376 | 
 377 |         return (d * qs) + m
 378 | 
 379 | 
 380 | class Q8_0(__Quant, qtype=GGMLQuantizationType.Q8_0):
 381 |     @classmethod
 382 |     # Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c
 383 |     def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
 384 | 
 385 |         d = abs(blocks).max(axis=1, keepdims=True) / 127
 386 |         with np.errstate(divide="ignore"):
 387 |             id = np.where(d == 0, 0, 1 / d)
 388 |         qs = np_roundf(blocks * id)
 389 | 
 390 |         # (n_blocks, 2)
 391 |         d = d.astype(np.float16).view(np.uint8)
 392 |         # (n_blocks, block_size)
 393 |         qs = qs.astype(np.int8).view(np.uint8)
 394 | 
 395 |         return np.concatenate([d, qs], axis=1)
 396 | 
 397 |     @classmethod
 398 |     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
 399 |         d, x = np.split(blocks, [2], axis=1)
 400 |         d = d.view(np.float16).astype(np.float32)
 401 |         x = x.view(np.int8).astype(np.float32)
 402 | 
 403 |         return (x * d)
 404 | 
 405 | 
 406 | class Q2_K(__Quant, qtype=GGMLQuantizationType.Q2_K):
 407 |     @classmethod
 408 |     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
 409 |         n_blocks = blocks.shape[0]
 410 | 
 411 |         scales, rest = np.hsplit(blocks, [QK_K // 16])
 412 |         qs, rest = np.hsplit(rest, [QK_K // 4])
 413 |         d, dmin = np.hsplit(rest, [2])
 414 | 
 415 |         d = d.view(np.float16).astype(np.float32)
 416 |         dmin = dmin.view(np.float16).astype(np.float32)
 417 | 
 418 |         # (n_blocks, 16, 1)
 419 |         dl = (d * (scales & 0xF).astype(np.float32)).reshape((n_blocks, QK_K // 16, 1))
 420 |         ml = (dmin * (scales >> 4).astype(np.float32)).reshape((n_blocks, QK_K // 16, 1))
 421 | 
 422 |         shift = np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1))
 423 | 
 424 |         qs = (qs.reshape((n_blocks, -1, 1, 32)) >> shift) & np.uint8(3)
 425 | 
 426 |         qs = qs.reshape((n_blocks, QK_K // 16, 16)).astype(np.float32)
 427 | 
 428 |         qs = dl * qs - ml
 429 | 
 430 |         return qs.reshape((n_blocks, -1))
 431 | 
 432 | 
 433 | class Q3_K(__Quant, qtype=GGMLQuantizationType.Q3_K):
 434 |     @classmethod
 435 |     def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
 436 |         """
 437 |         Quantizes a numpy array of floats into Q3_K format.
 438 |         Vectorized implementation of the C++ reference code.
 439 |         """
 440 |         n_blocks = blocks.shape[0]
 441 |         sub_blocks = blocks.reshape((n_blocks, 16, 16))
 442 | 
 443 |         # --- Vectorized make_qx_quants logic for per-sub-block scales ---
 444 |         nmax_data = 4  # Quantization range for data: [-4, 3]
 445 | 
 446 |         flat_sub_blocks = sub_blocks.reshape(-1, 16)
 447 |         weights_data = flat_sub_blocks * flat_sub_blocks  # rmse_type=1 uses w=x*x
 448 | 
 449 |         # Find max absolute values for each sub-block
 450 |         abs_sub_blocks = np.abs(flat_sub_blocks)
 451 |         max_indices = np.argmax(abs_sub_blocks, axis=-1, keepdims=True)
 452 |         max_vals = np.take_along_axis(flat_sub_blocks, max_indices, axis=-1)
 453 | 
 454 |         # Iteratively find the best scale for each sub-block
 455 |         with np.errstate(divide="ignore", invalid="ignore"):
 456 |             initial_iscale = np.where(max_vals == 0, 0, -nmax_data / max_vals)
 457 | 
 458 |         # Initial calculation (is=0)
 459 |         l = np_roundf(flat_sub_blocks * initial_iscale).clip(-nmax_data, nmax_data - 1)
 460 |         sumlx = np.sum(weights_data * flat_sub_blocks * l, axis=-1)
 461 |         suml2 = np.sum(weights_data * l * l, axis=-1)
 462 | 
 463 |         with np.errstate(divide="ignore", invalid="ignore"):
 464 |             current_scales = np.divide(sumlx, suml2, out=np.zeros_like(sumlx), where=suml2 != 0)
 465 | 
 466 |         best_scores = current_scales * sumlx
 467 |         best_scales = current_scales.copy()
 468 | 
 469 |         # Iterative search over potential iscale adjustments
 470 |         for is_ in range(-9, 10):
 471 |             if is_ == 0: continue
 472 |             with np.errstate(divide="ignore", invalid="ignore"):
 473 |                 iscale_try = -(nmax_data + 0.1 * is_) / max_vals
 474 |                 iscale_try[max_vals == 0] = 0
 475 | 
 476 |             l_try = np_roundf(flat_sub_blocks * iscale_try).clip(-nmax_data, nmax_data - 1)
 477 |             sumlx_try = np.sum(weights_data * flat_sub_blocks * l_try, axis=-1)
 478 |             suml2_try = np.sum(weights_data * l_try * l_try, axis=-1)
 479 | 
 480 |             improvement_mask = (suml2_try > 0) & (sumlx_try * sumlx_try * suml2 > best_scores * suml2_try)
 481 |             if np.any(improvement_mask):
 482 |                 with np.errstate(divide="ignore", invalid="ignore"):
 483 |                     scales_try = np.divide(sumlx_try, suml2_try, out=np.zeros_like(sumlx_try), where=suml2_try != 0)
 484 |                 best_scores[improvement_mask] = (scales_try * sumlx_try)[improvement_mask]
 485 |                 best_scales[improvement_mask] = scales_try[improvement_mask]
 486 |                 # Update suml2 for the next comparison in the loop
 487 |                 suml2[improvement_mask] = suml2_try[improvement_mask]
 488 | 
 489 |         scales = best_scales.reshape(n_blocks, 16)
 490 | 
 491 |         # --- Vectorized logic to quantize the scales themselves ---
 492 |         nmax_scales = 32  # Quantization range for scales: [-32, 31]
 493 |         abs_scales = np.abs(scales)
 494 |         max_scale_indices = np.argmax(abs_scales, axis=-1, keepdims=True)
 495 |         max_scale_vals = np.take_along_axis(scales, max_scale_indices, axis=-1)
 496 | 
 497 |         with np.errstate(divide="ignore", invalid="ignore"):
 498 |             iscale_s = np.where(max_scale_vals == 0, 0, -nmax_scales / max_scale_vals)
 499 | 
 500 |         l_s = np_roundf(scales * iscale_s).clip(-nmax_scales, nmax_scales - 1)
 501 |         d_val = np.divide(np.sum(scales * l_s, axis=-1, keepdims=True),
 502 |                           np.sum(l_s * l_s, axis=-1, keepdims=True),
 503 |                           out=np.zeros((n_blocks,1)), where=np.sum(l_s*l_s, axis=-1, keepdims=True)!=0)
 504 | 
 505 |         # Pack the 6-bit quantized scales into 12 bytes
 506 |         l = (l_s + 32).astype(np.uint8)
 507 |         scales_packed = np.zeros((n_blocks, 12), dtype=np.uint8)
 508 |         l_low = l & 0x0F
 509 |         l_high = (l >> 4) & 0x03
 510 |         scales_packed[:, 0:8] = l_low[:, 0:8] | (l_low[:, 8:16] << 4)
 511 |         l_high_reshaped = l_high.reshape(n_blocks, 4, 4).transpose(0, 2, 1)
 512 |         packed_high_bits = l_high_reshaped[:, :, 0] | \
 513 |                            (l_high_reshaped[:, :, 1] << 2) | \
 514 |                            (l_high_reshaped[:, :, 2] << 4) | \
 515 |                            (l_high_reshaped[:, :, 3] << 6)
 516 |         scales_packed[:, 8:12] = packed_high_bits
 517 |         d = d_val.astype(np.float16).view(np.uint8)
 518 | 
 519 |         # --- Re-quantize data with final scales and pack ---
 520 |         sc_dequant = (l.astype(np.int8) - 32).astype(np.float32)
 521 |         d_eff = (d_val * sc_dequant).reshape(n_blocks, 16, 1)
 522 | 
 523 |         with np.errstate(divide="ignore", invalid="ignore"):
 524 |             l_data_float = np.divide(sub_blocks, d_eff, out=np.zeros_like(sub_blocks), where=d_eff != 0)
 525 | 
 526 |         l_data = (np.clip(np_roundf(l_data_float), -4, 3) + 4).astype(np.uint8)
 527 |         l_data = l_data.reshape(n_blocks, 256)
 528 | 
 529 |         # hmask stores the 3rd bit
 530 |         hmask_values = (l_data > 3).reshape(n_blocks, 8, 32).transpose(0, 2, 1)
 531 |         hmask = np.packbits(hmask_values, axis=-1, bitorder='little').reshape(n_blocks, -1)
 532 | 
 533 |         # qs stores the lower 2 bits
 534 |         l_data[l_data > 3] -= 4
 535 |         l_data_low = (l_data & 0x03).reshape(n_blocks, 2, 4, 32)
 536 |         qs_parts = l_data_low[:, :, 0, :] | \
 537 |                    (l_data_low[:, :, 1, :] << 2) | \
 538 |                    (l_data_low[:, :, 2, :] << 4) | \
 539 |                    (l_data_low[:, :, 3, :] << 6)
 540 |         qs = qs_parts.reshape(n_blocks, 64)
 541 | 
 542 |         return np.concatenate([hmask, qs, scales_packed, d], axis=1)
 543 | 
 544 |     @classmethod
 545 |     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
 546 |         n_blocks = blocks.shape[0]
 547 | 
 548 |         hmask, rest = np.hsplit(blocks, [QK_K // 8])
 549 |         qs, rest = np.hsplit(rest, [QK_K // 4])
 550 |         scales, d = np.hsplit(rest, [12])
 551 | 
 552 |         d = d.view(np.float16).astype(np.float32)
 553 | 
 554 |         # The scales are packed at 6-bit each in this pattern:
 555 |         #  0: IIIIAAAA
 556 |         #  1: JJJJBBBB
 557 |         #  2: KKKKCCCC
 558 |         #  3: LLLLDDDD
 559 |         #  4: MMMMEEEE
 560 |         #  5: NNNNFFFF
 561 |         #  6: OOOOGGGG
 562 |         #  7: PPPPHHHH
 563 |         #  8: MMIIEEAA
 564 |         #  9: NNJJFFBB
 565 |         # 10: OOKKGGCC
 566 |         # 11: PPLLHHDD
 567 |         lscales, hscales = np.hsplit(scales, [8])
 568 |         lscales = lscales.reshape((n_blocks, 1, 8)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 2, 1))
 569 |         lscales = lscales.reshape((n_blocks, 16))
 570 |         hscales = hscales.reshape((n_blocks, 1, 4)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 4, 1))
 571 |         hscales = hscales.reshape((n_blocks, 16))
 572 |         scales = (lscales & np.uint8(0x0F)) | ((hscales & np.uint8(0x03)) << np.uint8(4))
 573 |         scales = (scales.astype(np.int8) - np.int8(32)).astype(np.float32)
 574 | 
 575 |         dl = (d * scales).reshape((n_blocks, 16, 1))
 576 | 
 577 |         ql = qs.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1))
 578 |         qh = hmask.reshape(n_blocks, -1, 1, 32) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8, 1))
 579 |         ql = ql.reshape((n_blocks, 16, QK_K // 16)) & np.uint8(3)
 580 |         qh = (qh.reshape((n_blocks, 16, QK_K // 16)) & np.uint8(1))
 581 |         qh = qh ^ np.uint8(1)  # strangely, the offset is zero when the bitmask is 1
 582 |         q = (ql.astype(np.int8) - (qh << np.uint8(2)).astype(np.int8)).astype(np.float32)
 583 | 
 584 |         return (dl * q).reshape((n_blocks, QK_K))
 585 | 
 586 | 
class Q4_K(__Quant, qtype=GGMLQuantizationType.Q4_K):
    """Q4_K codec: 4-bit codes with 6-bit per-sub-block scales and mins.

    Row layout written by ``quantize_blocks`` and read by
    ``dequantize_blocks``: float16 ``d``, float16 ``dmin``, ``K_SCALE_SIZE``
    bytes of packed 6-bit scales/mins, then the packed 4-bit codes.
    """

    # Number of bytes holding the eight 6-bit scales and eight 6-bit mins.
    K_SCALE_SIZE = 12
    QK_K = QK_K # Block size

    @classmethod
    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        """
        Quantizes a numpy array of floats into Q4_K format.
        Vectorized implementation inspired by the C++ reference code.

        The input is viewed as ``n_blocks`` rows of 8 sub-blocks of 32 values
        (this reshape assumes QK_K == 256 -- TODO confirm). For every
        sub-block a (scale, min) pair is searched, the pairs are quantized to
        6 bits against per-row ``d`` / ``dmin``, and the data is re-quantized
        to 0..15 with the resulting effective scale and min.

        Returns a uint8 array of shape (n_blocks, 2 + 2 + K_SCALE_SIZE + 128).

        Raises:
            ValueError: if the last dimension of ``blocks`` is not a multiple
                of ``QK_K``.
        """
        if blocks.shape[-1] % cls.QK_K != 0:
            raise ValueError(f"The last dimension of the input array must be a multiple of {cls.QK_K}, but got {blocks.shape[-1]}")

        n_blocks = blocks.size // cls.QK_K
        sub_blocks = blocks.reshape((n_blocks, 8, 32))

        # --- Vectorized make_qkx2_quants logic ---
        nmax = 15
        rmin = -1.0
        rdelta = 0.1
        nstep = 20

        # Calculate weights for all sub-blocks
        sum_x2 = np.sum(sub_blocks * sub_blocks, axis=-1, keepdims=True)
        # Use np.maximum to avoid sqrt of negative number due to float precision
        av_x = np.sqrt(np.maximum(0, sum_x2 / 32.0))
        weights = av_x + np.abs(sub_blocks)
        sum_w = np.sum(weights, axis=-1, keepdims=True)
        sum_x = np.sum(weights * sub_blocks, axis=-1, keepdims=True)

        # Initial guess for scales and mins
        min_v = np.min(sub_blocks, axis=-1, keepdims=True)
        max_v = np.max(sub_blocks, axis=-1, keepdims=True)
        # The min is never allowed to be positive
        min_v[min_v > 0] = 0.0

        max_minus_min = max_v - min_v

        # Handle cases where all values in a sub-block are the same
        is_flat = max_minus_min < 1e-8
        max_minus_min[is_flat] = 1.0 # Avoid division by zero

        with np.errstate(divide="ignore"):
            iscale = nmax / max_minus_min
        scale = 1.0 / iscale
        scale[is_flat] = 0.0

        # Error of the initial (plain min/max) quantization, used as the baseline
        l_current = np_roundf(iscale * (sub_blocks - min_v)).clip(0, nmax).astype(np.uint8)
        diff = scale * l_current + min_v - sub_blocks
        best_mse = np.sum(weights * (diff * diff), axis=-1)

        # NOTE(review): squeeze() returns views, so the in-place updates of
        # scale_best / min_best inside the loop also modify `scale` and
        # `min_v`. Later iterations therefore compute `sub_blocks - min_v`
        # with the best min found so far, while `max_minus_min` keeps its
        # initial value. Confirm this coupling is intended before refactoring.
        scale_best = scale.squeeze(-1)
        min_best = min_v.squeeze(-1)

        # Iterative search loop over all sub-blocks at once
        for is_ in range(nstep + 1):
            with np.errstate(divide="ignore"):
                current_iscale = (rmin + rdelta * is_ + nmax) / max_minus_min
            # Flat sub-blocks get all-zero codes, which makes D == 0 below and skips them
            current_iscale[is_flat] = 0.0

            l_aux = np_roundf(current_iscale * (sub_blocks - min_v)).clip(0, nmax).astype(np.uint8)

            # Weighted sums for the least-squares fit x ~= this_scale * l + this_min
            w_l = weights * l_aux
            sum_l = np.sum(w_l, axis=-1, keepdims=True)
            sum_l2 = np.sum(w_l * l_aux, axis=-1, keepdims=True)
            sum_xl = np.sum(w_l * sub_blocks, axis=-1, keepdims=True)

            D = sum_w * sum_l2 - sum_l * sum_l

            valid_D_mask = D > 0
            # np.divide with `where` leaves the zero-filled `out` untouched where D <= 0
            this_scale = np.divide((sum_w * sum_xl - sum_x * sum_l), D, out=np.zeros_like(D), where=valid_D_mask)
            this_min = np.divide((sum_l2 * sum_x - sum_l * sum_xl), D, out=np.zeros_like(D), where=valid_D_mask)

            # Handle case where candidate min > 0: force min to 0 and refit the scale alone
            min_gt_zero_mask = valid_D_mask & (this_min > 0)
            if np.any(min_gt_zero_mask):
                recalc_scale = np.divide(sum_xl, sum_l2, out=np.zeros_like(sum_xl), where=sum_l2 > 0)
                this_scale = np.where(min_gt_zero_mask, recalc_scale, this_scale)
                this_min = np.where(min_gt_zero_mask, 0.0, this_min)

            # Calculate current MSE
            diff = this_scale * l_aux + this_min - sub_blocks
            current_mse = np.sum(weights * (diff * diff), axis=-1)

            # Update best values where MSE has improved
            improvement_mask = valid_D_mask.squeeze(-1) & (current_mse < best_mse)
            if np.any(improvement_mask):
                best_mse[improvement_mask] = current_mse[improvement_mask]
                scale_best[improvement_mask] = this_scale.squeeze(-1)[improvement_mask]
                min_best[improvement_mask] = this_min.squeeze(-1)[improvement_mask]

        scales_all = scale_best
        # Mins are stored negated (min_best is <= 0 by construction)
        mins_all = -min_best
        # --- End of vectorized search ---

        # Find block-level d and dmin
        max_scale_per_block = np.max(scales_all, axis=1, keepdims=True)
        max_min_per_block = np.max(mins_all, axis=1, keepdims=True)

        # Quantize and pack scales and mins
        with np.errstate(divide="ignore", invalid="ignore"):
            inv_scale = np.where(max_scale_per_block == 0, 0, 63.0 / max_scale_per_block)
            inv_min = np.where(max_min_per_block == 0, 0, 63.0 / max_min_per_block)

        ls = np.clip(np_roundf(scales_all * inv_scale), 0, 63).astype(np.uint8)
        lm = np.clip(np_roundf(mins_all * inv_min), 0, 63).astype(np.uint8)

        # Bytes 0..3: scales 0..3 (low 6 bits) + top 2 bits of scales 4..7
        # Bytes 4..7: mins 0..3 (low 6 bits) + top 2 bits of mins 4..7
        # Bytes 8..11: low nibbles of scales 4..7 and of mins 4..7
        scales_packed = np.zeros((n_blocks, cls.K_SCALE_SIZE), dtype=np.uint8)
        scales_packed[:, 0:4] = ls[:, 0:4] & 0x3F
        scales_packed[:, 4:8] = lm[:, 0:4] & 0x3F
        scales_packed[:, 8:12] = (ls[:, 4:8] & 0x0F) | ((lm[:, 4:8] & 0x0F) << 4)
        scales_packed[:, 0:4] |= (ls[:, 4:8] >> 4) << 6
        scales_packed[:, 4:8] |= (lm[:, 4:8] >> 4) << 6

        # Store block-level d and dmin
        with np.errstate(divide="ignore", invalid="ignore"):
            d_val = np.where(max_scale_per_block == 0, 0, max_scale_per_block / 63.0)
            dmin_val = np.where(max_min_per_block == 0, 0, max_min_per_block / 63.0)

        d = d_val.reshape(n_blocks, 1).astype(np.float16).view(np.uint8)
        dmin = dmin_val.reshape(n_blocks, 1).astype(np.float16).view(np.uint8)

        # Re-quantize the actual data
        # (uses the unrounded d_val / dmin_val, not the float16 values stored above)
        d_eff = (d_val * ls.astype(np.float32)).reshape(n_blocks, 8, 1)
        m_eff = (dmin_val * lm.astype(np.float32)).reshape(n_blocks, 8, 1)

        with np.errstate(divide="ignore", invalid="ignore"):
            L_float = np.divide(sub_blocks + m_eff, d_eff, out=np.zeros_like(sub_blocks), where=d_eff != 0)

        L = np.clip(np_roundf(L_float), 0, 15).astype(np.uint8)

        # Pack the 4-bit quantized data: within each group of 64 values the
        # first 32 go to the low nibbles and the next 32 to the high nibbles
        L_reshaped = L.reshape((n_blocks, cls.QK_K // 64, 2, 32))
        L_low = L_reshaped[:, :, 0, :].reshape(n_blocks, -1)
        L_high = L_reshaped[:, :, 1, :].reshape(n_blocks, -1)
        qs = L_low | (L_high << 4)

        # Assemble and return the final block
        return np.concatenate([d, dmin, scales_packed, qs], axis=1)

    @staticmethod
    def get_scale_min(scales: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        """Unpack the packed scale bytes into 6-bit scales and mins.

        ``scales`` is reinterpreted as uint8 and reshaped to
        (n_blocks, K_SCALE_SIZE). Returns ``(sc, m)``, two uint8 arrays of
        shape (n_blocks, 8).
        """
        n_blocks = scales.shape[0]
        s = scales.view(np.uint8).reshape(n_blocks, Q4_K.K_SCALE_SIZE)

        sc = np.zeros((n_blocks, 8), dtype=np.uint8)
        m  = np.zeros((n_blocks, 8), dtype=np.uint8)

        # Entries 0..3 are the low 6 bits of bytes 0..3 (scales) and 4..7 (mins)
        sc[:, 0:4] = s[:, 0:4] & 0x3F
        m[:, 0:4]  = s[:, 4:8] & 0x3F

        # Entries 4..7 combine a nibble from bytes 8..11 with the top 2 bits of bytes 0..7
        sc[:, 4:8] = (s[:, 8:12] & 0x0F) | ((s[:, 0:4] >> 6) << 4)
        m[:, 4:8]  = (s[:, 8:12] >> 4)   | ((s[:, 4:8] >> 6) << 4)

        return sc, m

    @classmethod
    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        """Decode rows laid out as [d, dmin, scales, qs] to float32 of shape (n_blocks, QK_K)."""
        n_blocks = blocks.shape[0]

        d, rest = np.hsplit(blocks, [2])
        dmin, rest = np.hsplit(rest, [2])
        scales, qs = np.hsplit(rest, [cls.K_SCALE_SIZE])

        d = d.view(np.float16).astype(np.float32)
        dmin = dmin.view(np.float16).astype(np.float32)

        sc, m = cls.get_scale_min(scales)

        # Effective per-sub-block scale and (positive) min offset
        d_eff = (d * sc.astype(np.float32)).reshape((n_blocks, 8, 1))
        dm_eff = (dmin * m.astype(np.float32)).reshape((n_blocks, 8, 1))

        # Unpack 4-bit values and arrange back into sub-blocks:
        # low nibbles are the even sub-blocks, high nibbles the odd ones
        qs_reshaped = qs.reshape(n_blocks, QK_K // 64, 32)
        qs_unpacked = np.empty((n_blocks, 8, 32), dtype=np.float32)
        qs_unpacked[:, [0, 2, 4, 6], :] = (qs_reshaped & 0x0F)
        qs_unpacked[:, [1, 3, 5, 7], :] = (qs_reshaped >> 4)

        return (d_eff * qs_unpacked - dm_eff).reshape((n_blocks, QK_K))
 766 | 
 767 | 
class Q5_K(__Quant, qtype=GGMLQuantizationType.Q5_K):
    """Q5_K codec: 5-bit codes with 6-bit per-sub-block scales and mins.

    Row layout written by ``quantize_blocks`` and read by
    ``dequantize_blocks``: float16 ``d``, float16 ``dmin``,
    ``Q4_K.K_SCALE_SIZE`` bytes of packed scales/mins, ``qh`` (fifth bit of
    every code), then ``qs`` (low four bits, two codes per byte).
    """

    @classmethod
    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        """
        Quantizes a numpy array of floats into Q5_K format.
        Vectorized implementation of the C++ reference code.

        The input is viewed as ``n_blocks`` rows of 8 sub-blocks of 32 values
        (this reshape assumes QK_K == 256 -- TODO confirm). The (scale, min)
        search and the scale/min packing follow the same steps as
        ``Q4_K.quantize_blocks``, with codes in 0..31.

        Raises:
            ValueError: if the last dimension of ``blocks`` is not a multiple
                of ``QK_K``.
        """
        if blocks.shape[-1] % QK_K != 0:
            raise ValueError(f"The last dimension of the input array must be a multiple of {QK_K}, but got {blocks.shape[-1]}")

        n_blocks = blocks.size // QK_K
        sub_blocks = blocks.reshape((n_blocks, 8, 32))

        # --- Vectorized make_qkx3_quants logic for 5 bits ---
        nmax = 31
        nstep = 36
        rmin = -0.9
        rdelta = 0.05

        # Calculate weights for all sub-blocks
        sum_x2 = np.sum(sub_blocks * sub_blocks, axis=-1, keepdims=True)
        # NOTE(review): sum_x2 is summed per 32-value sub-block here but is
        # divided by QK_K; confirm this matches the intended sigma definition.
        av_x = np.sqrt(np.maximum(0, 2 * sum_x2 / QK_K)) # sigma calculation from C++
        weights = av_x + np.abs(sub_blocks)
        sum_w = np.sum(weights, axis=-1, keepdims=True)
        sum_x = np.sum(weights * sub_blocks, axis=-1, keepdims=True)

        min_v = np.min(sub_blocks, axis=-1, keepdims=True)
        max_v = np.max(sub_blocks, axis=-1, keepdims=True)
        # The min is never allowed to be positive
        min_v[min_v > 0] = 0.0

        max_minus_min = max_v - min_v
        # Near-constant sub-blocks: use a dummy range of 1 to avoid dividing by zero
        is_flat = max_minus_min < 1e-8
        max_minus_min[is_flat] = 1.0

        # Initial mse for comparison
        with np.errstate(divide="ignore"):
            iscale_initial = nmax / max_minus_min
        scale_initial = 1.0 / iscale_initial
        scale_initial[is_flat] = 0.0
        l_initial = np_roundf(iscale_initial * (sub_blocks - min_v)).clip(0, nmax).astype(np.uint8)
        diff = scale_initial * l_initial + min_v - sub_blocks
        best_mse = np.sum(weights * (diff * diff), axis=-1)

        # NOTE(review): squeeze() returns views, so the in-place updates of
        # scale_best / min_best inside the loop also modify `scale_initial`
        # and `min_v`. Later iterations therefore compute
        # `sub_blocks - min_v` with the best min found so far, while
        # `max_minus_min` keeps its initial value. Confirm this coupling is
        # intended before refactoring.
        scale_best = scale_initial.squeeze(-1)
        min_best = min_v.squeeze(-1)

        # Iterative search
        for is_ in range(nstep + 1):
            with np.errstate(divide="ignore"):
                current_iscale = (rmin + rdelta * is_ + nmax) / max_minus_min
            # Flat sub-blocks get all-zero codes, which makes D == 0 below and skips them
            current_iscale[is_flat] = 0.0

            l_aux = np_roundf(current_iscale * (sub_blocks - min_v)).clip(0, nmax).astype(np.uint8)
            # Weighted sums for the least-squares fit x ~= this_scale * l + this_min
            w_l = weights * l_aux
            sum_l = np.sum(w_l, axis=-1, keepdims=True)
            sum_l2 = np.sum(w_l * l_aux, axis=-1, keepdims=True)
            sum_xl = np.sum(w_l * sub_blocks, axis=-1, keepdims=True)

            D = sum_w * sum_l2 - sum_l * sum_l
            valid_D_mask = D > 0
            # np.divide with `where` leaves the zero-filled `out` untouched where D <= 0
            this_scale = np.divide((sum_w * sum_xl - sum_x * sum_l), D, out=np.zeros_like(D), where=valid_D_mask)
            this_min = np.divide((sum_l2 * sum_x - sum_l * sum_xl), D, out=np.zeros_like(D), where=valid_D_mask)

            # A positive candidate min is forced to 0 and the scale is refitted alone
            min_gt_zero_mask = valid_D_mask & (this_min > 0)
            if np.any(min_gt_zero_mask):
                recalc_scale = np.divide(sum_xl, sum_l2, out=np.zeros_like(sum_xl), where=sum_l2 > 0)
                this_scale = np.where(min_gt_zero_mask, recalc_scale, this_scale)
                this_min = np.where(min_gt_zero_mask, 0.0, this_min)

            # Keep the candidate wherever it lowers the weighted squared error
            diff = this_scale * l_aux + this_min - sub_blocks
            current_mse = np.sum(weights * (diff * diff), axis=-1)
            improvement_mask = valid_D_mask.squeeze(-1) & (current_mse < best_mse)
            if np.any(improvement_mask):
                best_mse[improvement_mask] = current_mse[improvement_mask]
                scale_best[improvement_mask] = this_scale.squeeze(-1)[improvement_mask]
                min_best[improvement_mask] = this_min.squeeze(-1)[improvement_mask]

        scales_all = scale_best
        # Mins are stored negated (min_best is <= 0 by construction)
        mins_all = -min_best

        # --- Quantize and pack scales/mins (identical to Q4_K) ---
        max_scale_per_block = np.max(scales_all, axis=1, keepdims=True)
        max_min_per_block = np.max(mins_all, axis=1, keepdims=True)
        with np.errstate(divide="ignore", invalid="ignore"):
            inv_scale = np.where(max_scale_per_block == 0, 0, 63.0 / max_scale_per_block)
            inv_min = np.where(max_min_per_block == 0, 0, 63.0 / max_min_per_block)
        ls = np.clip(np_roundf(scales_all * inv_scale), 0, 63).astype(np.uint8)
        lm = np.clip(np_roundf(mins_all * inv_min), 0, 63).astype(np.uint8)

        # Bytes 0..3: scales 0..3 (low 6 bits) + top 2 bits of scales 4..7
        # Bytes 4..7: mins 0..3 (low 6 bits) + top 2 bits of mins 4..7
        # Bytes 8..11: low nibbles of scales 4..7 and of mins 4..7
        scales_packed = np.zeros((n_blocks, Q4_K.K_SCALE_SIZE), dtype=np.uint8)
        scales_packed[:, 0:4] = ls[:, 0:4] & 0x3F
        scales_packed[:, 4:8] = lm[:, 0:4] & 0x3F
        scales_packed[:, 8:12] = (ls[:, 4:8] & 0x0F) | ((lm[:, 4:8] & 0x0F) << 4)
        scales_packed[:, 0:4] |= (ls[:, 4:8] >> 4) << 6
        scales_packed[:, 4:8] |= (lm[:, 4:8] >> 4) << 6

        # --- Store block-level d and dmin (identical to Q4_K) ---
        with np.errstate(divide="ignore", invalid="ignore"):
            d_val = np.where(max_scale_per_block == 0, 0, max_scale_per_block / 63.0)
            dmin_val = np.where(max_min_per_block == 0, 0, max_min_per_block / 63.0)
        d = d_val.reshape(n_blocks, 1).astype(np.float16).view(np.uint8)
        dmin = dmin_val.reshape(n_blocks, 1).astype(np.float16).view(np.uint8)

        # --- Re-quantize the actual data to 5 bits ---
        # (uses the unrounded d_val / dmin_val, not the float16 values stored above)
        d_eff = (d_val * ls.astype(np.float32)).reshape(n_blocks, 8, 1)
        m_eff = (dmin_val * lm.astype(np.float32)).reshape(n_blocks, 8, 1)
        with np.errstate(divide="ignore", invalid="ignore"):
            L_float = np.divide(sub_blocks + m_eff, d_eff, out=np.zeros_like(sub_blocks), where=d_eff != 0)
        L = np.clip(np_roundf(L_float), 0, 31).astype(np.uint8)

        # --- Pack the 5-bit quantized data into qh and qs ---
        # qh (high bits): byte j, bit k is the fifth bit of value k*32 + j
        h = (L > 15).astype(np.uint8)
        h_reshaped = h.reshape(n_blocks, 8, 32).transpose(0, 2, 1)
        bit_shifts = 2**np.arange(8, dtype=np.uint8).reshape(1, 1, 8)
        qh = np.sum(h_reshaped * bit_shifts, axis=-1).astype(np.uint8)

        # qs (low bits): even sub-blocks in the low nibbles, odd ones in the high nibbles
        L[L > 15] -= 16
        l_reshaped = L.reshape(n_blocks, 8, 32)
        part1 = l_reshaped[:, ::2, :].reshape(n_blocks, -1)
        part2 = l_reshaped[:, 1::2, :].reshape(n_blocks, -1)
        qs = part1 | (part2 << 4)

        return np.concatenate([d, dmin, scales_packed, qh, qs], axis=1)

    @classmethod
    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        """Decode rows laid out as [d, dmin, scales, qh, qs] to float32 of shape (n_blocks, QK_K)."""
        n_blocks = blocks.shape[0]

        d, rest = np.hsplit(blocks, [2])
        dmin, rest = np.hsplit(rest, [2])
        scales, rest = np.hsplit(rest, [Q4_K.K_SCALE_SIZE])
        qh, qs = np.hsplit(rest, [QK_K // 8])

        d = d.view(np.float16).astype(np.float32)
        dmin = dmin.view(np.float16).astype(np.float32)

        # Scales and mins share the Q4_K packing
        sc, m = Q4_K.get_scale_min(scales)

        d_eff = (d * sc.astype(np.float32)).reshape((n_blocks, -1, 1))
        dm_eff = (dmin * m.astype(np.float32)).reshape((n_blocks, -1, 1))

        # Unpack high bits (qh): one boolean per value, regrouped into rows of 32
        bit_shifts = 2**np.arange(8, dtype=np.uint8).reshape(1, 1, 8)
        qh_unpacked = (qh[:, :, np.newaxis] & bit_shifts) != 0
        qh_unpacked = qh_unpacked.transpose(0, 2, 1).reshape(n_blocks, -1, 32)

        # Unpack low bits (qs): low nibbles fill the even sub-blocks, high nibbles the odd ones
        ql_unpacked = np.empty((n_blocks, 8, 32), dtype=np.uint8)
        qs_reshaped = qs.reshape(n_blocks, 4, 32)
        ql_unpacked[:, ::2, :] = qs_reshaped & 0x0F
        ql_unpacked[:, 1::2, :] = qs_reshaped >> 4

        # Combine high and low bits and dequantize
        q = (ql_unpacked + (qh_unpacked * 16)).astype(np.float32)
        return (d_eff * q - dm_eff).reshape((n_blocks, QK_K))
 925 | 
 926 | 
 927 | class Q6_K(__Quant, qtype=GGMLQuantizationType.Q6_K):
 928 |     @classmethod
 929 |     def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
 930 |         """
 931 |         Quantizes a numpy array of floats into Q6_K format.
 932 |         Vectorized implementation of the C++ reference code.
 933 |         """
 934 |         n_blocks = blocks.shape[0]
 935 |         # Reshape for sub-block processing
 936 |         sub_blocks = blocks.reshape(n_blocks * 16, 16)
 937 | 
 938 |         # --- Vectorized `make_qx_quants` for all sub-blocks to find initial scales ---
 939 |         nmax_data = 32  # For Q6_K, data range is [-32, 31]
 940 | 
 941 |         # Weights are x*x for the reference implementation
 942 |         weights_data = sub_blocks * sub_blocks
 943 | 
 944 |         # Find max absolute values for each sub-block to determine the initial scale
 945 |         abs_sub_blocks = np.abs(sub_blocks)
 946 |         max_indices = np.argmax(abs_sub_blocks, axis=-1, keepdims=True)
 947 |         max_vals = np.take_along_axis(sub_blocks, max_indices, axis=-1)
 948 | 
 949 |         is_zero_mask = np.abs(max_vals) < 1e-15
 950 | 
 951 |         with np.errstate(divide="ignore", invalid="ignore"):
 952 |             initial_iscale = np.where(is_zero_mask, 0, -nmax_data / max_vals)
 953 | 
 954 |         # Use np.round for round-half-to-even, matching C's nearest_int
 955 |         l = np.round(sub_blocks * initial_iscale).clip(-nmax_data, nmax_data - 1)
 956 |         sumlx = np.sum(weights_data * sub_blocks * l, axis=-1)
 957 |         suml2 = np.sum(weights_data * l * l, axis=-1)
 958 | 
 959 |         with np.errstate(divide="ignore", invalid="ignore"):
 960 |             scales_cand = np.divide(sumlx, suml2, out=np.zeros_like(sumlx), where=suml2 != 0)
 961 |         best_scores = scales_cand * sumlx
 962 |         best_l = l.copy()
 963 | 
 964 |         # Iterative search over potential iscale adjustments
 965 |         for is_ in range(-9, 10):
 966 |             if is_ == 0: continue
 967 |             with np.errstate(divide="ignore", invalid="ignore"):
 968 |                 iscale_try = np.where(is_zero_mask, 0, -(nmax_data + 0.1 * is_) / max_vals)
 969 | 
 970 |             l_try = np.round(sub_blocks * iscale_try).clip(-nmax_data, nmax_data - 1)
 971 |             sumlx_try = np.sum(weights_data * sub_blocks * l_try, axis=-1)
 972 |             suml2_try = np.sum(weights_data * l_try * l_try, axis=-1)
 973 | 
 974 |             improvement_mask = (suml2_try > 0) & (sumlx_try * sumlx_try * suml2 > best_scores * suml2_try)
 975 |             if np.any(improvement_mask):
 976 |                 with np.errstate(divide="ignore", invalid="ignore"):
 977 |                     new_best_scores = np.divide(sumlx_try * sumlx_try, suml2_try, where=suml2_try > 0)
 978 |                 best_scores[improvement_mask] = new_best_scores[improvement_mask]
 979 |                 best_l[improvement_mask] = l_try[improvement_mask]
 980 |                 suml2[improvement_mask] = suml2_try[improvement_mask]
 981 | 
 982 |         # Recompute final best scales from the best quants (best_l)
 983 |         sumlx_final = np.sum(weights_data * sub_blocks * best_l, axis=-1)
 984 |         suml2_final = np.sum(weights_data * best_l * best_l, axis=-1)
 985 |         with np.errstate(divide="ignore", invalid="ignore"):
 986 |             scales = np.divide(sumlx_final, suml2_final, out=np.zeros_like(sumlx_final), where=suml2_final != 0)
 987 | 
 988 |         scales[np.all(sub_blocks == 0, axis=-1)] = 0.0
 989 |         scales = scales.reshape(n_blocks, 16)
 990 | 
 991 |         # --- Quantize the scales themselves ---
 992 |         abs_scales = np.abs(scales)
 993 |         max_abs_scale_indices = np.argmax(abs_scales, axis=-1, keepdims=True)
 994 |         max_scale_vals = np.take_along_axis(scales, max_abs_scale_indices, axis=-1)
 995 | 
 996 |         with np.errstate(divide="ignore", invalid="ignore"):
 997 |             is_zero_mask = np.abs(max_scale_vals) < 1e-15
 998 |             iscale_s = np.where(is_zero_mask, 0, -128.0 / max_scale_vals)
 999 |             d_val = np.where(is_zero_mask, 0, max_scale_vals / -128.0)
1000 | 
1001 |         quantized_scales = np.round(scales * iscale_s).clip(-128, 127).astype(np.int8)
1002 |         d = d_val.astype(np.float16).view(np.uint8)
1003 | 
1004 |         # --- Re-quantize original data with final scales ---
1005 |         d_sub = d_val * quantized_scales.astype(np.float32)
1006 |         d_sub_reshaped = d_sub.reshape(n_blocks, 16, 1)
1007 | 
1008 |         sub_blocks_reshaped = blocks.reshape(n_blocks, 16, 16)
1009 |         with np.errstate(divide="ignore", invalid="ignore"):
1010 |             l_float = np.divide(sub_blocks_reshaped, d_sub_reshaped, out=np.zeros_like(sub_blocks_reshaped), where=d_sub_reshaped != 0)
1011 | 
1012 |         l_final = np.round(l_float).clip(-32, 31).astype(np.int8)
1013 |         L = (l_final + 32).astype(np.uint8).reshape(n_blocks, 256)
1014 | 
1015 |         # --- Pack the 6-bit quantized data ---
1016 |         L_reshaped = L.reshape(n_blocks, 2, 4, 32)
1017 |         L_low = L_reshaped & 0xF
1018 |         L_high = L_reshaped >> 4
1019 | 
1020 |         # Pack lower 4 bits into ql
1021 |         ql = np.empty((n_blocks, 128), dtype=np.uint8)
1022 |         ql[:, 0:32]   = L_low[:, 0, 0, :] | (L_low[:, 0, 2, :] << 4)
1023 |         ql[:, 32:64]  = L_low[:, 0, 1, :] | (L_low[:, 0, 3, :] << 4)
1024 |         ql[:, 64:96]  = L_low[:, 1, 0, :] | (L_low[:, 1, 2, :] << 4)
1025 |         ql[:, 96:128] = L_low[:, 1, 1, :] | (L_low[:, 1, 3, :] << 4)
1026 | 
1027 |         # Pack higher 2 bits into qh
1028 |         qh_packed = L_high[:, :, 0, :] | (L_high[:, :, 1, :] << 2) | (L_high[:, :, 2, :] << 4) | (L_high[:, :, 3, :] << 6)
1029 |         qh = qh_packed.reshape(n_blocks, -1)
1030 | 
1031 |         # Final assembly: view scales as uint8 before concatenating
1032 |         return np.concatenate([ql, qh, quantized_scales.view(np.uint8), d], axis=1)
1033 | 
1034 |     @classmethod
1035 |     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
1036 |         n_blocks = blocks.shape[0]
1037 | 
1038 |         ql, rest = np.hsplit(blocks, [QK_K // 2])
1039 |         qh, rest = np.hsplit(rest, [QK_K // 4])
1040 |         scales, d = np.hsplit(rest, [QK_K // 16])
1041 | 
1042 |         scales = scales.view(np.int8).astype(np.float32)
1043 |         d = d.view(np.float16).astype(np.float32)
1044 |         d = (d * scales).reshape((n_blocks, QK_K // 16, 1))
1045 | 
1046 |         ql = ql.reshape((n_blocks, -1, 1, 64)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
1047 |         ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1, 32))
1048 |         qh = qh.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1))
1049 |         qh = (qh & np.uint8(0x03)).reshape((n_blocks, -1, 32))
1050 |         q = (ql | (qh << np.uint8(4))).astype(np.int8) - np.int8(32)
1051 |         q = q.reshape((n_blocks, QK_K // 16, -1)).astype(np.float32)
1052 | 
1053 |         return (d * q).reshape((n_blocks, QK_K))
1054 | 
1055 | 
class TQ1_0(__Quant, qtype=GGMLQuantizationType.TQ1_0):
    @classmethod
    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        """Pack float rows into TQ1_0 blocks (base-3 digits, 5 or 4 per byte).

        Every row is divided by its largest absolute value, rounded with
        ``np_roundf`` and shifted by +1 so each element becomes a digit in
        {0, 1, 2}. The first 160 digits and the next 80 are packed five per
        byte, the remaining ones four per byte; the row maximum is appended
        as float16.
        """
        row_count = blocks.shape[0]

        peak = abs(blocks).max(axis=-1, keepdims=True)
        with np.errstate(divide="ignore"):
            # All-zero rows get an inverse scale of 0 instead of inf.
            inv_peak = np.where(peak == 0, 0, 1 / peak)
        digits = np_roundf(blocks * inv_peak)
        digits = (digits.astype(np.int8) + np.int8(1)).astype(np.uint8)

        # Positional weights, most significant digit first.
        weights5 = np.array([81, 27, 9, 3, 1], dtype=np.uint8).reshape((1, 1, 5, 1))
        weights4 = np.array([81, 27, 9, 3], dtype=np.uint8).reshape((1, 1, 4, 1))

        wide = digits[..., :(32 * 5)].reshape((row_count, -1, 5, 32)) * weights5
        narrow = digits[..., (32 * 5):(48 * 5)].reshape((row_count, -1, 5, 16)) * weights5
        tail = digits[..., (48 * 5):].reshape((row_count, -1, 4, 4)) * weights4

        packed = np.concatenate(
            [np.sum(part, axis=-2).reshape((row_count, -1)) for part in (wide, narrow, tail)],
            axis=-1,
        )
        # Rescale from [0, 243) to the full byte range, rounding up.
        packed = (packed.astype(np.uint16) * 256 + (243 - 1)) // 243

        packed = packed.astype(np.uint8)
        scale_bytes = peak.astype(np.float16).view(np.uint8)

        return np.concatenate([packed, scale_bytes], axis=-1)

    @classmethod
    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        """Decode TQ1_0 blocks into float32 values in {-d, 0, +d} per row."""
        row_count = blocks.shape[0]

        packed, remainder = np.hsplit(blocks, [(QK_K - 4 * QK_K // 64) // 5])
        packed_tail, scale_bytes = np.hsplit(remainder, [QK_K // 64])

        scale = scale_bytes.view(np.float16).astype(np.float32)

        # Multiplying by a power of 3 in uint8 (wrapping) moves the wanted
        # digit to the top of the byte; "* 3 >> 8" below then extracts it.
        pow3 = np.array([1, 3, 9, 27, 81], dtype=np.uint8)
        wide = packed[..., :32].reshape((row_count, -1, 1, 32)) * pow3.reshape((1, 1, 5, 1))
        narrow = packed[..., 32:].reshape((row_count, -1, 1, 16)) * pow3.reshape((1, 1, 5, 1))
        tail = packed_tail.reshape((row_count, -1, 1, 4)) * pow3[:4].reshape((1, 1, 4, 1))

        digits = np.concatenate(
            [part.reshape((row_count, -1)) for part in (wide, narrow, tail)],
            axis=-1,
        )
        digits = ((digits.astype(np.uint16) * 3) >> 8).astype(np.int8) - np.int8(1)

        return (scale * digits.astype(np.float32))
1102 | 
1103 | 
class TQ2_0(__Quant, qtype=GGMLQuantizationType.TQ2_0):
    @classmethod
    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        """Pack float rows into TQ2_0 blocks (four 2-bit codes per byte).

        Every row is divided by its largest absolute value, rounded with
        ``np_roundf`` and shifted by +1, giving codes in {0, 1, 2}. Codes are
        taken in groups of 4 x 32 and the four members of a column share one
        byte; the row maximum is appended as float16.
        """
        row_count = blocks.shape[0]

        peak = abs(blocks).max(axis=-1, keepdims=True)
        with np.errstate(divide="ignore"):
            # All-zero rows get an inverse scale of 0 instead of inf.
            inv_peak = np.where(peak == 0, 0, 1 / peak)
        codes = np_roundf(blocks * inv_peak)
        codes = (codes.astype(np.int8) + np.int8(1)).astype(np.uint8)

        bit_offsets = np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1))
        shifted = codes.reshape((row_count, -1, 4, 32)) << bit_offsets
        # OR the four shifted lanes together into one byte per column.
        packed = np.bitwise_or.reduce(shifted, axis=-2).reshape((row_count, -1))

        scale_bytes = peak.astype(np.float16).view(np.uint8)

        return np.concatenate([packed, scale_bytes], axis=-1)

    @classmethod
    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        """Decode TQ2_0 blocks into float32 values (2-bit code minus 1, times d)."""
        row_count = blocks.shape[0]

        packed, scale_bytes = np.hsplit(blocks, [QK_K // 4])

        scale = scale_bytes.view(np.float16).astype(np.float32)

        bit_offsets = np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1))
        codes = packed.reshape((row_count, -1, 1, 32)) >> bit_offsets
        codes = (codes & 0x03).reshape((row_count, -1)).astype(np.int8) - np.int8(1)

        return (scale * codes.astype(np.float32))
1135 | 
1136 | 
class IQ2_XXS(__Quant, qtype=GGMLQuantizationType.IQ2_XXS):
    # 128-entry sign table: a 7-bit sign index selects one byte, and bit i of
    # that byte says whether element i of an 8-value group is negated.
    # Also read by other IQ classes through IQ2_XXS.ksigns.
    ksigns: bytes = (
        b"\x00\x81\x82\x03\x84\x05\x06\x87\x88\x09\x0a\x8b\x0c\x8d\x8e\x0f"
        b"\x90\x11\x12\x93\x14\x95\x96\x17\x18\x99\x9a\x1b\x9c\x1d\x1e\x9f"
        b"\xa0\x21\x22\xa3\x24\xa5\xa6\x27\x28\xa9\xaa\x2b\xac\x2d\x2e\xaf"
        b"\x30\xb1\xb2\x33\xb4\x35\x36\xb7\xb8\x39\x3a\xbb\x3c\xbd\xbe\x3f"
        b"\xc0\x41\x42\xc3\x44\xc5\xc6\x47\x48\xc9\xca\x4b\xcc\x4d\x4e\xcf"
        b"\x50\xd1\xd2\x53\xd4\x55\x56\xd7\xd8\x59\x5a\xdb\x5c\xdd\xde\x5f"
        b"\x60\xe1\xe2\x63\xe4\x65\x66\xe7\xe8\x69\x6a\xeb\x6c\xed\xee\x6f"
        b"\xf0\x71\x72\xf3\x74\xf5\xf6\x77\x78\xf9\xfa\x7b\xfc\x7d\x7e\xff"
    )

    # iq2xxs_grid, but with each byte of the original packed in 2 bits,
    # by mapping 0x08 to 0, 0x19 to 1, and 0x2b to 2.
    grid_shape = (256, 8)
    grid_map = (0x08, 0x19, 0x2b)
    grid_hex = (
        b"00000200050008000a00110014002000220028002a0041004400500058006100"
        b"6400800082008a00a20001010401100115014001840198010002020222028202"
        b"010404041004210424044004420448046004810484049004a404000502050805"
        b"200546056905800591050906100640068406a406000805080808140828084108"
        b"440850085208880804094009020a140a01100410101021104010601084109010"
        b"951000110811201150115a118011241245120014081420142514491480141815"
        b"6215001616160118041810184018811800190519a019511a002002200a204420"
        b"6120802082202921482100220222012404241024402456240025412564259026"
        b"082820289428442a014004401040184021402440404048405640604081408440"
        b"9040004120416141804185410142104248425642684200440844204480449944"
        b"124524450046014804481048404845480049584961498249454a904a00500850"
        b"1150195020508050885004514251a4519152905492540a550156545600581158"
        b"195864584059085a046010604060686000615561186260620064056410651265"
        b"84654268008002800a8041808280048118814081118201840484108415844084"
        b"608400854685948509864086608602880489118a0490109024904090a1901691"
        b"8091459200942294449451958198209902a050a085a009a100a218a450a804a9"
    )

    @classmethod
    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        """Decode packed IQ2_XXS blocks back to float32 values.

        A row is a 2-byte float16 scale followed by pairs of uint32 words.
        The first word of a pair holds four 8-bit grid indices; the second
        holds four 7-bit sign indices in bits 0-27 and a 4-bit sub-scale in
        bits 28-31. Requires ``cls.grid`` to be set (it is not assigned in
        this class body).
        """
        row_count = blocks.shape[0]

        scale_bytes, payload = np.hsplit(blocks, [2])

        d = scale_bytes.view(np.float16).astype(np.float32)

        words = payload.view(np.uint32).reshape(row_count, -1, 2)
        meta_word = words[..., 1]

        # Per-group scale: d * (0.5 + 4-bit sub-scale) * 0.25
        db = d * (np.float32(0.5) + (meta_word >> 28).astype(np.float32)) * np.float32(0.25)
        db = db.reshape((row_count, -1, 1, 1))

        # get the sign indices and unpack the bits
        sign_idx = meta_word.reshape((row_count, -1, 1)) >> np.array([0, 7, 14, 21], dtype=np.uint32).reshape((1, 1, 4))
        sign_table = np.frombuffer(cls.ksigns, dtype=np.uint8).reshape((1, 1, 1, 128))
        sign_idx = (sign_idx & np.uint32(0x7F)).reshape((row_count, -1, 4, 1))
        sign_bytes = np.take_along_axis(sign_table, sign_idx, axis=-1)
        sign_bits = sign_bytes.reshape((row_count, -1, 4, 1)) >> np.arange(8, dtype=np.uint8).reshape((1, 1, 1, 8))
        sign_bits = sign_bits & np.uint8(0x01)
        # A set bit negates the corresponding value.
        sign_factors = np.where(sign_bits == 0, np.float32(1), np.float32(-1))
        sign_factors = sign_factors.reshape((row_count, -1, 4, 8))

        assert cls.grid is not None
        # The four bytes of the first word are the grid row indices.
        grid_idx = words[..., 0].copy().view(np.uint8).reshape((row_count, -1, 1, 1))
        grid_rows = np.take_along_axis(cls.grid, grid_idx, axis=-2)
        grid_rows = grid_rows.reshape((row_count, -1, 4, 8))

        return (db * grid_rows * sign_factors).reshape((row_count, -1))
1200 | 
1201 | 
class IQ2_XS(__Quant, qtype=GGMLQuantizationType.IQ2_XS):
    # iq2xs_grid, but with each byte of the original packed in 2 bits,
    # by mapping 0x08 to 0, 0x19 to 1, and 0x2b to 2.
    grid_shape = (512, 8)
    grid_map = (0x08, 0x19, 0x2b)
    grid_hex = (
        b"00000200050008000a0011001400160019002000220025002800410044004600"
        b"49005000520055005800610064008000820085008800910094009900a0000101"
        b"04010601090110011201150118011a0121012401400142014501480151015401"
        b"6001680181018401900100020202050208021102140220024102440250025502"
        b"80028a0201040404060409041004120415041804210424044004420445044804"
        b"5104540456046004810484049004000502050505080511051405200541054405"
        b"500561058005010604061006260640064206840600080208050808080a081108"
        b"14082008250841084408500858088008a008aa08010904091009400981098909"
        b"000a200a280a960aa00a01100410061009101010121015101810211024104010"
        b"4210451048105110541060106a10811084109010001102110511081111111411"
        b"2011411144115011801194119611011204120612101240126012001402140514"
        b"0814111414142014411444144914501464148014011504151015401500161416"
        b"49160118041810181218401854188618001905196619511aa91a002002200520"
        b"08200a201120142020204120442050208020a020012104211021402148216521"
        b"002222228022a82201240424102429244024002541255225992501261a26a626"
        b"002808280a28202855288828a22868299029082a202a822a882a8a2a01400440"
        b"0640094010401240154018402140244040404240454048404a40514054406040"
        b"6540814084409040004102410541084111411441204141414441504180418541"
        b"a241014204421042124229424042004402440544084411441444194420444144"
        b"4444504480449444014504451045244540459a4500460a464446504601480448"
        b"1048404845485448624800491149444950496949044a00500250055008501150"
        b"145020502850415044505050805001510451105115514051425100524452aa52"
        b"0154045410542154405460548154a154005508558055885521566856a1560058"
        b"14584158505899581a5940594259855a0160046010604060546062608660a960"
        b"006124624a62926200641664106540654565a46501686a682569066a546a626a"
        b"00800280058008801180148020802a8041804480508080808280a880aa800181"
        b"0481068110814081518159810082208280828282a082a8820184048410841284"
        b"158440846084898400854485a58518866a860088088825885a8880888288a888"
        b"0689228a808a888a968aa88a0190049010904090569084900091229164915692"
        b"89920094059444945094589429959095929541965198a6984999159a609a00a0"
        b"02a008a00aa020a02aa0a0a051a159a1a6a100a202a208a22aa280a2a0a240a4"
        b"95a465a698a60aa820a822a828a8a0a8a8a804a984a986a928aa2aaa91aaaaaa"
    )

    @classmethod
    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        """Decode packed IQ2_XS blocks back to float32 values.

        A row is a 2-byte float16 scale, ``2 * QK_K // 8`` bytes read as
        uint16 codes (low 9 bits: grid index, high 7 bits: sign index into
        ``IQ2_XXS.ksigns``), then bytes carrying two 4-bit sub-scales each.
        Requires ``cls.grid`` to be set (it is not assigned in this class body).
        """
        row_count = blocks.shape[0]

        scale_bytes, payload = np.hsplit(blocks, [2])
        code_bytes, sub_scales = np.hsplit(payload, [2 * QK_K // 8])

        d = scale_bytes.view(np.float16).astype(np.float32)
        codes = code_bytes.view(np.uint16)

        # Split every scale byte into its low and high nibble.
        sub_scales = sub_scales.reshape((row_count, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2))
        sub_scales = (sub_scales & 0x0F).reshape((row_count, -1))
        db = d * (np.float32(0.5) + sub_scales) * np.float32(0.25)
        db = db.reshape((row_count, -1, 1, 1))

        # get the sign indices and unpack the bits
        sign_table = np.frombuffer(IQ2_XXS.ksigns, dtype=np.uint8).reshape(1, 1, 128)
        sign_bytes = np.take_along_axis(sign_table, (codes >> 9).reshape((row_count, -1, 1)), axis=-1)
        sign_bits = sign_bytes.reshape((row_count, -1, 1)) >> np.arange(8, dtype=np.uint8).reshape((1, 1, 8))
        sign_bits = sign_bits & np.uint8(0x01)
        # A set bit negates the corresponding value.
        sign_factors = np.where(sign_bits == 0, np.float32(1), np.float32(-1))
        sign_factors = sign_factors.reshape((row_count, -1, 2, 8))

        assert cls.grid is not None
        grid_idx = (codes & np.uint16(511)).reshape((row_count, -1, 1, 1))
        grid_rows = np.take_along_axis(cls.grid, grid_idx, axis=-2)
        grid_rows = grid_rows.reshape((row_count, -1, 2, 8))

        return (db * grid_rows * sign_factors).reshape((row_count, -1))
1270 | 
1271 | 
class IQ2_S(__Quant, qtype=GGMLQuantizationType.IQ2_S):
    # iq2s_grid, but with each byte of the original packed in 2 bits,
    # by mapping 0x08 to 0, 0x19 to 1, and 0x2b to 2.
    grid_shape = (1024, 8)
    grid_map = (0x08, 0x19, 0x2b)
    grid_hex = (
        b"00000200050008000a0011001400160019002000220025002800410044004600"
        b"490050005200550058006100640066006900800082008500880091009400a000"
        b"a500aa0001010401060109011001120115011801210124014001420145014801"
        b"510154015601590160016501680181018401900192019501a101a40100020202"
        b"050208021102140220022a02410244024602490250025502800285028a029402"
        b"a202010404040604090410041204150418042104240426042904400442044504"
        b"48044a0451045404560459046004620465048104840486048904900495049804"
        b"a104a40400050205050508050a05110514051605190520052505280541054405"
        b"46054905500552055505580561056405800582058505880591059405a0050106"
        b"0406060609061006150640064506480651065406600681068406900600080208"
        b"050808081108140816081908200825082a084108440846084908500852085508"
        b"580861086408800885089408aa08010904091009120915091809210940094509"
        b"480951095409600981099009000a110a140a220a280a2a0a500a990a01100410"
        b"0610091010101210151018102110241026104010421045104810511054105610"
        b"59106010621065106810811084108610901095109810a110a410001102110511"
        b"08110a1111111411161119112011221125112811411144114611491150115211"
        b"5511581161116411801182118511881191119411011204120912101215122112"
        b"2412401245125112541281128412901200140214051408141114141416141914"
        b"2014251428144114441446144914501452145514581461146414801482148514"
        b"881491149414a014011504150615091510151215151518152115241540154215"
        b"4515481551155415601581158415901500160516081611161416201641164416"
        b"50168016aa160118041806180918101815181818211840184218451848185118"
        b"541860188118841800190219051908191119141920194119441950196919a219"
        b"041a101a401a561a00200220052008201120142016201920202025202a204120"
        b"4420502052205520642080208a209420aa200121042110211221152121214021"
        b"4221452151215421602181218421902100220a22222228222a22442250228822"
        b"8a22a82201240424062409241024152418242124242440244224452448245124"
        b"5424602481248424902400250525082511251425202541254425502566258025"
        b"0126042610264026592600280528112814284128442850288a28aa2801290429"
        b"102995290a2a222a642a882a8a2a014004400640094010401240154018401a40"
        b"21402440264040404240454048404a4051405440564059406040624065408140"
        b"8440904095409840a140a4400041024105410841114114411641194120412241"
        b"2541414144414641494150415241554158416141644180418241854188419141"
        b"9441a04101420442104212421542184224424042454248425142544260428142"
        b"844200440244054408440a441144144416441944204422442544284441444444"
        b"46444944504452445544584461446444804482448544884491449444a0440145"
        b"0445064509451045124515451845214524454045424545454845514554456045"
        b"6a4581458445904500460246054608461146144620464146444650468046a546"
        b"0148044809481048124815481848214824484048424845484848514854486048"
        b"84489048004902490549084911491449204941494449504980499649014a044a"
        b"104a404a00500250055008501150145016501950205022502550285041504450"
        b"4650495050505250555058506150645080508250855088509150945001510451"
        b"0651095110511251155118512151245140514251455148515151545160518151"
        b"8451905100520552085211521452205241524452505269528052015404540654"
        b"0954105412541554185421542454405442544554485451545454605481548454"
        b"9054005502550555085511551455205541554455505580550156045610562656"
        b"405600580258055808581158145820584158445850585a588058015904591059"
        b"4059005a195a855aa85a01600460066010601260156018602160246040604560"
        b"4860516054606060846090600061026105610861116114612061416144615061"
        b"806199610462106240625662a162006405640864116414642064416444645064"
        b"806401650465106540654a656865926500669466016804681068656898680069"
        b"2a69426aa16a0080028005800880118014801980208025804180448050805280"
        b"5580588061808080858091809480018104810981108112811581188121812481"
        b"408142814581488151815481818184819081a981008205820a82118214824182"
        b"4482508201840484068409841084128415841884218440844284458448845184"
        b"5484608481848484908400850285058508851185148520854185448550858085"
        b"8a85018604861086298640860088058811881488418844885088a28801890489"
        b"40896589228a588a5a8a828aa28a019004900990109012901590189024904090"
        b"4290459048905190549060908190849090900091059111911491419144915091"
        b"5a910192049210924092a6920094029405940894119414942094419444945094"
        b"8094969401950495109540959895a19500964696649601980498109826984098"
        b"a998009949995299909a00a005a00aa014a022a02aa041a044a050a0a2a0aaa0"
        b"40a165a102a20aa222a228a22aa282a288a28aa2a8a201a404a410a440a489a4"
        b"a4a400a519a551a60aa828a8a2a854a986a908aa0aaa20aa22aa28aa88aaaaaa"
    )

    @classmethod
    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        """Decode packed IQ2_S blocks back to float32 values.

        A row is a 2-byte float16 scale, ``QK_K // 8`` bytes with the low
        8 bits of each grid index, ``QK_K // 8`` bytes of explicit sign bits,
        ``QK_K // 32`` bytes with four 2-bit index extensions each, then
        bytes carrying two 4-bit sub-scales each. Requires ``cls.grid`` to
        be set (it is not assigned in this class body).
        """
        row_count = blocks.shape[0]

        scale_bytes, tail = np.hsplit(blocks, [2])
        index_low, tail = np.hsplit(tail, [QK_K // 8])
        sign_bytes, tail = np.hsplit(tail, [QK_K // 8])
        index_high, sub_scales = np.hsplit(tail, [QK_K // 32])

        d = scale_bytes.view(np.float16).astype(np.float32)

        # Split every scale byte into its low and high nibble.
        sub_scales = sub_scales.reshape((row_count, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2))
        sub_scales = (sub_scales & 0x0F).reshape((row_count, -1))
        db = d * (np.float32(0.5) + sub_scales) * np.float32(0.25)
        db = db.reshape((row_count, -1, 1, 1))

        # unpack the sign bits
        sign_bits = sign_bytes.reshape((row_count, -1, 1)) >> np.arange(8, dtype=np.uint8).reshape((1, 1, 8))
        sign_bits = sign_bits & np.uint8(0x01)
        # A set bit negates the corresponding value.
        sign_factors = np.where(sign_bits == 0, np.float32(1), np.float32(-1))
        sign_factors = sign_factors.reshape((row_count, -1, 2, 8))

        # Widen each 8-bit index to 10 bits with its 2-bit extension.
        index_high = index_high.reshape((row_count, -1, 1)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4))
        grid_idx = index_low.astype(np.uint16) | ((index_high & 0x03).astype(np.uint16) << 8).reshape((row_count, -1))

        assert cls.grid is not None
        grid_rows = np.take_along_axis(cls.grid, grid_idx.reshape((row_count, -1, 1, 1)), axis=-2)
        grid_rows = grid_rows.reshape((row_count, -1, 2, 8))

        return (db * grid_rows * sign_factors).reshape((row_count, -1))
1374 | 
1375 | 
class IQ3_XXS(__Quant, qtype=GGMLQuantizationType.IQ3_XXS):
    # Grid table stored as hex text; grid_map lists the eight byte values
    # used by the grid entries.
    grid_shape = (256, 4)
    grid_map = (0x04, 0x0c, 0x14, 0x1c, 0x24, 0x2c, 0x34, 0x3e)
    grid_hex = (
        b"0000020004001100130017002000220031004200730075000101030110011201"
        b"2101250130013201410154017001000202020402110220022202310233023702"
        b"5102570275020103070310031203250370031304370444045704730475040105"
        b"0705320552053506640610071407160743076107011003101010121021102310"
        b"3010321034104710501000110211111120112211011203121012121221123012"
        b"7212001302132013311346136613011405145014201524154615711505162217"
        b"4017002002201120132020202220262031204220012103210521102112212121"
        b"3021632167217021002202221122172220222222372240225522012310231423"
        b"7023742335245324032527254125742501270327162745270130103012302130"
        b"2330503065307230003102312031313144314631013203321032253252327232"
        b"1133333330344734723400350635223555351436363663363337603704401740"
        b"3540374053405740744120423742404260426642074345430444514464442545"
        b"4345704505471047124730471250415070500051065126515551145232527252"
        b"0253535310542354275472540255315550562457425724604460466064602161"
        b"6161176264623063366344640565526533660367216703700570077010703270"
        b"5270267140711272457252720073157333736073217441740075027524753076"
    )

    @classmethod
    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        """Decode packed IQ3_XXS blocks back to float32 values.

        A row is a 2-byte float16 scale, ``QK_K // 4`` bytes of 8-bit grid
        indices, then uint32 words holding four 7-bit sign indices (bits
        0-27, looked up in ``IQ2_XXS.ksigns``) and a 4-bit sub-scale (bits
        28-31). Requires ``cls.grid`` to be set (it is not assigned in this
        class body).
        """
        row_count = blocks.shape[0]

        scale_bytes, payload = np.hsplit(blocks, [2])
        grid_idx, meta_bytes = np.hsplit(payload, [QK_K // 4])

        d = scale_bytes.view(np.float16).astype(np.float32)
        meta_words = meta_bytes.view(np.uint32)

        # Per-group scale: d * (0.5 + 4-bit sub-scale) * 0.5
        db = d * (np.float32(0.5) + (meta_words >> 28).astype(np.float32)) * np.float32(0.5)
        db = db.reshape((row_count, -1, 1, 1))

        # get the sign indices and unpack the bits
        sign_idx = meta_words.reshape((row_count, -1, 1)) >> np.array([0, 7, 14, 21], dtype=np.uint32).reshape((1, 1, 4))
        sign_table = np.frombuffer(IQ2_XXS.ksigns, dtype=np.uint8).reshape((1, 1, 1, 128))
        sign_idx = (sign_idx & np.uint32(0x7F)).reshape((row_count, -1, 4, 1))
        sign_bytes = np.take_along_axis(sign_table, sign_idx, axis=-1)
        sign_bits = sign_bytes.reshape((row_count, -1, 4, 1)) >> np.arange(8, dtype=np.uint8).reshape((1, 1, 1, 8))
        sign_bits = sign_bits & np.uint8(0x01)
        # A set bit negates the corresponding value.
        sign_factors = np.where(sign_bits == 0, np.float32(1), np.float32(-1))
        sign_factors = sign_factors.reshape((row_count, -1, 4, 8))

        assert cls.grid is not None
        grid_rows = np.take_along_axis(cls.grid, grid_idx.reshape((row_count, -1, 1, 1)), axis=-2)
        grid_rows = grid_rows.reshape((row_count, -1, 4, 8))

        return (db * grid_rows * sign_factors).reshape((row_count, -1))
1426 | 
1427 | 
1428 | class IQ3_S(__Quant, qtype=GGMLQuantizationType.IQ3_S):
1429 |     grid_shape = (512, 4)
1430 |     grid_map = (0x01, 0x03, 0x05, 0x07, 0x09, 0x0b, 0x0d, 0x0f)
1431 |     grid_hex = (
1432 |         b"0000010002000500070010001100120014001600200021002500330040004200"
1433 |         b"4500470051005300600062007100740077000001010102010401100111011501"
1434 |         b"2001230127013101350144016101650172010002010205020702100213021602"
1435 |         b"2102250230023402420245024702510253027002730203031103150320032203"
1436 |         b"3103330336034403500352036703710375030004130417042104240432044004"
1437 |         b"4304510470040205040520052205260533054105450547056605730506061106"
1438 |         b"1306310652067106000702070407200722072607330750075407001001100210"
1439 |         b"0410101011101310151017102010221031103410361054105610611072100011"
1440 |         b"0111031106111011141121113011331141115011521170117611001212121512"
1441 |         b"1712201224123212401243125512601272120113041307131013131321132713"
1442 |         b"3013341341136213701303140514121414143114331442144614501454140115"
1443 |         b"1015131521153015321551152016241627164416461601170317101712172117"
1444 |         b"3517411762177017002001200320052007201020122014201620212023202720"
1445 |         b"3020322041204320452050205220672070207320752000210221102113211721"
1446 |         b"2221252131213421422151210122042207222122232230223722412253225722"
1447 |         b"7122742200230223052311232223242331233323422350236623012407242024"
1448 |         b"2324322435244124722475240425112522253725402553257025002602260726"
1449 |         b"2126552661260527112726273027432750270230113013301530173022303130"
1450 |         b"3330353042304430473051306330713001310331053114312131233140316031"
1451 |         b"7231763100321232203232323432503201331033143321332333273330334133"
1452 |         b"4333473355337333033411341634223431345234603464340135103512352535"
1453 |         b"3235443556357335163641360137033720372237353700400440124020402440"
1454 |         b"2740324041405040704002410741114113412241304135414341514155410142"
1455 |         b"0342104215422142334240425742624270420443114313432043224331433543"
1456 |         b"0044024424443744404471440545074521456245134634466046104715473047"
1457 |         b"4347514702501050145022504050445047505250665074500151035105511251"
1458 |         b"2151325172510052115223523052365253520253075310532753445351536553"
1459 |         b"7353015404542054325446541255265551555355425602570457225711601360"
1460 |         b"1560316033606060006120612761646112623462426255626262706200631463"
1461 |         b"2163406325644364626400650365346560650566406611671367007004700770"
1462 |         b"2070227036704070547062700271117124714371457101720472107216722172"
1463 |         b"3072517202733273357353730174057413742074507422754275027631760077"
1464 |     )
1465 | 
1466 |     @classmethod
1467 |     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
1468 |         n_blocks = blocks.shape[0]
1469 | 
1470 |         d, rest = np.hsplit(blocks, [2])
1471 |         qs, rest = np.hsplit(rest, [QK_K // 4])
1472 |         qh, rest = np.hsplit(rest, [QK_K // 32])
1473 |         signs, scales = np.hsplit(rest, [QK_K // 8])
1474 | 
1475 |         d = d.view(np.float16).astype(np.float32)
1476 | 
1477 |         scales = scales.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2))
1478 |         scales = (scales & 0x0F).reshape((n_blocks, -1))
1479 |         db = d * (1 + 2 * scales)
1480 |         db = db.reshape((n_blocks, -1, 1, 1))
1481 | 
1482 |         # unpack the sign bits
1483 |         signs = signs.reshape((n_blocks, -1, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8))
1484 |         signs = signs & np.uint8(0x01)
1485 |         signs = np.where(signs == 0, np.float32(1), np.float32(-1))
1486 |         signs = signs.reshape((n_blocks, -1, 4, 8))
1487 | 
1488 |         qh = qh.reshape((n_blocks, -1, 1)) >> np.array([i for i in range(8)], dtype=np.uint8)
1489 |         qh = (qh & 0x01).astype(np.uint16).reshape((n_blocks, -1))
1490 |         qs = qs.astype(np.uint16) | (qh << 8)
1491 | 
1492 |         assert cls.grid is not None
1493 |         grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2)
1494 |         grid = grid.reshape((n_blocks, -1, 4, 8))
1495 | 
1496 |         return (db * grid * signs).reshape((n_blocks, -1))
1497 | 
1498 | 
1499 | class IQ1_S(__Quant, qtype=GGMLQuantizationType.IQ1_S):
1500 |     # iq1s_grid, with each byte packed into 2 bits
1501 |     # -1, 0, 1 <=> 0, 1, 2
1502 |     grid_shape = (2048, 8)
1503 |     grid_map = (-1, 0, 1)
1504 |     grid_hex = (
1505 |         b"00000200050008000a00110015002000220028002a0045005100540056006500"
1506 |         b"8000820088008a009500a000a200a800aa000401050111011401160119011a01"
1507 |         b"2501410146014901520155015a0161016401660168018501910194019601a501"
1508 |         b"0002020208020a0215022002220228022a024502510259026402690280028202"
1509 |         b"88028a02910295029902a002a202a802aa021104140416042504410449045504"
1510 |         b"5a046404650491049904a5040105040505050605150518051a05290540054505"
1511 |         b"4a0550055105540555055605590560056205650568056a058105910595059805"
1512 |         b"9a05a105a405a505a605a9051406190641064406500652065506580660066106"
1513 |         b"6606690685069106940699060008020808080a0815082008220828082a084508"
1514 |         b"5108560865088008820888088a089508a008a208a808aa080509110914091909"
1515 |         b"2409250941095009510955096109640969099109940996099909a509000a020a"
1516 |         b"080a0a0a150a200a220a280a2a0a450a510a590a610a650a800a820a850a880a"
1517 |         b"8a0a950aa00aa20aa80aaa0a1010111014101910241025104110441050105510"
1518 |         b"58106110641065106910911094109610a110a510011104110611091110111211"
1519 |         b"1511181121112411291145114a11501151115211541155115611591160116511"
1520 |         b"841192119511a111a41111121412161225124012461249125212551258125a12"
1521 |         b"641266128512911294129612a512011406140914141415141814191421142614"
1522 |         b"41144514461448144a1451145414551456145914621465146814841489149014"
1523 |         b"94149514981499149a14a114a414a514a914021505150a151115141515151615"
1524 |         b"191520152215251528152a154115441545154615511552155415551556155915"
1525 |         b"5a1561156415651566156915801582158415851588158a159015911594159515"
1526 |         b"961599159a15a015a215a51501160416051606161516161618161a1621162616"
1527 |         b"401642164416451648164a165116551656165816591661166416651668166916"
1528 |         b"6a1686168a1692169516a416a916111816182518411844184618491850185518"
1529 |         b"58185a1860186118641866186918851891189418a5181019121915191a192119"
1530 |         b"25194219441945194819511954195519561959195a19601965196a1989199119"
1531 |         b"921995199819a119a619a919091a161a241a261a441a461a491a501a521a551a"
1532 |         b"581a611a661a691a851a911a961a9a1a0020022008200a201520202022202520"
1533 |         b"28202a20452051205920612065208020822088208a209520a020a220a520a820"
1534 |         b"aa2005211121142119212521422144214921552158215a216121642165216621"
1535 |         b"8521902196219921a521012208220a22112215222022222228222a2245225122"
1536 |         b"562259226522812288228a2291229522a022a222a822aa220524142416241924"
1537 |         b"252444244524462449245224552458245a2466248524912494249924a124a524"
1538 |         b"0925152521252925402545254825512554255525592562256525682589259025"
1539 |         b"9425952598259a25a125a425a625a92505261026122619262526412649265526"
1540 |         b"6026612669268426862690269a260028022808280a2815282028222828282a28"
1541 |         b"45285128542865288028822888288a28a028a228a828aa280929112914291929"
1542 |         b"2529462949295229552961296429662969298529902996299929a429a529002a"
1543 |         b"022a082a0a2a202a222a282a2a2a452a512a562a592a652a802a822a882a8a2a"
1544 |         b"952aa02aa22aa82aaa2a054011401640254049405240554058405a4061406440"
1545 |         b"664094409940a140a6400041014104410641094112411541164118411a412141"
1546 |         b"26412941454148414a41514154415541564159415a41654168416a4181418441"
1547 |         b"8641904192419541a041a141a241054211421442164225424142524255425a42"
1548 |         b"6442694289429442a5420144154419442944454448444a445144544455445644"
1549 |         b"61446244654468446a44814486448944904492449544a044a144a94401450245"
1550 |         b"05450a4511451445154516451945204525452a45414544454545464549455045"
1551 |         b"5145544555455645584559456145644565456645694582458445854588459145"
1552 |         b"94459545964599459a45a545a845aa450146054609461446154618461a462146"
1553 |         b"2446294640464246454648465046514652465546564659466246654668468146"
1554 |         b"85468a4694469546a146a446a6460548114815481a4825484248494850485548"
1555 |         b"5848614864486648694885489148944896489948a5480149054906490a491049"
1556 |         b"144915491849214924492649404945494a495149524954495549564959496049"
1557 |         b"6249654966496a49864989499249954996499849a149a449a649a949164a444a"
1558 |         b"464a494a554a584a5a4a644a694a944aa54a0150045005500650095012501550"
1559 |         b"1a50215024502950405045504850515054505550565059506550685086508950"
1560 |         b"95509850a050a150a650a9500551085109510a51115114511551165118511951"
1561 |         b"20512551265128512a5141514451455146514951505151515251545155515651"
1562 |         b"585159515a51615164516551665169518251855191519451955196519951a051"
1563 |         b"a551aa5101520652125215521a5221522452425245524a525152545255525652"
1564 |         b"595262526552855290529252955299529a52a452045405541154145415541654"
1565 |         b"185419542154255428542a54415444544554465449544a545054515454545554"
1566 |         b"5654585459545a54615462546454655466546954805488548a54915494549554"
1567 |         b"96549954a154a454a554aa540155025504550555065509551055115512551455"
1568 |         b"1555165519551a55215524552555265529554055415542554455455546554855"
1569 |         b"4955505551555255545555555655585559555a55605561556455655566556855"
1570 |         b"69556a5581558455855589558a559055915594559555965598559955a155a455"
1571 |         b"a555a655a9550056015602560456065608560956115614561556185619562056"
1572 |         b"2156225624562556265628562956415645564656485649564a56505651565256"
1573 |         b"545655565656585659565a566156645665566956825685568656885689568a56"
1574 |         b"915695569a56a256a556a656a856a95604580558065809581058155818582158"
1575 |         b"2a58455848584a58515854585558565858585958605862586458655882588958"
1576 |         b"9058925895589858a158a9580159025905590a59115914591559165919592559"
1577 |         b"41594459455946594959505951595259545955595659585959595a5961596459"
1578 |         b"655966596959815985598959915994599559965998599959a559045a085a155a"
1579 |         b"1a5a205a255a265a295a455a485a495a515a555a565a585a595a625a655a685a"
1580 |         b"6a5a815a8a5a925a955a965a985a9a5aa15a0560146016601960256044605060"
1581 |         b"5560566058605a60616064606660696081609660a56001610461066109611261"
1582 |         b"15612161226126612961456149615161556156615961656166616a6184618a61"
1583 |         b"92619561a161a661a96111621662196240624162466255625662586260628562"
1584 |         b"91629662a56211641264156416641a6421642664296440644264456448644a64"
1585 |         b"516454645564566459645a646064626465648464856489649064926494649564"
1586 |         b"966498649a64a164a464a964056508650a651165156516651965446545654665"
1587 |         b"496550655165546555655665596561656465656566656965866589658a659165"
1588 |         b"9565966599659a65a265a565a665a86502660966156620662666286629664066"
1589 |         b"456648664a66516654665566566658665a666066656668668066826685668a66"
1590 |         b"9466966698669966a066a466a666aa661668196825684168526855685a686168"
1591 |         b"6968856891689868a66801690469106915692169246926692969406941694569"
1592 |         b"4669486951695469556956695969606965696a69826984698a699569a169a469"
1593 |         b"a569a969116a166a186a416a446a496a506a556a586a5a6a646a656a696a866a"
1594 |         b"946a986a9a6aa66a0080028008800a802080228028802a804580508051805480"
1595 |         b"5680598065808080828088808a809580a080a280a880aa800581118114811681"
1596 |         b"1981258141814481498150815281558156815881598164816681698185818981"
1597 |         b"948196819981a5810082028208820a8215822082228228822a82518254825982"
1598 |         b"65828082828288828a829582a082a282a882aa82148419844184448451845584"
1599 |         b"5a846184648469849484998401850985128515851a8526852985408541854585"
1600 |         b"4885518554855585568559855a856585668568856a8581858485868589859085"
1601 |         b"928595859885a68511861686198625864186448649864a865086558659865a86"
1602 |         b"618666866a86858691869a86a4860088028808880a8815882088228828882a88"
1603 |         b"41884588518854885988658869888088828888888a889588a088a288a888aa88"
1604 |         b"05890689118914891689258941894489468949895089528955895a8961896489"
1605 |         b"858996899989a589008a028a088a0a8a158a208a228a288a2a8a458a518a548a"
1606 |         b"568a808a828a888a8a8a958aa08aa28aa88aaa8a059011901690189019902590"
1607 |         b"419046904990559058905a9069906a9085909190949096909990a59001910491"
1608 |         b"069109911091159118911a912191249126912991409145915091519154915591"
1609 |         b"569159916291659184918691929195919891a191a491a691a991059211921492"
1610 |         b"19922592449246924992509252925592589266926992859294929692a9920194"
1611 |         b"04940694109415941894269440944a9451945494559456945894599460946194"
1612 |         b"62946594849486949294949495949894a194a9940095059508950a9510951195"
1613 |         b"14951595169519952195259529952a9541954495459546954995509551955295"
1614 |         b"549555955695589559955a956195649565956695699581958595889591959295"
1615 |         b"94959595969599959a95a095a295a595a895aa95019604961096159619962096"
1616 |         b"2696299645964896499651965296559656965996659668968296849689968a96"
1617 |         b"929694969596a496a696a9960598169819982598419846985098529855985698"
1618 |         b"5a98649865988598919896989998a59804990699099910991299159918991a99"
1619 |         b"209921992499269940994299459948994a995199549955995699599962996599"
1620 |         b"66996a99819984999099929995999a99a199a699059a159a259a449a469a499a"
1621 |         b"509a559a589a619a859a919a949a959a969a00a002a008a00aa015a020a022a0"
1622 |         b"28a02aa045a051a054a056a059a080a082a088a08aa095a0a0a0a2a0a8a0aaa0"
1623 |         b"05a109a111a114a116a119a11aa146a149a151a155a158a15aa161a164a185a1"
1624 |         b"90a192a196a199a102a208a20aa210a219a222a228a22aa245a251a256a259a2"
1625 |         b"65a280a282a288a28aa295a2a0a2a2a2a8a2aaa219a425a441a444a450a454a4"
1626 |         b"55a458a45aa461a465a466a468a469a485a406a509a510a512a515a518a526a5"
1627 |         b"29a542a545a551a554a555a556a559a565a56aa581a584a585a586a589a592a5"
1628 |         b"95a598a505a611a616a61aa621a625a644a646a64aa652a655a656a658a660a6"
1629 |         b"62a686a690a695a696a699a6a1a6a4a6a6a600a802a808a80aa820a822a828a8"
1630 |         b"2aa851a854a856a859a880a882a888a88aa895a8a0a8a2a8a8a8aaa805a914a9"
1631 |         b"19a921a925a941a950a955a95aa961a966a969a990a996a900aa02aa08aa0aaa"
1632 |         b"20aa22aa28aa2aaa51aa54aa56aa80aa82aa88aa8aaa95aaa0aaa2aaa8aaaaaa"
1633 |     )
1634 | 
1635 |     delta = np.float32(0.125)
1636 | 
1637 |     @classmethod
1638 |     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
1639 |         n_blocks = blocks.shape[0]
1640 | 
1641 |         d, rest = np.hsplit(blocks, [2])
1642 |         qs, qh = np.hsplit(rest, [QK_K // 8])
1643 | 
1644 |         d = d.view(np.float16).astype(np.float32)
1645 |         qh = qh.view(np.uint16)
1646 | 
1647 |         dl = d * (2 * ((qh >> 12) & 7) + 1)
1648 |         dl = dl.reshape((n_blocks, -1, 1, 1))
1649 |         delta = np.where((qh & np.uint16(0x8000)) == 0, cls.delta, -cls.delta)
1650 |         delta = delta.reshape((n_blocks, -1, 1, 1))
1651 | 
1652 |         qh = qh.reshape((n_blocks, -1, 1)) >> np.array([0, 3, 6, 9], dtype=np.uint16).reshape((1, 1, 4))
1653 |         qs = qs.astype(np.uint16) | ((qh & 7) << 8).reshape((n_blocks, -1))
1654 | 
1655 |         assert cls.grid is not None
1656 |         grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2)
1657 |         grid = grid.reshape((n_blocks, -1, 4, 8))
1658 | 
1659 |         return (dl * (grid + delta)).reshape((n_blocks, -1))
1660 | 
1661 | 
class IQ1_M(__Quant, qtype=GGMLQuantizationType.IQ1_M):
    """IQ1_M dequantizer; shares the grid definition and delta of IQ1_S."""

    grid_shape = IQ1_S.grid_shape
    grid_map = IQ1_S.grid_map
    grid_hex = IQ1_S.grid_hex

    delta = IQ1_S.delta

    # This type is unusual: its f16 scale is not stored contiguously but is
    # scattered over several 16-bit words.
    @classmethod
    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        """Decode packed IQ1_M blocks into their dequantized values.

        Each row of ``blocks`` is cut, in order, into: ``QK_K // 8`` bytes
        holding the low 8 bits of the grid indices, ``QK_K // 16`` bytes of
        4-bit index extensions, and the remaining bytes, which are read as
        four 16-bit words. The top 4 bits of those words together form the
        float16 super-scale; their low 12 bits hold four 3-bit sub-scales
        each.

        Args:
            blocks: 2-D array with one quantized block per row
                (assumed to be raw ``uint8`` bytes -- TODO confirm against caller).

        Returns:
            2-D array with one row of dequantized values per input row.
        """
        row_count = blocks.shape[0]

        qs_stop = QK_K // 8
        raw_qs, raw_qh, raw_scales = np.hsplit(blocks, [qs_stop, qs_stop + QK_K // 16])

        words = raw_scales.view(np.uint16)

        # Reassemble the f16 scale: the top nibble of word k lands at bits 4k..4k+3.
        nibble_pos = np.array([12, 8, 4, 0], dtype=np.uint16).reshape((1, 4))
        pieces = (words.reshape((row_count, 4)) & np.uint16(0xF000)) >> nibble_pos
        scale_bits = pieces[:, 0] | pieces[:, 1] | pieces[:, 2] | pieces[:, 3]
        super_scale = scale_bits.view(np.float16).astype(np.float32).reshape((row_count, 1))

        # Four 3-bit sub-scales per word; a sub-scale s weighs 2 * s + 1.
        group_shifts = np.array([0, 3, 6, 9], dtype=np.uint16).reshape((1, 1, 4))
        sub_scales = words.reshape(row_count, -1, 1) >> group_shifts
        sub_scales = (sub_scales & 0x07).reshape((row_count, -1))
        dl = (super_scale * (2 * sub_scales + 1)).reshape((row_count, -1, 2, 1, 1))

        # Every qh byte holds two nibbles: the low 3 bits extend a grid index,
        # bit 3 selects the sign of delta.
        nibble_shifts = np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2))
        qh_nibbles = raw_qh.reshape((row_count, -1, 1)) >> nibble_shifts
        high_parts = ((qh_nibbles & 0x07).astype(np.uint16) << 8).reshape((row_count, -1))
        grid_index = raw_qs.astype(np.uint16) | high_parts

        delta = np.where((qh_nibbles & 0x08) == 0, cls.delta, -cls.delta)
        delta = delta.reshape((row_count, -1, 2, 2, 1))

        assert cls.grid is not None
        picked = np.take_along_axis(
            cls.grid, grid_index.reshape((row_count, -1, 1, 1)), axis=-2
        )
        picked = picked.reshape((row_count, -1, 2, 2, 8))

        return (dl * (picked + delta)).reshape((row_count, -1))
1699 | 
1700 | 
class IQ4_NL(__Quant, qtype=GGMLQuantizationType.IQ4_NL):
    """IQ4_NL dequantizer: 4-bit codes looked up in a fixed 16-entry table."""

    # Value assigned to each 4-bit code (index 0..15).
    kvalues = (-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113)

    @classmethod
    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        """Decode packed IQ4_NL blocks into float32 values.

        Each row of ``blocks`` starts with a 2-byte float16 scale followed
        by bytes that each pack two 4-bit codes. The low nibbles of all
        bytes are emitted first, then the high nibbles.

        Args:
            blocks: 2-D array with one quantized block per row
                (assumed to be raw ``uint8`` bytes -- TODO confirm against caller).

        Returns:
            2-D float32 array: ``scale * kvalues[code]`` for every code.
        """
        row_count = blocks.shape[0]

        raw_d, packed = np.hsplit(blocks, [2])
        scale = raw_d.view(np.float16).astype(np.float32)

        # Shift axis precedes the byte axis, so low nibbles come out first.
        nibble_shifts = np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
        codes = packed.reshape((row_count, -1, 1, cls.block_size // 2)) >> nibble_shifts
        codes = (codes & np.uint8(0x0F)).reshape((row_count, -1, 1))

        table = np.array(cls.kvalues, dtype=np.int8).reshape(1, 1, 16)
        values = np.take_along_axis(table, codes, axis=-1)
        values = values.astype(np.float32).reshape((row_count, -1))

        return scale * values
1720 | 
1721 | 
class IQ4_XS(__Quant, qtype=GGMLQuantizationType.IQ4_XS):
    """IQ4_XS dequantizer: IQ4_NL value table with 6-bit sub-block scales."""

    @classmethod
    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        """Decode packed IQ4_XS blocks into float32 values.

        Each row of ``blocks`` is cut, in order, into: a 2-byte float16
        super-scale, a 16-bit word holding the upper 2 bits of every
        sub-scale, ``QK_K // 64`` bytes holding the lower 4 bits of the
        sub-scales (two per byte), and the remaining bytes, which pack two
        4-bit codes each. Codes are translated through ``IQ4_NL.kvalues``.

        Args:
            blocks: 2-D array with one quantized block per row
                (assumed to be raw ``uint8`` bytes -- TODO confirm against caller).

        Returns:
            2-D float32 array with one row of dequantized values per input row.
        """
        row_count = blocks.shape[0]

        raw_d, raw_high, raw_low, packed = np.hsplit(blocks, [2, 4, 4 + QK_K // 64])

        super_scale = raw_d.view(np.float16).astype(np.float32)
        high_word = raw_high.view(np.uint16)

        # Lower 4 bits of each sub-scale: low nibble first, then high nibble.
        nibble_shifts = np.array([0, 4], dtype=np.uint8)
        low_bits = raw_low.reshape((row_count, -1, 1)) >> nibble_shifts.reshape((1, 1, 2))
        low_bits = low_bits.reshape((row_count, -1)) & np.uint8(0x0F)

        # Upper 2 bits of sub-scale i sit at bits 2i..2i+1 of the 16-bit word.
        pair_shifts = np.arange(0, 2 * (QK_K // 32), 2, dtype=np.uint16).reshape((1, -1, 1))
        high_bits = high_word.reshape((row_count, 1, -1)) >> pair_shifts
        high_bits = high_bits.reshape((row_count, -1)).astype(np.uint8) & np.uint8(0x03)

        # Sub-scales are stored with a bias of 32.
        sub_scales = (low_bits | (high_bits << np.uint8(4))).astype(np.int8) - np.int8(32)
        dl = (super_scale * sub_scales.astype(np.float32)).reshape((row_count, -1, 1))

        # Within each group of 16 bytes, all low nibbles precede all high nibbles.
        codes = packed.reshape((row_count, -1, 1, 16)) >> nibble_shifts.reshape((1, 1, 2, 1))
        codes = codes.reshape((row_count, -1, 32, 1)) & np.uint8(0x0F)

        table = np.array(IQ4_NL.kvalues, dtype=np.int8).reshape((1, 1, 1, -1))
        values = np.take_along_axis(table, codes, axis=-1)
        values = values.astype(np.float32).reshape((row_count, -1, 32))

        return (dl * values).reshape((row_count, -1))
1749 | 


--------------------------------------------------------------------------------