├── requirements.txt ├── config.py ├── cuda.py ├── datasets.py ├── README.md ├── models.py ├── dataloaders.py ├── optimizers.py ├── tools.py ├── layers.py ├── variable.py ├── functions.py └── main.py /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/2ertwo/LLaMa3-Numpy-trainable/HEAD/requirements.txt -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | 3 | 4 | class Config: 5 | enable_backprop: bool = True 6 | train: bool = True 7 | 8 | 9 | @contextlib.contextmanager 10 | def using_config(name, value): 11 | old_value = getattr(Config, name) 12 | setattr(Config, name, value) 13 | try: 14 | yield 15 | finally: 16 | setattr(Config, name, old_value) 17 | 18 | 19 | def no_grad(): 20 | return using_config("enable_backprop", False) 21 | 22 | 23 | def test_mode(): 24 | return using_config('train', False) 25 | -------------------------------------------------------------------------------- /cuda.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from variable import Variable 3 | 4 | gpu_enable = True 5 | try: 6 | import cupy as cp 7 | 8 | cupy = cp 9 | except ImportError: 10 | gpu_enable = False 11 | 12 | 13 | def get_array_module(x): 14 | if isinstance(x, Variable): 15 | x = x.data 16 | if not gpu_enable: 17 | return np 18 | xp = cp.get_array_module(x) 19 | return xp 20 | 21 | 22 | def as_numpy(x): 23 | if isinstance(x, Variable): 24 | x = x.data 25 | if np.isscalar(x): 26 | return np.array(x) 27 | elif isinstance(x, np.ndarray): 28 | return x 29 | return cp.asnumpy(x) 30 | 31 | 32 | def as_cupy(x): 33 | if isinstance(x, Variable): 34 | x = x.data 35 | if not gpu_enable: 36 | raise Exception('CuPy is not installed') 37 | return cp.asarray(x) 38 | -------------------------------------------------------------------------------- /datasets.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Dataset: 5 | def __init__(self, train: bool = True, transform=None, target_transform=None): 6 | self.train = train 7 | self.transform = transform 8 | self.target_transform = target_transform 9 | if self.transform is None: 10 | self.transform = lambda x: x 11 | if self.target_transform is None: 12 | self.target_transform = lambda x: x 13 | self.data = None 14 | self.label = None 15 | self.prepare() 16 | 17 | def __getitem__(self, item): 18 | assert np.isscalar(item) 19 | if self.label is None: 20 | return self.transform(self.data[item]), None 21 | else: 22 | return self.transform(self.data[item]), self.target_transform(self.label[item]) 23 | 24 | def __len__(self): 25 | return len(self.data) 26 | 27 | def prepare(self): 28 | raise NotImplementedError() 29 | --------------------------------------------------------------------------------
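`Dataset` above is an abstract base class: a subclass overrides `prepare()` to populate `self.data` and `self.label`, and `__getitem__` then applies `transform` / `target_transform` per sample. A minimal sketch of a concrete subclass follows; `ToyDataset` and its random data are illustrative only and are not part of this repository.

import numpy as np
from datasets import Dataset


class ToyDataset(Dataset):
    # prepare() is the only required override: it must fill self.data / self.label
    def prepare(self):
        self.data = np.random.randn(100, 2).astype(np.float32)
        self.label = (self.data[:, 0] > 0).astype(np.int32)


train_set = ToyDataset(train=True, transform=lambda x: x * 2.0)
x0, t0 = train_set[0]  # transform is applied to the sample, target_transform to the label
print(len(train_set), x0.shape, t0)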
/README.md: -------------------------------------------------------------------------------- 1 | # LLaMa3-Numpy-trainable 2 | A trainable LLaMa3 reimplemented in NumPy 3 | ## Detailed documentation 4 | For a better reading experience, please see the Feishu document: 5 | https://aw8o2u3n233.feishu.cn/wiki/Pc7swzMMZiYnP5krcrzcHkQUn1a?from=from_copylink 6 | ## References 7 | Originally provided with the assignment: 8 | https://github.com/naklecha/llama3-from-scratch 9 | 10 | NumPy implementation of llama3 (not trainable): 11 | https://github.com/likejazz/llama3.np 12 | 13 | Baby llama: 14 | https://github.com/DLLXW/baby-llama2-chinese 15 | 16 | Atom7b: 17 | https://github.com/LlamaFamily/Llama-Chinese 18 | 19 | RMSNorm paper: 20 | https://arxiv.org/abs/1910.07467 21 | 22 | SwiGLU paper (llama3's FFN): 23 | https://arxiv.org/abs/2002.05202 24 | 25 | Attention paper: 26 | https://arxiv.org/abs/1706.03762 27 | 28 | Blog post introducing attention: 29 | https://spaces.ac.cn/archives/4765 30 | 31 | Introduction to attention: 32 | https://armanasq.github.io/nlp/self-attention/ 33 | 34 | RoPE paper (RoFormer): 35 | https://arxiv.org/abs/2104.09864 36 | 37 | RoPE explained by its original author: 38 | https://spaces.ac.cn/archives/8265 39 | 40 | RoPE introduction and implementation: 41 | https://blog.eleuther.ai/rotary-embeddings/ 42 | 43 | Differences arising from the two RoPE implementations: 44 | https://github.com/huggingface/transformers/issues/25199 45 | 46 | On weight sharing: 47 | https://spaces.ac.cn/archives/9698 48 | 49 | Deep Learning from Scratch 3 (『ゼロから作る Deep Learning ❸』, O'Reilly Japan, 2020): 50 | https://github.com/oreilly-japan/deep-learning-from-scratch-3 51 | 52 | A stack implementation by a reader of the above book: 53 | https://github.com/laksjdjf/dezero-diffusion/blob/main/modules/unet.py 54 | -------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- 1 | import layers 2 | from layers import Layer 3 | from functions import Function, sigmoid 4 | from tools import plot_dot_graph 5 | 6 | 7 | class Model(Layer): 8 | def plot(self, *inputs, to_file='model.png'): 9 | y = self.forward(*inputs) 10 | return plot_dot_graph(y, verbose=True, to_file=to_file) 11 | 12 | 13 | class MLP(Model): 14 | def __init__(self, fc_output_sizes: tuple[int, ...], activation: Function = sigmoid): 15 | super(MLP, self).__init__() 16 | self.activation = activation 17 | self.layers = [] 18 | 19 | for i, out_size in enumerate(fc_output_sizes): 20 | layer = layers.Linear(out_size) 21 | setattr(self, 'l' + str(i), layer) 22 | self.layers.append(layer) 23 | 24 | def forward(self, x): 25 | for l in self.layers[:-1]: 26 | x = self.activation(l(x)) 27 | return self.layers[-1](x) 28 | 29 | 30 | class Sequential(Model): 31 | def __init__(self, *layers): 32 | super().__init__() 33 | self.layers = [] 34 | for i, layer in enumerate(layers): 35 | setattr(self, 'l' + str(i), layer) 36 | self.layers.append(layer) 37 | 38 | def forward(self, x): 39 | for layer in self.layers: 40 | x = layer(x) 41 | return x 42 | -------------------------------------------------------------------------------- /dataloaders.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | from datasets import Dataset 4 | 5 | 6 | class DataLoader: 7 | def __init__(self, dataset: Dataset, batch_size: int, shuffle: bool = True): 8 | self.dataset = dataset 9 | self.batch_size = batch_size 10 | self.shuffle = shuffle 11 | self.data_size = len(dataset) 12 | self.max_iter = math.ceil(self.data_size / self.batch_size) 13 | 14 | self.reset() 15 | 16 | def reset(self): 17 | self.iteration = 0 18 | if self.shuffle: 19 | self.index = np.random.permutation(len(self.dataset)) 20 | else: 21 | self.index = np.arange(len(self.dataset)) 22 | 23 | def __iter__(self): 24 | return self 25 | 26 | def __next__(self): 27 | if self.iteration >= self.max_iter: 28 | self.reset() 29 | raise StopIteration() 30 | 31 | i, batch_size = self.iteration, self.batch_size 32 | batch_index = self.index[i * batch_size:(i + 1) * batch_size] 33 | batch = [self.dataset[i] for i in batch_index] 34 | x = np.array([example[0] for example in batch]) 35 | t = np.array([example[1] for example in batch]) 36 | self.iteration += 1 37 | return x, t 38 | 39 | def next(self): 40 | return self.__next__() 41 |
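The pieces above compose in the usual way: a `Dataset` provides indexed samples, `DataLoader` batches them into NumPy arrays, and a `Model` such as `MLP` is trained with an optimizer from optimizers.py below. A minimal training-loop sketch follows, reusing the illustrative `ToyDataset` from the earlier example; the layer sizes, learning rate, and epoch count are arbitrary assumptions, not values from this repository.

import numpy as np
from datasets import Dataset
from dataloaders import DataLoader
from models import MLP
from optimizers import SGD
from functions import softmax_cross_entropy_simple


class ToyDataset(Dataset):
    # illustrative dataset: 2-D points with binary labels
    def prepare(self):
        self.data = np.random.randn(100, 2).astype(np.float32)
        self.label = (self.data[:, 0] > 0).astype(np.int32)


train_loader = DataLoader(ToyDataset(), batch_size=16, shuffle=True)
model = MLP((32, 2))                 # two Linear layers: 32 hidden units, 2 output classes
optimizer = SGD(lr=0.1).setup(model)

for epoch in range(5):
    epoch_loss = 0.0
    for x, t in train_loader:        # each iteration yields a batch of numpy arrays (x, t)
        y = model(x)
        loss = softmax_cross_entropy_simple(y, t)
        model.cleargrads()           # drop gradients accumulated in the previous step
        loss.backward()
        optimizer.update()           # SGD.update_one: param.data -= lr * param.grad.data
        epoch_loss += float(loss.data) * len(t)
    print(epoch, epoch_loss / len(train_loader.dataset))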
-------------------------------------------------------------------------------- /optimizers.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Callable 2 | 3 | import numpy as np 4 | 5 | from models import Model 6 | from layers import Layer 7 | from variable import Parameter 8 | 9 | 10 | class Optimizer: 11 | def __init__(self): 12 | self.target = None 13 | self.hooks: list[Callable] = [] 14 | 15 | def setup(self, target: Union[Layer, Model]): 16 | self.target: Union[Layer, Model] = target 17 | return self 18 | 19 | def update(self): 20 | params = [p for p in self.target.params() if p.grad is not None] 21 | 22 | for f in self.hooks: 23 | f(params) 24 | 25 | for param in params: 26 | self.update_one(param) 27 | 28 | def update_one(self, param: Parameter): 29 | raise NotImplementedError 30 | 31 | def add_hook(self, f: Callable): 32 | self.hooks.append(f) 33 | 34 | 35 | class SGD(Optimizer): 36 | def __init__(self, lr=0.01): 37 | super(SGD, self).__init__() 38 | self.lr = lr 39 | 40 | def update_one(self, param: Parameter): 41 | param.data -= self.lr * param.grad.data 42 | 43 | 44 | class MomentumSGD(Optimizer): 45 | def __init__(self, lr=0.01, momentum=0.9): 46 | super(MomentumSGD, self).__init__() 47 | self.lr = lr 48 | self.momentum = momentum 49 | self.vs = {} 50 | 51 | def update_one(self, param: Parameter): 52 | v_key = id(param) 53 | if v_key not in self.vs: 54 | self.vs[v_key] = np.zeros_like(param.data) 55 | 56 | v = self.vs[v_key] 57 | v *= self.momentum 58 | v -= self.lr * param.grad.data 59 | param.data += v 60 | -------------------------------------------------------------------------------- /tools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import graphviz 3 | from typing import Union, TYPE_CHECKING 4 | from variable import Variable 5 | 6 | if TYPE_CHECKING: 7 | from functions import Function 8 | 9 | 10 | def as_variable(obj: Union[np.ndarray, Variable]) -> Variable: 11 | if isinstance(obj, Variable): 12 | return obj 13 | return Variable(obj) 14 | 15 | 16 | def as_array(x, array_module=np): 17 | if np.isscalar(x): 18 | return array_module.array(x) 19 | return x 20 | 21 | 22 | def _dot_var(v: Variable, verbose: bool = False) -> str: 23 | dot_var = '{} [label="{}", color=orange, style=filled]\n' 24 | name = '' if v.name is None else v.name 25 | if verbose and v.data is not None: 26 | if v.name is not None: 27 | name += ': ' 28 | name += str(v.shape) + ' ' + str(v.dtype) 29 | return dot_var.format(id(v), name) 30 | 31 | 32 | def _dot_func(f: "Function") -> str: 33 | dot_func = '{} [label="{}", color=lightblue, style=filled, shape=box]\n' 34 | txt = dot_func.format(id(f), f.__class__.__name__) 35 | 36 | dot_edge = '{} -> {}\n' 37 | for x in f.inputs: 38 | txt += dot_edge.format(id(x), id(f)) 39 | for y in f.outputs: 40 | txt += dot_edge.format(id(f), id(y())) 41 | return txt 42 | 43 | 44 | def get_dot_graph(output: Variable, verbose: bool = True) -> str: 45 | txt = '' 46 | funcs: list[Function] = [] 47 | seen_set: set[Function] = set() 48 | 49 | def add_func(f): 50 | if f not in seen_set: 51 | funcs.append(f) 52 | seen_set.add(f) 53 | 54 | add_func(output.creator) 55 | txt += _dot_var(output, verbose) 56 | while funcs: 57 | f: "Function" = funcs.pop() 58 | txt += _dot_func(f) 59 | xs = f.inputs 60 | for x in xs: 61 | txt += _dot_var(x, verbose) 62 | if x.creator is not None: 63 | add_func(x.creator) 64 | return 'digraph g {\n' + txt + '}' 65 | 66 | 67 
| def plot_dot_graph(output: Variable, verbose: bool = True, to_file: str = 'graph.png'): 68 | dot_graph = get_dot_graph(output, verbose) 69 | dot = graphviz.Source(dot_graph) 70 | dot.view() 71 | 72 | 73 | def sum_to(x: np.ndarray, shape) -> np.ndarray: 74 | ndim = len(shape) 75 | lead = x.ndim - ndim 76 | lead_axis = tuple(range(lead)) 77 | 78 | axis = tuple([i + lead for i, sx in enumerate(shape) if sx == 1]) 79 | y = x.sum(lead_axis + axis, keepdims=True) 80 | if lead > 0: 81 | y = y.squeeze(lead_axis) 82 | return y 83 | 84 | 85 | def reshape_sum_backward(gy: Variable, x_shape: tuple[int, ...], axis: Union[tuple[int, ...], int, None], 86 | keepdims: bool) -> Variable: 87 | ndim = len(x_shape) 88 | tupled_axis = axis 89 | if axis is None: 90 | tupled_axis = None 91 | elif not isinstance(axis, tuple): 92 | tupled_axis = (axis,) 93 | 94 | if not (ndim == 0 or tupled_axis is None or keepdims): 95 | actual_axis = [a if a >= 0 else a + ndim for a in tupled_axis] 96 | shape = list(gy.shape) 97 | for a in sorted(actual_axis): 98 | shape.insert(a, 1) 99 | else: 100 | shape = gy.shape 101 | 102 | gy = gy.reshape(shape) # reshape 103 | return gy 104 | -------------------------------------------------------------------------------- /layers.py: -------------------------------------------------------------------------------- 1 | import weakref 2 | import os 3 | import numpy as np 4 | from typing import Union 5 | from variable import Variable, Parameter 6 | from functions import linear 7 | 8 | 9 | class Layer: 10 | def __init__(self): 11 | self._params: set[str] = set() 12 | 13 | def __setattr__(self, key, value): 14 | if isinstance(value, (Parameter, Layer)): 15 | self._params.add(key) 16 | super(Layer, self).__setattr__(key, value) 17 | 18 | def __call__(self, *inputs: Union[Variable, np.ndarray]) -> Union[list[Variable], Variable]: 19 | outputs = self.forward(*inputs) 20 | if not isinstance(outputs, tuple): 21 | outputs = (outputs,) 22 | self.inputs = [weakref.ref(x) for x in inputs] 23 | self.outputs = [weakref.ref(y) for y in outputs] 24 | return outputs if len(outputs) > 1 else outputs[0] 25 | 26 | def forward(self, *inputs): 27 | raise NotImplementedError() 28 | 29 | def params(self): 30 | for name in self._params: 31 | obj = self.__dict__[name] 32 | if isinstance(obj, Layer): 33 | yield from obj.params() 34 | else: 35 | yield obj 36 | 37 | def cleargrads(self): 38 | for param in self.params(): 39 | param.cleargrad() 40 | 41 | def to_cpu(self): 42 | for param in self.params(): 43 | param.to_cpu() 44 | 45 | def to_gpu(self): 46 | for param in self.params(): 47 | param.to_gpu() 48 | 49 | def _flatten_params(self, params_dict, parent_key=""): 50 | for name in self._params: 51 | obj = self.__dict__[name] 52 | key = parent_key + '/' + name if parent_key else name 53 | 54 | if isinstance(obj, Layer): 55 | obj._flatten_params(params_dict, key) 56 | else: 57 | params_dict[key] = obj 58 | 59 | def save_weights(self, path): 60 | self.to_cpu() 61 | params_dict = {} 62 | self._flatten_params(params_dict) 63 | array_dict = {key: param.data for key, param in params_dict.items() 64 | if param is not None} 65 | try: 66 | np.savez_compressed(path, **array_dict) 67 | except (Exception, KeyboardInterrupt) as e: 68 | if os.path.exists(path): 69 | os.remove(path) 70 | raise 71 | 72 | def load_weights(self, path): 73 | npz = np.load(path) 74 | params_dict = {} 75 | self._flatten_params(params_dict) 76 | for key, param in params_dict.items(): 77 | param.data = npz[key] 78 | print(f'{key} loaded') 79 | 80 | 81 | 
class Linear(Layer): 82 | def __init__(self, out_size: int, nobias: bool = False, dtype=np.float32, in_size: int = None): 83 | super(Linear, self).__init__() 84 | self.in_size = in_size 85 | self.out_size = out_size 86 | self.dtype = dtype 87 | 88 | self.W = Parameter(None, name='W') 89 | if self.in_size is not None: 90 | self._init_W() 91 | 92 | if nobias: 93 | self.b = None 94 | else: 95 | self.b = Parameter(np.zeros(out_size, dtype=dtype), name='b') 96 | 97 | def _init_W(self): 98 | I, O = self.in_size, self.out_size 99 | W_data = np.random.randn(I, O).astype(self.dtype) * np.sqrt(1 / I) 100 | self.W = Parameter(W_data, name='W') 101 | 102 | def forward(self, x): 103 | if self.W.data is None: 104 | self.in_size = x.shape[1] 105 | self._init_W() 106 | y = linear(x, self.W, self.b) 107 | return y 108 | 109 | 110 | class Embedding(Layer): 111 | def __init__(self, in_size, out_size): 112 | super().__init__() 113 | self.W = Parameter(np.random.randn(in_size, out_size), name='W') 114 | 115 | def forward(self, x): 116 | y = self.W[x] 117 | return y 118 | -------------------------------------------------------------------------------- /variable.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import TYPE_CHECKING 3 | from config import using_config, Config 4 | 5 | try: 6 | import cupy 7 | 8 | array_types = (np.ndarray, cupy.ndarray) 9 | except ImportError: 10 | array_types = (np.ndarray) 11 | if TYPE_CHECKING: 12 | from functions import Function 13 | 14 | 15 | class Variable: 16 | __array_priority__ = 200 17 | 18 | def __init__(self, data: np.ndarray, name=None): 19 | if data is not None: 20 | if not isinstance(data, array_types): 21 | raise TypeError('{} is not supported'.format(type(data))) 22 | 23 | self.data = data 24 | self.grad = None 25 | self.creator = None 26 | self.name = name 27 | self.generation = 0 28 | 29 | def set_creator(self, func: "Function"): 30 | self.creator: "Function" = func 31 | self.generation = func.generation + 1 32 | 33 | def backward(self, retain_grad: bool = False, create_graph=False): 34 | from cuda import get_array_module 35 | if self.grad is None: 36 | xp = get_array_module(self.data) 37 | self.grad = Variable(xp.ones_like(self.data)) 38 | 39 | funcs: list[Function] = [] 40 | seen_set: set[Function] = set() 41 | 42 | def add_func(f): 43 | if f not in seen_set: 44 | funcs.append(f) 45 | seen_set.add(f) 46 | funcs.sort(key=lambda x: x.generation) 47 | 48 | add_func(self.creator) 49 | 50 | while funcs: 51 | f: "Function" = funcs.pop() 52 | xs = f.inputs 53 | gys = [output().grad for output in f.outputs] 54 | 55 | with using_config('enable_backprop', create_graph): 56 | gxs = f.backward(*gys) 57 | if not isinstance(gxs, tuple): 58 | gxs = (gxs,) 59 | 60 | for x, gx in zip(xs, gxs): 61 | if x.grad is not None: 62 | x.grad = x.grad + gx 63 | else: 64 | x.grad = gx 65 | 66 | if x.creator is not None: 67 | add_func(x.creator) 68 | if not retain_grad: 69 | for y in f.outputs: 70 | y().grad = None 71 | 72 | def cleargrad(self): 73 | self.grad = None 74 | 75 | def to_cpu(self): 76 | from cuda import as_numpy 77 | if self.data is not None: 78 | self.data = as_numpy(self.data) 79 | 80 | def to_gpu(self): 81 | from cuda import as_cupy 82 | if self.data is not None: 83 | self.data = as_cupy(self.data) 84 | 85 | @property 86 | def shape(self): 87 | return self.data.shape 88 | 89 | @property 90 | def size(self): 91 | return self.data.size 92 | 93 | @property 94 | def ndim(self): 95 | return self.data.ndim 
96 | 97 | @property 98 | def dtype(self): 99 | return self.data.dtype 100 | 101 | def __len__(self): 102 | return len(self.data) 103 | 104 | def __repr__(self): 105 | if self.data is None: 106 | return 'Variable(None)' 107 | p = str(self.data).replace('\n', '\n' + ' ' * 9) 108 | return 'Variable(' + p + ')' 109 | 110 | def __add__(self, other): 111 | from functions import add 112 | return add(self, other) 113 | 114 | def __radd__(self, other): 115 | from functions import add 116 | return add(self, other) 117 | 118 | def __mul__(self, other): 119 | from functions import mul 120 | return mul(self, other) 121 | 122 | def __rmul__(self, other): 123 | from functions import mul 124 | return mul(self, other) 125 | 126 | def __neg__(self): 127 | from functions import neg 128 | return neg(self) 129 | 130 | def __sub__(self, other): 131 | from functions import sub 132 | return sub(self, other) 133 | 134 | def __rsub__(self, other): 135 | from functions import rsub 136 | return rsub(self, other) 137 | 138 | def __truediv__(self, other): 139 | from functions import div 140 | return div(self, other) 141 | 142 | def __rtruediv__(self, other): 143 | from functions import rdiv 144 | return rdiv(self, other) 145 | 146 | def __pow__(self, power, modulo=None): 147 | from functions import pow 148 | return pow(self, power) 149 | 150 | def __getitem__(self, item): 151 | from functions import get_item 152 | return get_item(self, item) 153 | 154 | def reshape(self, *shape): 155 | from functions import reshape 156 | if len(shape) == 1 and isinstance(shape[0], (tuple, list)): 157 | shape = shape[0] 158 | return reshape(self, shape) 159 | 160 | # def transpose(self): 161 | # from functions import transpose 162 | # return transpose(self) 163 | # 164 | # @property 165 | # def T(self): 166 | # from functions import transpose 167 | # return transpose(self) 168 | 169 | def transpose(self, *axes): 170 | from functions import transpose 171 | if len(axes) == 0: 172 | axes = None 173 | elif len(axes) == 1: 174 | if isinstance(axes[0], (tuple, list)) or axes[0] is None: 175 | axes = axes[0] 176 | return transpose(self, axes) 177 | 178 | @property 179 | def T(self): 180 | from functions import transpose 181 | return transpose(self) 182 | 183 | def sum(self, axis=None, keepdims=False): 184 | from functions import sum 185 | return sum(self, axis=axis, keepdims=keepdims) 186 | 187 | 188 | class Parameter(Variable): 189 | pass 190 | 191 | 192 | if __name__ == '__main__': 193 | # a1 = Variable(np.array([1.0, 3.0])) 194 | # a2 = Variable(np.array([1.0, 2.0])) 195 | # print(a1 + a2) 196 | pass 197 | -------------------------------------------------------------------------------- /functions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import Union 3 | from variable import Variable 4 | from config import Config 5 | from tools import as_variable, as_array, plot_dot_graph, reshape_sum_backward 6 | from tools import sum_to as raw_sum_to 7 | from cuda import get_array_module 8 | import weakref 9 | 10 | 11 | class Function: 12 | def __call__(self, *inputs: Union[np.ndarray, Variable]) -> Union[list[Variable], Variable]: 13 | inputs: list[Variable] = [as_variable(x) for x in inputs] 14 | 15 | xs = [x.data for x in inputs] 16 | 17 | ys = self.forward(*xs) 18 | if not isinstance(ys, tuple): 19 | ys = (ys,) 20 | outputs = [Variable(as_array(y)) for y in ys] 21 | 22 | if Config.enable_backprop: 23 | self.generation = max([x.generation for x in inputs]) 24 | for output 
in outputs: 25 | output.set_creator(self) 26 | self.outputs = [weakref.ref(output) for output in outputs] 27 | self.inputs = inputs 28 | 29 | return outputs if len(outputs) > 1 else outputs[0] 30 | 31 | def forward(self, *xs: np.ndarray) -> tuple[np.ndarray]: 32 | raise NotImplementedError() 33 | 34 | def backward(self, gys: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]: 35 | raise NotImplementedError() 36 | 37 | 38 | class Square(Function): 39 | def forward(self, x: np.ndarray) -> np.ndarray: 40 | return x ** 2 41 | 42 | def backward(self, gy: np.ndarray) -> np.ndarray: 43 | x = self.inputs[0] 44 | gx = 2 * x * gy 45 | return gx 46 | 47 | 48 | class Exp(Function): 49 | def forward(self, x: np.ndarray) -> np.ndarray: 50 | xp = get_array_module(x) 51 | return xp.exp(x) 52 | 53 | def backward(self, gy: np.ndarray) -> np.ndarray: 54 | y = self.outputs[0]() 55 | gx = y * gy 56 | return gx 57 | 58 | 59 | def square(x: Variable): 60 | return Square()(x) 61 | 62 | 63 | def exp(x: Variable): 64 | return Exp()(x) 65 | 66 | 67 | # class XtoX(Function): 68 | # # y = x^x 69 | # # lny = xlnx 70 | # # y'/y = 1+lnx 71 | # # y' = y(1+lnx) 72 | # def forward(self, x: np.ndarray) -> tuple[np.ndarray]: 73 | # return x ** x 74 | # 75 | # def backward(self, gy: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]: 76 | # y = self.outputs[0]() 77 | # x = self.inputs[0] 78 | # xp = get_array_module(x) 79 | # gx = gy * (y * (1 + xp.log(x))) 80 | # return gx 81 | 82 | 83 | class Log(Function): 84 | def forward(self, x: np.ndarray) -> tuple[np.ndarray]: 85 | xp = get_array_module(x) 86 | return xp.log(x) 87 | 88 | def backward(self, gy: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]: 89 | x = self.inputs[0] 90 | gx = gy / x 91 | return gx 92 | 93 | 94 | def log(x): 95 | return Log()(x) 96 | 97 | 98 | class Add(Function): 99 | def forward(self, x0: np.ndarray, x1: np.ndarray) -> np.ndarray: 100 | self.x0_shape, self.x1_shape = x0.shape, x1.shape 101 | y = x0 + x1 102 | return y 103 | 104 | def backward(self, gy: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]: 105 | gx0, gx1 = gy, gy 106 | if self.x0_shape != self.x1_shape: 107 | gx0 = sum_to(gx0, self.x0_shape) 108 | gx1 = sum_to(gx1, self.x1_shape) 109 | return gx0, gx1 110 | 111 | 112 | def add(x, y): 113 | y = as_array(y, get_array_module(x)) 114 | return Add()(x, y) 115 | 116 | 117 | class Mul(Function): 118 | def forward(self, x0: np.ndarray, x1: np.ndarray) -> tuple[np.ndarray]: 119 | self.x0_shape, self.x1_shape = x0.shape, x1.shape 120 | y = x0 * x1 121 | return y 122 | 123 | def backward(self, gy: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]: 124 | x0, x1 = self.inputs 125 | gx0, gx1 = gy * x1, gy * x0 126 | if self.x0_shape != self.x1_shape: 127 | gx0 = sum_to(gx0, self.x0_shape) 128 | gx1 = sum_to(gx1, self.x1_shape) 129 | return gx0, gx1 130 | 131 | 132 | def mul(x, y): 133 | y = as_array(y, get_array_module(x)) 134 | return Mul()(x, y) 135 | 136 | 137 | class Neg(Function): 138 | def forward(self, x: np.ndarray) -> tuple[np.ndarray]: 139 | return -x 140 | 141 | def backward(self, gy: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]: 142 | return -gy 143 | 144 | 145 | def neg(x): 146 | return Neg()(x) 147 | 148 | 149 | class Sub(Function): 150 | def forward(self, x0: np.ndarray, x1: np.ndarray) -> tuple[np.ndarray]: 151 | self.x0_shape, self.x1_shape = x0.shape, x1.shape 152 | y = x0 - x1 153 | return y 154 | 155 | def backward(self, gy: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]: 156 | 
gx0, gx1 = gy, -gy 157 | if self.x0_shape != self.x1_shape: 158 | gx0 = sum_to(gx0, self.x0_shape) 159 | gx1 = sum_to(gx1, self.x1_shape) 160 | return gx0, gx1 161 | 162 | 163 | def sub(x, y): 164 | y = as_array(y, get_array_module(x)) 165 | return Sub()(x, y) 166 | 167 | 168 | def rsub(x, y): 169 | y = as_array(y, get_array_module(x)) 170 | return Sub()(y, x) 171 | 172 | 173 | class Div(Function): 174 | def forward(self, x0: np.ndarray, x1: np.ndarray) -> tuple[np.ndarray]: 175 | self.x0_shape, self.x1_shape = x0.shape, x1.shape 176 | y = x0 / x1 177 | return y 178 | 179 | def backward(self, gy: np.ndarray) -> Union[tuple[Variable, ...], Variable]: 180 | x0, x1 = self.inputs 181 | gx0 = gy / x1 182 | gx1 = gy * (-x0 / x1 ** 2) 183 | if self.x0_shape != self.x1_shape: 184 | gx0 = sum_to(gx0, self.x0_shape) 185 | gx1 = sum_to(gx1, self.x1_shape) 186 | return gx0, gx1 187 | 188 | 189 | def div(x, y): 190 | y = as_array(y, get_array_module(x)) 191 | return Div()(x, y) 192 | 193 | 194 | def rdiv(x, y): 195 | y = as_array(y, get_array_module(x)) 196 | return Div()(y, x) 197 | 198 | 199 | class Pow(Function): 200 | def __init__(self, c): 201 | self.c = c 202 | 203 | def forward(self, x: np.ndarray) -> tuple[np.ndarray]: 204 | y = x ** self.c 205 | return y 206 | 207 | def backward(self, gy: np.ndarray) -> Union[tuple[Variable, ...], Variable]: 208 | x = self.inputs[0] 209 | gx = self.c * x ** (self.c - 1) * gy 210 | return gx 211 | 212 | 213 | def pow(x, c): 214 | return Pow(c)(x) 215 | 216 | 217 | class Sin(Function): 218 | def forward(self, x: np.ndarray) -> tuple[np.ndarray]: 219 | xp = get_array_module(x) 220 | y = xp.sin(x) 221 | return y 222 | 223 | def backward(self, gy: np.ndarray) -> Union[tuple[Variable, ...], Variable]: 224 | x = self.inputs[0] 225 | gx = gy * cos(x) 226 | return gx 227 | 228 | 229 | def sin(x): 230 | return Sin()(x) 231 | 232 | 233 | class Cos(Function): 234 | def forward(self, x: np.ndarray) -> tuple[np.ndarray]: 235 | xp = get_array_module(x) 236 | y = xp.cos(x) 237 | return y 238 | 239 | def backward(self, gy: np.ndarray) -> Union[tuple[Variable, ...], Variable]: 240 | x = self.inputs[0] 241 | gx = gy * -sin(x) 242 | return gx 243 | 244 | 245 | def cos(x): 246 | return Cos()(x) 247 | 248 | 249 | class Tanh(Function): 250 | def forward(self, x: np.ndarray) -> tuple[np.ndarray]: 251 | xp = get_array_module(x) 252 | y = xp.tanh(x) 253 | return y 254 | 255 | def backward(self, gy: np.ndarray) -> Union[tuple[Variable, ...], Variable]: 256 | y = self.outputs[0]() 257 | gx = gy * (1 - y * y) 258 | return gx 259 | 260 | 261 | def tanh(x): 262 | return Tanh()(x) 263 | 264 | 265 | class Reshape(Function): 266 | def __init__(self, shape): 267 | self.shape = shape 268 | 269 | def forward(self, x: np.ndarray) -> tuple[np.ndarray]: 270 | self.x_shape = x.shape 271 | y = x.reshape(self.shape) 272 | return y 273 | 274 | def backward(self, gy: np.ndarray) -> Union[tuple[Variable, ...], Variable]: 275 | return reshape(gy, self.x_shape) 276 | 277 | 278 | def reshape(x, shape): 279 | if x.shape == shape: 280 | return as_variable(x) 281 | return Reshape(shape)(x) 282 | 283 | 284 | # class Transpose(Function): 285 | # def forward(self, x: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]: 286 | # y = np.transpose(x) 287 | # return y 288 | # 289 | # def backward(self, gy: np.ndarray) -> Union[tuple[Variable, ...], Variable]: 290 | # return transpose(gy) 291 | # 292 | # 293 | # def transpose(x): 294 | # return Transpose()(x) 295 | 296 | class Transpose(Function): 297 | def 
__init__(self, axes=None): 298 | self.axes = axes 299 | 300 | def forward(self, x): 301 | y = x.transpose(self.axes) 302 | return y 303 | 304 | def backward(self, gy): 305 | if self.axes is None: 306 | return transpose(gy) 307 | 308 | axes_len = len(self.axes) 309 | inv_axes = tuple(np.argsort([ax % axes_len for ax in self.axes])) 310 | return transpose(gy, inv_axes) 311 | 312 | 313 | def transpose(x, axes=None): 314 | return Transpose(axes)(x) 315 | 316 | 317 | class Sum(Function): 318 | def __init__(self, axis: Union[tuple[int, ...], int, None], keepdims: bool): 319 | self.axis = axis 320 | self.keepdims = keepdims 321 | 322 | def forward(self, x: np.ndarray) -> tuple[np.ndarray]: 323 | self.x_shape = x.shape 324 | y = x.sum(axis=self.axis, keepdims=self.keepdims) 325 | return y 326 | 327 | def backward(self, gy: np.ndarray) -> Union[tuple[Variable, ...], Variable]: 328 | gy = reshape_sum_backward(gy, self.x_shape, self.axis, self.keepdims) 329 | gx = broadcast_to(gy, self.x_shape) 330 | return gx 331 | 332 | 333 | def sum(x, axis=None, keepdims=False): 334 | return Sum(axis=axis, keepdims=keepdims)(x) 335 | 336 | 337 | class BroadcastTo(Function): 338 | def __init__(self, shape): 339 | self.shape = shape 340 | 341 | def forward(self, x: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]: 342 | self.x_shape = x.shape 343 | xp = get_array_module(x) 344 | y = xp.broadcast_to(x, self.shape) 345 | return y 346 | 347 | def backward(self, gy: np.ndarray) -> Union[tuple[Variable, ...], Variable]: 348 | gx = sum_to(gy, self.x_shape) 349 | return gx 350 | 351 | 352 | def broadcast_to(x, shape): 353 | if x.shape == shape: 354 | return as_variable(x) 355 | return BroadcastTo(shape)(x) 356 | 357 | 358 | class SumTo(Function): 359 | def __init__(self, shape): 360 | self.shape = shape 361 | 362 | def forward(self, x: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]: 363 | self.x_shape = x.shape 364 | y = raw_sum_to(x, self.shape) 365 | return y 366 | 367 | def backward(self, gy: np.ndarray) -> Union[tuple[Variable, ...], Variable]: 368 | gx = broadcast_to(gy, self.x_shape) 369 | return gx 370 | 371 | 372 | def sum_to(x, shape): 373 | if x.shape == shape: 374 | return as_variable(x) 375 | return SumTo(shape)(x) 376 | 377 | 378 | class MatMul(Function): 379 | def forward(self, x: np.ndarray, W: np.ndarray) -> tuple[np.ndarray]: 380 | if x.ndim <= 2 and W.ndim <= 2: 381 | y = x.dot(W) 382 | else: 383 | y = x @ W 384 | return y 385 | 386 | def backward(self, gy: np.ndarray) -> Union[tuple[Variable, ...], Variable]: 387 | x, W = self.inputs 388 | gx = matmul(gy, W.transpose(([i for i in range(W.ndim - 2)] + [-1, -2]))) 389 | gW = matmul(x.transpose(([i for i in range(x.ndim - 2)] + [-1, -2])), gy) 390 | return gx, gW 391 | 392 | 393 | def matmul(x, W): 394 | return MatMul()(x, W) 395 | 396 | 397 | class MeanSquaredError(Function): 398 | def forward(self, x0: np.ndarray, x1: np.ndarray) -> np.ndarray: 399 | diff = x0 - x1 400 | y = (diff ** 2).sum() / len(diff) 401 | return y 402 | 403 | def backward(self, gy: np.ndarray) -> Union[tuple[Variable, ...], Variable]: 404 | x0, x1 = self.inputs 405 | diff: Variable = x0 - x1 406 | gx0: Variable = gy * diff * (2. 
/ len(diff)) 407 | gx1: Variable = -gx0 408 | return gx0, gx1 409 | 410 | 411 | def mean_squared_error(x, y): 412 | return MeanSquaredError()(x, y) 413 | 414 | 415 | class Linear(Function): 416 | def forward(self, x: np.ndarray, W: np.ndarray, b: np.ndarray) -> tuple[np.ndarray]: 417 | y = x.dot(W) 418 | if b is not None: 419 | y += b 420 | 421 | return y 422 | 423 | def backward(self, gy: np.ndarray) -> Union[tuple[Variable, ...], Variable]: 424 | x, W, b = self.inputs 425 | gb = None if b.data is None else sum_to(gy, b.shape) 426 | gx = matmul(gy, W.transpose(([i for i in range(W.ndim - 2)] + [-1, -2]))) 427 | gW = matmul(x.transpose(([i for i in range(x.ndim - 2)] + [-1, -2])), gy) 428 | return gx, gW, gb 429 | 430 | 431 | def linear(x, W, b=None): 432 | return Linear()(x, W, b) 433 | 434 | 435 | class Sigmoid(Function): 436 | def forward(self, x: np.ndarray) -> tuple[np.ndarray]: 437 | # y = 1 / (1 + exp(-x)) 438 | xp = get_array_module(x) 439 | y = xp.tanh(x * 0.5) * 0.5 + 0.5 440 | return y 441 | 442 | def backward(self, gy: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]: 443 | y = self.outputs[0]() 444 | gx = gy * y * (1 - y) 445 | return gx 446 | 447 | 448 | def sigmoid(x): 449 | return Sigmoid()(x) 450 | 451 | 452 | class GetItem(Function): 453 | def __init__(self, slices): 454 | self.slices = slices 455 | 456 | def forward(self, x: np.ndarray) -> tuple[np.ndarray]: 457 | y = x[self.slices] 458 | return y 459 | 460 | def backward(self, gy: np.ndarray) -> Union[tuple[Variable, ...], Variable]: 461 | x = self.inputs[0] 462 | f = GetItemGrad(self.slices, x.shape) 463 | return f(gy) 464 | 465 | 466 | def get_item(x, slices): 467 | return GetItem(slices)(x) 468 | 469 | 470 | class GetItemGrad(Function): 471 | def __init__(self, slices, in_shape): 472 | self.slices = slices 473 | self.in_shape = in_shape 474 | 475 | def forward(self, gy: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]: 476 | xp = get_array_module(gy) 477 | gx = xp.zeros(self.in_shape) 478 | if xp is np: 479 | np.add.at(gx, self.slices, gy) 480 | else: 481 | xp.scatter_add(gx, self.slices, gy) 482 | return gx 483 | 484 | def backward(self, ggx: np.ndarray) -> Union[tuple[Variable, ...], Variable]: 485 | return get_item(ggx, self.slices) 486 | 487 | 488 | class Softmax(Function): 489 | def __init__(self, axis=1): 490 | self.axis = axis 491 | 492 | def forward(self, x: np.ndarray) -> tuple[np.ndarray]: 493 | xp = get_array_module(x) 494 | y = x - x.max(axis=self.axis, keepdims=True) 495 | y = xp.exp(y) 496 | y /= y.sum(axis=self.axis, keepdims=True) 497 | return y 498 | 499 | def backward(self, gy: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]: 500 | y = self.outputs[0]() 501 | gx = y * gy 502 | sumdx = gx.sum(axis=self.axis, keepdims=True) 503 | gx -= y * sumdx 504 | return gx 505 | 506 | 507 | def softmax(x, axis=1): 508 | return Softmax(axis=axis)(x) 509 | 510 | 511 | class Cat(Function): 512 | def __init__(self, axis: int = 0): 513 | self.axis = axis 514 | 515 | def forward(self, *xs: np.ndarray) -> np.ndarray: 516 | xp = get_array_module(xs[0]) 517 | z = xp.concatenate(xs, axis=self.axis) 518 | return z 519 | 520 | def backward(self, gy: Variable) -> Union[tuple[Variable, ...], Variable]: 521 | inputs = self.inputs 522 | gx = [] 523 | start_idx = 0 524 | for x in inputs: 525 | end_idx = start_idx + x.shape[self.axis] 526 | indices = [slice(None)] * gy.ndim 527 | indices[self.axis] = slice(start_idx, end_idx) 528 | gx.append(gy[tuple(indices)]) 529 | start_idx = end_idx 530 | 531 | return 
tuple(gx) 532 | 533 | 534 | def cat(inputs, axis=0): 535 | return Cat(axis=axis)(*inputs) 536 | 537 | 538 | class Clip(Function): 539 | def __init__(self, x_min, x_max): 540 | self.x_min = x_min 541 | self.x_max = x_max 542 | 543 | def forward(self, x: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]: 544 | xp = get_array_module(x) 545 | y = xp.clip(x, self.x_min, self.x_max) 546 | return y 547 | 548 | def backward(self, gy: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]: 549 | x = self.inputs[0] 550 | mask = (x.data >= self.x_min) * (x.data <= self.x_max) 551 | gx = gy * mask 552 | return gx 553 | 554 | 555 | def clip(x, x_min, x_max): 556 | return Clip(x_min, x_max)(x) 557 | 558 | 559 | def softmax_cross_entropy_simple(x, t): 560 | x, t = as_variable(x), as_variable(t) 561 | N = x.shape[0] 562 | 563 | p = softmax(x) 564 | p = clip(p, 1e-15, 1.0) 565 | log_p = log(p) 566 | tlog_p = log_p[np.arange(N), t.data] 567 | y = -1 * sum(tlog_p) / N 568 | return y 569 | 570 | 571 | def accuracy(y, t): 572 | y, t = as_variable(y), as_variable(t) 573 | 574 | pred = y.data.argmax(axis=1).reshape(t.shape) 575 | result = (pred == t.data) 576 | acc = result.mean() 577 | 578 | return Variable(as_array(acc)) 579 | 580 | 581 | def dropout(x, dropout_ratio=0.5): 582 | x = as_variable(x) 583 | 584 | if Config.train: 585 | xp = get_array_module(x) 586 | mask = xp.random.rand(*x.shape) > dropout_ratio 587 | scale = xp.array(1.0 - dropout_ratio).astype(x.dtype) 588 | y = x * mask / scale 589 | return y 590 | else: 591 | return x 592 | 593 | 594 | class Stack(Function): 595 | def __init__(self, axis: int = 0): 596 | self.axis = axis 597 | 598 | def forward(self, *xs: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]: 599 | xp = get_array_module(xs[0]) 600 | self.x_shape = xs[0].shape 601 | self.x_num = len(xs) 602 | y = xp.stack(xs, axis=self.axis) 603 | return y 604 | 605 | def backward(self, gy: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]: 606 | gx = [] 607 | for i in range(self.x_num): 608 | indices = [slice(None)] * gy.ndim 609 | indices[self.axis] = slice(i, i + 1) 610 | gx.append(gy[tuple(indices)].reshape(self.x_shape)) 611 | return tuple(gx) 612 | 613 | 614 | def stack(inputs, axis=0): 615 | return Stack(axis=axis)(*inputs) 616 | 617 | 618 | if __name__ == '__main__': 619 | # def goldstein(x, y): 620 | # z = (1 + (x + y + 1) ** 2 * (19 - 14 * x + 3 * x ** 2 - 14 * y + 6 * x * y + 3 * y ** 2)) * \ 621 | # (30 + (2 * x - 3 * y) ** 2 * (18 - 32 * x + 12 * x ** 2 + 48 * y - 36 * x * y + 27 * y ** 2)) 622 | # return z 623 | # 624 | # 625 | # x = Variable(np.array(1.0)) 626 | # y = Variable(np.array(1.0)) 627 | # z = goldstein(x, y) 628 | # z.backward() 629 | # print(z) 630 | # print(x.grad, y.grad) 631 | # 632 | # plot_dot_graph(z, verbose=False) 633 | 634 | # x = Variable(np.array(1.0)) 635 | # y = tanh(x) 636 | # x.name = 'x' 637 | # y.name = 'y' 638 | # y.backward(create_graph=True) 639 | # iters = 6 640 | # 641 | # for i in range(iters): 642 | # gx = x.grad 643 | # x.cleargrad() 644 | # gx.backward(create_graph=True) 645 | # 646 | # gx = x.grad 647 | # gx.name = 'gx' + str(iters + 1) 648 | # plot_dot_graph(gx, verbose=False) 649 | 650 | # np.random.seed(0) 651 | # x = np.random.rand(100, 1) 652 | # y = 5 + 2 * x + np.random.rand(100, 1) 653 | # 654 | # W = Variable(np.zeros((1, 1))) 655 | # b = Variable(np.zeros(1)) 656 | # 657 | # 658 | # def predict(x): 659 | # y = matmul(x, W) + b 660 | # return y 661 | # 662 | # 663 | # def mean_squared_error(x0, x1): 664 | # diff 
= x0 - x1 665 | # return sum(diff ** 2) / len(diff) 666 | # 667 | # 668 | # lr = 0.1 669 | # iters = 100 670 | # 671 | # for i in range(iters): 672 | # y_pred = predict(x) 673 | # loss = mean_squared_error(y, y_pred) 674 | # W.cleargrad() 675 | # b.cleargrad() 676 | # loss.backward() 677 | # W.data -= lr * W.grad.data 678 | # b.data -= lr * b.grad.data 679 | # print(W, b, loss) 680 | # a = Variable(np.array([i for i in range(20)]).reshape(4, 5)) 681 | # b = Variable(np.array([i for i in range(20, 40)]).reshape(4, 5)) 682 | # c = Variable(np.array([i for i in range(40, 60)]).reshape(4, 5)) 683 | # 684 | # aaa = 2 685 | # raw_shape = a.shape 686 | # print(raw_shape[:aaa] + (1,) + raw_shape[aaa:]) 687 | # 688 | # a1 = a.reshape(raw_shape[:aaa] + (1,) + raw_shape[aaa:]) 689 | # b1 = b.reshape(raw_shape[:aaa] + (1,) + raw_shape[aaa:]) 690 | # c1 = c.reshape(raw_shape[:aaa] + (1,) + raw_shape[aaa:]) 691 | # 692 | # d = cat((a1, b1, c1), axis=aaa) 693 | # 694 | # d1 = d[1:, :, :] * 3 695 | # d2 = d[:1, :, :] * 5 696 | # 697 | # dd = cat((d1, d2), axis=0) 698 | # 699 | # dd.backward() 700 | # print(dd) 701 | # print(dd.shape) 702 | # print(a.grad) 703 | # print(b.grad) 704 | # print(c.grad) 705 | # 706 | # a2 = Variable(np.array([i for i in range(20)]).reshape(4, 5)) 707 | # b2 = Variable(np.array([i for i in range(20, 40)]).reshape(4, 5)) 708 | # c2 = Variable(np.array([i for i in range(40, 60)]).reshape(4, 5)) 709 | # 710 | # dd2 = Stack(axis=aaa)(a2, b2, c2) 711 | # 712 | # d11 = dd2[1:, :, :] * 3 713 | # d22 = dd2[:1, :, :] * 5 714 | # 715 | # ddd = cat((d11, d22), axis=0) 716 | # ddd.backward() 717 | # 718 | # print(ddd) 719 | # print(a2.grad) 720 | # print(b2.grad) 721 | # print(c2.grad) 722 | 723 | pass 724 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import numpy as np 4 | import json 5 | from typing import Union, Optional, Callable, Literal 6 | from config import Config, no_grad, test_mode 7 | import optimizers 8 | from variable import Variable 9 | from optimizers import SGD 10 | from models import MLP, Model, Sequential 11 | from layers import Linear, Parameter, Layer, Embedding 12 | from functions import mean_squared_error, sigmoid, matmul, Function, cat, softmax, dropout, stack 13 | from datasets import Dataset 14 | from dataloaders import DataLoader 15 | from dataclasses import dataclass 16 | 17 | # 中文故事 https://github.com/chenyangMl/llama2.c-zh 18 | # 中文医疗 https://huggingface.co/datasets/shibing624/medical 19 | 20 | 21 | import os 22 | 23 | 24 | # 仅设置一块可见 25 | # os.environ['CUDA_VISIBLE_DEVICES'] = '2' 26 | 27 | 28 | class Tokenizer: 29 | def __init__(self, model_path: str): 30 | with open(model_path, "r", encoding="utf-8") as f: 31 | model = json.load(f) 32 | self.vocab = model["tokens"] 33 | self.scores = model["scores"] 34 | self.pad_id = 0 35 | self.bos_id = 1 36 | self.eos_id = 2 37 | self.n_words = len(self.vocab) 38 | special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] 39 | self.special_tokens = {} 40 | self.index_special_tokens = {} 41 | for token in special_tokens: 42 | self.special_tokens[token] = self.n_words 43 | self.index_special_tokens[self.n_words] = token 44 | self.n_words += 1 45 | 46 | def str_lookup(self, token: str) -> int: 47 | try: 48 | index = self.vocab.index(token) 49 | return index 50 | except ValueError as err: 51 | return -1 52 | 53 | def encode( 54 | self, 55 | text: str, 56 | add_bos: bool = 
True, 57 | add_eos: bool = False, 58 | add_prefix: bool = True, 59 | add_new_bos: bool = False, 60 | ) -> list[int]: 61 | tokens = [] 62 | for pos, char in enumerate(text): 63 | id = self.str_lookup(char) 64 | if id >= 0: 65 | tokens.append(id) 66 | else: 67 | tokens = tokens + [(i + 3) for i in char.encode()] 68 | while True: 69 | best_score = -1e10 70 | best_id = -1 71 | best_idx = -1 72 | 73 | for i in range(len(tokens) - 1): 74 | # Check if we can merge the pair (tokens[i], tokens[i+1]) 75 | string = self.vocab[tokens[i]] + self.vocab[tokens[i + 1]] 76 | id = self.str_lookup(string) 77 | if id != -1 and self.scores[id] > best_score: 78 | best_score = self.scores[id] 79 | best_id = id 80 | best_idx = i 81 | 82 | if best_idx == -1: 83 | break 84 | 85 | # Merge the consecutive pair (best_idx, best_idx+1) into new token best_id 86 | tokens[best_idx] = best_id 87 | # Delete token at position best_idx+1, shift the entire sequence back 1 88 | tokens = tokens[0: best_idx + 1] + tokens[best_idx + 2:] 89 | if add_bos: 90 | tokens.insert(0, self.bos_id) 91 | if add_eos: 92 | tokens.append(self.eos_id) 93 | if add_prefix: 94 | tokens.insert(0, self.special_tokens['sop']) 95 | tokens.insert(0, self.special_tokens['[gMASK]']) 96 | if add_new_bos: 97 | tokens.append(self.bos_id) 98 | return tokens 99 | 100 | def decode(self, ids: list[int]) -> str: 101 | res = [] 102 | for i in ids: 103 | token = self.vocab[i] 104 | res.append(token) 105 | text = "".join(res) 106 | text = text.strip("").strip("") 107 | return text 108 | 109 | 110 | class SelfAttention(Model): 111 | def __init__(self, 112 | args: 'LLaMaArgs', 113 | rope_apply: Callable): 114 | super(SelfAttention, self).__init__() 115 | 116 | assert args.num_heads * args.head_dim == args.hidden_size 117 | assert args.num_heads % args.num_key_value_heads == 0 118 | assert args.head_dim % 2 == 0 119 | 120 | self.max_len = args.max_len 121 | self.max_batch_size = args.max_batch_size 122 | self.enable_kv_cache = args.enable_kv_cache 123 | self.use_gpu = args.use_gpu 124 | 125 | self.hidden_size = args.hidden_size 126 | self.num_heads = args.num_heads 127 | self.head_dim = args.head_dim 128 | self.num_key_value_heads = args.num_key_value_heads 129 | self.attention_bias = args.attention_bias 130 | self.dropout_ratio = args.dropout_ratio 131 | 132 | self.dropout_on = args.dropout_ratio != 0 133 | self.kv_repeat_num = self.num_heads // self.num_key_value_heads 134 | 135 | self.rope_apply = rope_apply 136 | 137 | self.q_proj = Linear(in_size=self.hidden_size, out_size=self.num_heads * self.head_dim, 138 | nobias=~self.attention_bias) 139 | 140 | self.k_proj = Linear(in_size=self.hidden_size, out_size=self.num_key_value_heads * self.head_dim, 141 | nobias=~self.attention_bias) 142 | 143 | self.v_proj = Linear(in_size=self.hidden_size, out_size=self.num_key_value_heads * self.head_dim, 144 | nobias=~self.attention_bias) 145 | 146 | self.o_proj = Linear(in_size=self.hidden_size, out_size=self.hidden_size, nobias=~self.attention_bias) 147 | 148 | if self.enable_kv_cache: 149 | self.k_cache = Variable(np.zeros([self.max_batch_size, self.num_key_value_heads, 0, self.head_dim])) 150 | self.v_cache = Variable(np.zeros([self.max_batch_size, self.num_key_value_heads, 0, self.head_dim])) 151 | if self.use_gpu: 152 | self.k_cache.to_gpu() 153 | self.v_cache.to_gpu() 154 | 155 | def forward(self, x, cos_pos, sin_pos): 156 | batch_size = x.shape[0] 157 | length = x.shape[1] 158 | # embed_dim = x.shape[2] 159 | 160 | q = self.q_proj(x) 161 | k = self.k_proj(x) 162 | v = 
self.v_proj(x) 163 | # [batch_size, length, hidden_size] 164 | 165 | q = q.reshape(batch_size, length, self.num_heads, self.head_dim).transpose(0, 2, 1, 3) 166 | k = k.reshape(batch_size, length, self.num_key_value_heads, self.head_dim).transpose(0, 2, 1, 3) 167 | v = v.reshape(batch_size, length, self.num_key_value_heads, self.head_dim).transpose(0, 2, 1, 3) 168 | # [batch_size, length, num_heads, head_dim] 169 | # [batch_size, num_heads, length, head_dim] 170 | 171 | # q,k rope finish 172 | # q = apply_RoPE(q, cos_pos, sin_pos) 173 | # k = apply_RoPE(k, cos_pos, sin_pos) 174 | q = self.rope_apply(q, cos_pos, sin_pos) 175 | k = self.rope_apply(k, cos_pos, sin_pos) 176 | 177 | if self.enable_kv_cache: 178 | start_pos = self.k_cache.shape[2] 179 | else: 180 | start_pos = 0 181 | 182 | if self.enable_kv_cache: 183 | self.k_cache = cat((self.k_cache, k), axis=2) 184 | self.v_cache = cat((self.v_cache, v), axis=2) 185 | k = self.k_cache 186 | v = self.v_cache 187 | 188 | # print(k[0, 0]) 189 | # print(v[0, 0]) 190 | 191 | # 相乘之前若是kv头数不一样还需要重复 num_heads % num_key_value_heads 192 | if self.num_heads != self.num_key_value_heads: 193 | k = k[:, np.arange(self.num_key_value_heads).repeat(self.kv_repeat_num), :, :] 194 | v = v[:, np.arange(self.num_key_value_heads).repeat(self.kv_repeat_num), :, :] 195 | 196 | attention_weight = matmul(q, k.transpose(0, 1, 3, 2)) / np.sqrt(self.head_dim) 197 | 198 | mask = np.full((length, length), -np.inf) 199 | mask = np.triu(mask, k=1) 200 | mask = np.concatenate((np.zeros((length, start_pos)), mask), axis=1) 201 | 202 | if self.use_gpu: 203 | from cuda import as_cupy 204 | mask = as_cupy(mask) 205 | 206 | attention_weight = attention_weight + mask 207 | 208 | attention_weight = softmax(attention_weight, axis=-1) 209 | 210 | if self.dropout_on: 211 | attention_weight = dropout(attention_weight, self.dropout_ratio) 212 | 213 | output = matmul(attention_weight, v) # (bzs, num_heads, length, head_dim) 214 | output = output.transpose(0, 2, 1, 3).reshape(batch_size, length, self.hidden_size) 215 | # (bzs, length, embed_dim) 216 | output = self.o_proj(output) 217 | 218 | return output 219 | 220 | 221 | class SiLU(Function): 222 | def forward(self, x: np.ndarray) -> tuple[np.ndarray]: 223 | self.sigmoid = 1 / (1 + np.exp(-x)) 224 | y = x * self.sigmoid 225 | return y 226 | 227 | def backward(self, gy: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]: 228 | y = self.outputs[0]() 229 | gx = gy * (y + self.sigmoid * (1 - y)) 230 | # y'=(xs)'=s+xs(1-s)=s+xs-xss=xs+s(1-xs)=y+s(1-y) 231 | return gx 232 | 233 | 234 | def silu(x): 235 | return SiLU()(x) 236 | 237 | 238 | class SwiGLUFeedForwardNetwork(Model): 239 | def __init__(self, hidden_size: int, intermediate_size: int, use_bias: bool = False): 240 | super(SwiGLUFeedForwardNetwork, self).__init__() 241 | self.fc_gate = Linear(in_size=hidden_size, out_size=intermediate_size, nobias=~use_bias) 242 | self.fc_up = Linear(in_size=hidden_size, out_size=intermediate_size, nobias=~use_bias) 243 | self.fc_down = Linear(in_size=intermediate_size, out_size=hidden_size, nobias=~use_bias) 244 | 245 | def forward(self, x): 246 | x1 = self.fc_up(x) 247 | x = silu(self.fc_gate(x)) 248 | x = x * x1 249 | x = self.fc_down(x) 250 | return x 251 | 252 | 253 | # class RMSNorm(Layer): 254 | # def __init__(self, hidden_size: int, eps: float = 1e-6): 255 | # super(RMSNorm, self).__init__() 256 | # self.weight = Parameter(np.ones(hidden_size), 'weight') 257 | # self.epsilon = eps 258 | # 259 | # def forward(self, x): 260 | # x_shape = 
x.shape 261 | # x = x * ((x ** 2).sum(axis=x.ndim - 1) / x_shape[-1] + self.epsilon).reshape(*(x_shape[:-1] + (1,))) ** (-1 / 2) 262 | # x = self.weight * x 263 | # return x 264 | 265 | class RoPELlama: 266 | def __init__(self, 267 | max_len: int, 268 | output_dim: int, 269 | rope_theta: float = 10000.0): 270 | self.max_len = max_len 271 | self.output_dim = output_dim 272 | self.rope_theta = rope_theta 273 | 274 | def apply(q: Variable, cos_pos: np.ndarray, sin_pos: np.ndarray): 275 | q2 = stack((-q[..., 1::2], q[..., ::2]), axis=-1) 276 | q2 = q2.reshape(q.shape) 277 | q = q * cos_pos + q2 * sin_pos 278 | return q 279 | 280 | self.apply = apply 281 | 282 | def get_cos_sin(self): 283 | position = np.arange(0, self.max_len, dtype=np.float32)[..., np.newaxis] 284 | ids = np.arange(0, self.output_dim // 2, dtype=np.float32) 285 | theta = self.rope_theta ** (-2 * ids / self.output_dim) 286 | embeddings = position * theta 287 | # (max_len, output_dim//2, 2) 288 | embeddings = np.stack([np.sin(embeddings), np.cos(embeddings)], axis=-1) 289 | # (bs, head, max_len, output_dim//2, 2) 290 | embeddings = np.tile(embeddings, 291 | (1, 1, *([1] * len(embeddings.shape)))) # 在bs维度重复,其他维度都是1不重复 292 | # (bs, head, max_len, output_dim) 293 | # reshape后就是:偶数sin, 奇数cos了 294 | embeddings = np.reshape(embeddings, (1, 1, self.max_len, self.output_dim)) 295 | cos_pos = embeddings[..., 1::2].repeat(2, axis=-1) # 将奇数列信息抽取出来也就是cos 拿出来并复制 296 | sin_pos = embeddings[..., ::2].repeat(2, axis=-1) # 将偶数列信息抽取出来也就是sin 拿出来并复制 297 | return cos_pos, sin_pos 298 | 299 | 300 | class RoPEHF: 301 | def __init__(self, 302 | max_len: int, 303 | output_dim: int, 304 | rope_theta: float = 500000.0): 305 | self.max_len = max_len 306 | self.output_dim = output_dim 307 | self.rope_theta = rope_theta 308 | 309 | def apply(q: Variable, cos_pos: np.ndarray, sin_pos: np.ndarray): 310 | q2 = cat((-q[..., q.shape[-1] // 2:], q[..., : q.shape[-1] // 2]), axis=-1) 311 | q = q * cos_pos + q2 * sin_pos 312 | return q 313 | 314 | self.apply = apply 315 | 316 | def get_cos_sin(self): 317 | # HF 318 | position = np.arange(0, self.max_len, dtype=np.float32)[..., np.newaxis] 319 | ids = np.arange(0, self.output_dim // 2, dtype=np.float32) 320 | theta = self.rope_theta ** (-2 * ids / self.output_dim) 321 | embeddings = position * theta 322 | embeddings = np.concatenate((embeddings, embeddings), axis=-1)[np.newaxis, np.newaxis, :, :] 323 | cos_pos = np.cos(embeddings) 324 | sin_pos = np.sin(embeddings) 325 | return cos_pos, sin_pos 326 | 327 | 328 | # def sinusoidal_position_embedding(batch_size: int, 329 | # nums_head: int, 330 | # max_len: int, 331 | # output_dim: int, 332 | # rope_theta: float = 10000.0): 333 | # # (max_len, 1) 334 | # position = np.arange(0, max_len, dtype=np.float32)[..., np.newaxis] 335 | # # (output_dim//2) 336 | # ids = np.arange(0, output_dim // 2, dtype=np.float32) # 即公式里的i, i的范围是 [0,d/2] 337 | # theta = rope_theta ** (-2 * ids / output_dim) 338 | # 339 | # # (max_len, output_dim//2) 340 | # embeddings = position * theta # 即公式里的:pos / (10000^(2i/d)) 341 | # 342 | # # (max_len, output_dim//2, 2) 343 | # embeddings = np.stack([np.sin(embeddings), np.cos(embeddings)], axis=-1) 344 | # 345 | # # (bs, head, max_len, output_dim//2, 2) 346 | # 347 | # embeddings = np.tile(embeddings, (batch_size, nums_head, *([1] * len(embeddings.shape)))) # 在bs维度重复,其他维度都是1不重复 348 | # 349 | # # (bs, head, max_len, output_dim) 350 | # # reshape后就是:偶数sin, 奇数cos了 351 | # 352 | # embeddings = np.reshape(embeddings, (batch_size, nums_head, max_len, 
output_dim)) 353 | # return embeddings 354 | # 355 | # 356 | # def apply_RoPE(q: Variable, cos_pos: np.ndarray, sin_pos: np.ndarray): 357 | # q2 = stack((-q[..., 1::2], q[..., ::2]), axis=-1) 358 | # q2 = q2.reshape(q.shape) # reshape后就是正负交替了 359 | # q = q * cos_pos + q2 * sin_pos 360 | # return q 361 | 362 | 363 | class TransformerDecoderBlock(Model): 364 | def __init__(self, 365 | args: 'LLaMaArgs', 366 | rope_apply: Callable): 367 | super(TransformerDecoderBlock, self).__init__() 368 | 369 | self.max_len = args.max_len 370 | self.max_batch_size = args.max_batch_size 371 | self.enable_kv_cache = args.enable_kv_cache 372 | 373 | self.hidden_size = args.hidden_size 374 | 375 | self.num_heads = args.num_heads 376 | self.head_dim = args.head_dim 377 | self.num_key_value_heads = args.num_key_value_heads 378 | self.attention_bias = args.attention_bias 379 | self.dropout_ratio = args.dropout_ratio 380 | 381 | self.ffn_intermediate_size = args.ffn_intermediate_size 382 | self.ffn_bias = args.ffn_bias 383 | 384 | self.rms_eps = args.rms_eps 385 | 386 | self.multi_head_self_attention = SelfAttention(args=args, rope_apply=rope_apply) 387 | self.ffn = SwiGLUFeedForwardNetwork(hidden_size=self.hidden_size, intermediate_size=self.ffn_intermediate_size, 388 | use_bias=self.ffn_bias) 389 | 390 | self.rms_norm_1 = RMSNorm(hidden_size=self.hidden_size, eps=self.rms_eps) 391 | self.rms_norm_2 = RMSNorm(hidden_size=self.hidden_size, eps=self.rms_eps) 392 | 393 | def forward(self, x, cos_pos, sin_pos): 394 | x = self.multi_head_self_attention(self.rms_norm_1(x), cos_pos, sin_pos) + x 395 | x = self.ffn(self.rms_norm_2(x)) + x 396 | return x 397 | 398 | 399 | class LLaMa(Model): 400 | def __init__(self, 401 | args: 'LLaMaArgs'): 402 | super(LLaMa, self).__init__() 403 | 404 | self.max_len = args.max_len 405 | self.max_batch_size = args.max_batch_size 406 | self.enable_kv_cache = args.enable_kv_cache 407 | self.use_gpu = args.use_gpu 408 | 409 | self.vocab_size = args.vocab_size 410 | self.num_layers = args.num_layers 411 | self.hidden_size = args.hidden_size 412 | 413 | self.num_heads = args.num_heads 414 | self.head_dim = args.head_dim 415 | self.num_key_value_heads = args.num_key_value_heads 416 | self.attention_bias = args.attention_bias 417 | self.rope_theta = args.rope_theta 418 | self.dropout_ratio = args.dropout_ratio 419 | 420 | self.ffn_intermediate_size = args.ffn_intermediate_size 421 | self.ffn_bias = args.ffn_bias 422 | 423 | self.rms_eps = args.rms_eps 424 | 425 | self.embedding = Embedding(in_size=self.vocab_size, out_size=self.hidden_size) 426 | 427 | self.rope_type = args.rope_type 428 | if self.rope_type == 'Llama': 429 | self.rope = RoPELlama(max_len=self.max_len, 430 | output_dim=self.head_dim, 431 | rope_theta=self.rope_theta) 432 | else: 433 | self.rope = RoPEHF(max_len=self.max_len, 434 | output_dim=self.head_dim, 435 | rope_theta=self.rope_theta) 436 | 437 | self.transformers = Sequential(*[ 438 | TransformerDecoderBlock(args=args, rope_apply=self.rope.apply) for _ in range(self.num_layers)]) 439 | 440 | self.last_rms = RMSNorm(hidden_size=self.hidden_size, eps=self.rms_eps) 441 | self.linear = Linear(in_size=self.hidden_size, out_size=self.vocab_size, nobias=True) 442 | 443 | self.weight_share = args.weight_share 444 | 445 | if self.weight_share: 446 | self.linear.W = self.embedding.W.T 447 | 448 | self.cos_pos, self.sin_pos = self.rope.get_cos_sin() 449 | 450 | if self.use_gpu: 451 | from cuda import as_cupy 452 | self.cos_pos = as_cupy(self.cos_pos) 453 | self.sin_pos = 
as_cupy(self.sin_pos) 454 | 455 | def forward(self, x): 456 | if self.enable_kv_cache: 457 | start_pos = self.transformers.layers[0].multi_head_self_attention.k_cache.shape[2] 458 | else: 459 | start_pos = 0 460 | now_len = x.shape[1] 461 | if start_pos + now_len >= self.max_len: 462 | raise 'kv cache is full' 463 | x = self.embedding(x) 464 | for layer in self.transformers.layers: 465 | x = layer(x, self.cos_pos[:, :, start_pos:(start_pos + now_len), :], 466 | self.sin_pos[:, :, start_pos:(start_pos + now_len), :]) 467 | x = self.last_rms(x) 468 | x = self.linear(x[:, -1, :]) 469 | # return softmax(x, 2) 470 | return x 471 | 472 | def clean_kv_cache(self): 473 | if self.enable_kv_cache: 474 | for i in self.transformers.layers: 475 | if self.use_gpu: 476 | import cupy as cp 477 | i.multi_head_self_attention.k_cache = Variable( 478 | cp.zeros((self.max_batch_size, self.num_key_value_heads, 0, self.head_dim))) 479 | i.multi_head_self_attention.v_cache = Variable( 480 | cp.zeros((self.max_batch_size, self.num_key_value_heads, 0, self.head_dim))) 481 | else: 482 | i.multi_head_self_attention.k_cache = Variable( 483 | np.zeros([self.max_batch_size, self.num_key_value_heads, 0, self.head_dim])) 484 | i.multi_head_self_attention.v_cache = Variable( 485 | np.zeros([self.max_batch_size, self.num_key_value_heads, 0, self.head_dim])) 486 | print('kv cache cleaned') 487 | else: 488 | print('kv cache is not enabled') 489 | 490 | def generate(self, token: np.ndarray, max_gen: int, temperature: float, top_k: int, eos_id: int = 2): 491 | token_batch, token_len = token.shape 492 | assert token_batch == 1 493 | if token_len > self.max_len: 494 | token = token[:, (token_len - self.max_len):] 495 | token_len = self.max_len 496 | 497 | new_char = 0 498 | for i in range(max_gen): 499 | if self.enable_kv_cache: 500 | if i == 0: 501 | r = self(token) 502 | else: 503 | r = self(np.array([[new_char]])) 504 | else: 505 | r = self(token) 506 | r.to_cpu() 507 | if temperature == 0: 508 | new_char = np.argmax(r.data) 509 | new_char = int(new_char) 510 | else: 511 | new_r = r.data / temperature 512 | r_top_k = np.argsort(-new_r)[:, top_k] 513 | new_r[new_r < new_r[:, r_top_k]] = -np.inf 514 | probs = softmax(new_r).data.astype(np.float64) 515 | probs = probs / probs.sum() 516 | new_char = np.argmax(np.random.multinomial(n=1, pvals=probs[0])) 517 | new_char = int(new_char) 518 | 519 | token = np.concatenate((token, np.array([[new_char]])), axis=1) 520 | # print(tokenizer.decode([new_char]), end='') 521 | yield new_char 522 | if new_char == eos_id: 523 | break 524 | return token 525 | 526 | def chat(self, promote: str, tokenizer: Tokenizer, max_gen: int = 500, temperature: float = 1.0, top_k: int = 100, 527 | bos_id: int = 2): 528 | tokens = tokenizer.encode(promote, add_eos=False, add_new_bos=True, add_bos=False, add_prefix=False) 529 | # tokens = tokenizer.encode(promote, add_eos=False, add_new_bos=False, add_bos=True, add_prefix=False) 530 | tokens = np.array(tokens)[np.newaxis, ...] 
541 | class RMSNormFunction(Function):
542 |     def __init__(self, eps: float = 1e-6):
543 |         self.epsilon = eps
544 |
545 |     def forward(self, x: np.ndarray, w: np.ndarray) -> np.ndarray:
546 |         self.rms_inv = ((x ** 2).sum(axis=x.ndim - 1, keepdims=True) / x.shape[-1] + self.epsilon) ** (-1 / 2)  # 1 / sqrt(mean(x^2) + eps)
547 |         self.rms_x = x * self.rms_inv  # normalized input x_hat
548 |         y = self.rms_x * w
549 |         return y
550 |
551 |     def backward(self, gy: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]:
552 |         x, w = self.inputs  # forward computed y = w * x / rms(x)
553 |         gw = (gy * self.rms_x).sum(axis=tuple([i for i in range(x.ndim - 1)]))  # reduce over the axes w was broadcast along
554 |         gx = gy * w * self.rms_inv - x * (self.rms_inv ** 3) * (  # dL/dx = gy*w/rms - x * mean(gy*w*x) / rms^3
555 |                 (gy * w * x).sum(axis=x.ndim - 1, keepdims=True) / x.shape[-1])
556 |         return gx, gw
557 |
558 |
559 | def rms_norm(x, w, eps=1e-6):
560 |     return RMSNormFunction(eps=eps)(x, w)
561 |
562 |
563 | class RMSNorm(Layer):
564 |     def __init__(self, hidden_size: int, eps: float = 1e-6):
565 |         super(RMSNorm, self).__init__()
566 |         self.weight = Parameter(np.ones(hidden_size), 'weight')
567 |         self.epsilon = eps
568 |
569 |     def forward(self, x):
570 |         return rms_norm(x, self.weight, eps=self.epsilon)
571 |
572 |
573 | @dataclass
574 | class LLaMaArgs:
575 |     vocab_size: int = 64783
576 |     num_layers: int = 12
577 |     hidden_size: int = 1024
578 |     num_heads: int = 8
579 |     head_dim: int = 128
580 |     num_key_value_heads: int = 8
581 |     attention_bias: bool = False
582 |     weight_share: bool = True
583 |     rope_type: Literal['Llama', 'HF'] = 'Llama'
584 |     rope_theta: float = 10000.0
585 |     enable_kv_cache: bool = True
586 |     ffn_intermediate_size: int = 2752
587 |     ffn_bias: bool = False
588 |     max_len: int = 1024
589 |     rms_eps: float = 1e-5
590 |     dropout_ratio: float = 0.0
591 |     max_batch_size: int = 1
592 |     use_gpu: bool = True
593 |
594 |
595 | baby_llama_zh = LLaMaArgs(
596 |     vocab_size=64783,
597 |     num_layers=12,
598 |     hidden_size=1024,
599 |     num_heads=8,
600 |     head_dim=128,
601 |     num_key_value_heads=8,
602 |     attention_bias=False,
603 |     weight_share=True,
604 |     rope_type='Llama',
605 |     rope_theta=10000.0,
606 |     enable_kv_cache=True,
607 |     ffn_intermediate_size=2752,
608 |     ffn_bias=False,
609 |     max_len=1024,
610 |     rms_eps=1e-5,
611 |     dropout_ratio=0.0,
612 |     max_batch_size=1,
613 |     use_gpu=True,
614 | )
615 |
616 | atom_7b = LLaMaArgs(
617 |     vocab_size=65000,
618 |     num_layers=32,
619 |     hidden_size=4096,
620 |     num_heads=32,
621 |     head_dim=128,
622 |     num_key_value_heads=32,
623 |     attention_bias=False,
624 |     weight_share=False,
625 |     rope_type='HF',
626 |     rope_theta=500000.0,
627 |     enable_kv_cache=True,
628 |     ffn_intermediate_size=11008,
629 |     ffn_bias=False,
630 |     max_len=4096,
631 |     rms_eps=1e-5,
632 |     dropout_ratio=0.0,
633 |     max_batch_size=1,
634 |     use_gpu=True,
635 | )
636 |
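# Sanity arithmetic for the two configurations above (reading aid only, not used by the code):
#   baby_llama_zh: num_heads * head_dim = 8 * 128 = 1024 = hidden_size
#   atom_7b:       num_heads * head_dim = 32 * 128 = 4096 = hidden_size
# In both configs num_key_value_heads equals num_heads, i.e. plain multi-head attention
# rather than grouped-query attention, and the SwiGLU intermediate sizes (2752 and 11008)
# are roughly 8/3 of hidden_size, the usual LLaMA-style FFN sizing rounded up to a
# hardware-friendly multiple.
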
637 |
638 | class Timer:
639 |     def __init__(self, name: str):
640 |         self.name = name
641 |
642 |     def __enter__(self):
643 |         self.s = time.time()
644 |
645 |     def __exit__(self, exc_type, exc_val, exc_tb):
646 |         self.e = time.time()
647 |         print(f'{self.name} took {self.e - self.s:.2f} seconds')
648 |
649 |
650 | if __name__ == '__main__':
651 |     np.random.seed(114514)
652 |
653 |     model_dict_atom_7b = {
654 |         'args': atom_7b,
655 |         'weights_path': 'Atom7b.npz',
656 |         'tokenizer_path': 'tokenizer_atom7b.model.np',
657 |     }
658 |     model_dict_baby_llama_zh = {
659 |         'args': baby_llama_zh,
660 |         'weights_path': 'WEIGHTS.npz',
661 |         'tokenizer_path': 'tokenizer_chatglm2.model.np',
662 |     }
663 |
664 |     model_dict = model_dict_baby_llama_zh
665 |
666 |     with no_grad(), test_mode():
667 |         tokenizer = Tokenizer(model_path=model_dict['tokenizer_path'])
668 |
669 |         with Timer('model init'):  # 771
670 |             m = LLaMa(args=model_dict['args'])
671 |
672 |         with Timer('weights load'):  # 235
673 |             m.load_weights(model_dict['weights_path'])
674 |
675 |         if model_dict['args'].use_gpu:
676 |             with Timer('to gpu'):  # 6
677 |                 m.to_gpu()
678 |
679 |         # i = np.array([[1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007]])
680 |         # print(m(i))
681 |
682 |         # m1 = LLaMa(args=model_args)
683 |         # m1.load_weights('WEIGHTS.npz')
684 |         # m1.to_gpu()
685 |         # m2 = LLaMa(args=model_args)
686 |         # m2.load_weights('WEIGHTS.npz')
687 |         # m2.to_gpu()
688 |         # the_str = '写一篇700字以上的有关大语言模型的议论文'  # "Write an argumentative essay of at least 700 characters about large language models"
689 |         # print(the_str)
690 |         # for i in range(100):
691 |         #     r1 = m1.chat(the_str, tokenizer)
692 |         #     r2 = m1.chat(r1, tokenizer)
693 |         #     the_str = r2
694 |
695 |         # https://github.com/AI-Study-Han/Mini-Llama2-Chinese/tree/main
696 |
697 |         # the_str = '什么是大语言模型'  # "What is a large language model"
698 |         # print(the_str)
699 |         # m.chat(the_str, tokenizer)
700 |
701 |         for _ in range(100):
702 |             test_str = input()
703 |             if test_str == '\\clean':  # type \clean to reset the kv cache
704 |                 m.clean_kv_cache()
705 |                 continue
706 |             if test_str == '\\stop':  # type \stop to quit
707 |                 break
708 |             m.chat(test_str, tokenizer)
709 |
--------------------------------------------------------------------------------