├── requirements.txt
├── config.py
├── cuda.py
├── datasets.py
├── README.md
├── models.py
├── dataloaders.py
├── optimizers.py
├── tools.py
├── layers.py
├── variable.py
├── functions.py
└── main.py
/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/2ertwo/LLaMa3-Numpy-trainable/HEAD/requirements.txt
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | import contextlib
2 |
3 |
4 | class Config:
5 | enable_backprop: bool = True
6 | train: bool = True
7 |
8 |
9 | @contextlib.contextmanager
10 | def using_config(name, value):
11 | old_value = getattr(Config, name)
12 | setattr(Config, name, value)
13 | try:
14 | yield
15 | finally:
16 | setattr(Config, name, old_value)
17 |
18 |
19 | def no_grad():
20 | return using_config("enable_backprop", False)
21 |
22 |
23 | def test_mode():
24 | return using_config('train', False)
25 |
--------------------------------------------------------------------------------
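
A minimal usage sketch of the config switches above (hypothetical snippet, assuming the repository root is on PYTHONPATH):

```python
# Hypothetical usage of config.py's context managers; not part of the repository files.
from config import Config, no_grad, test_mode, using_config

print(Config.enable_backprop, Config.train)   # True True by default

with no_grad():                                # disable graph building, e.g. for inference
    assert Config.enable_backprop is False

with test_mode():                              # switch train-time behaviour (dropout) off
    assert Config.train is False

with using_config("enable_backprop", False):   # the generic form both helpers build on
    assert Config.enable_backprop is False

# the old values are restored on exit, even if an exception was raised inside the block
assert Config.enable_backprop is True and Config.train is True
```
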
/cuda.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from variable import Variable
3 |
4 | gpu_enable = True
5 | try:
6 | import cupy as cp
7 |
8 | cupy = cp
9 | except ImportError:
10 | gpu_enable = False
11 |
12 |
13 | def get_array_module(x):
14 | if isinstance(x, Variable):
15 | x = x.data
16 | if not gpu_enable:
17 | return np
18 | xp = cp.get_array_module(x)
19 | return xp
20 |
21 |
22 | def as_numpy(x):
23 | if isinstance(x, Variable):
24 | x = x.data
25 | if np.isscalar(x):
26 | return np.array(x)
27 | elif isinstance(x, np.ndarray):
28 | return x
29 | return cp.asnumpy(x)
30 |
31 |
32 | def as_cupy(x):
33 | if isinstance(x, Variable):
34 | x = x.data
35 | if not gpu_enable:
36 | raise Exception('CuPy is not installed')
37 | return cp.asarray(x)
38 |
--------------------------------------------------------------------------------
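
A short sketch of how these helpers support backend-agnostic code (CPU-only path shown; CuPy is optional):

```python
# Hypothetical example of writing backend-agnostic code with cuda.py; runs on CPU without CuPy.
import numpy as np
from cuda import get_array_module, as_numpy

x = np.linspace(0.0, 1.0, 5, dtype=np.float32)
xp = get_array_module(x)       # numpy here; cupy when CuPy is installed and x is a cupy array
y = xp.exp(x)                  # identical call for both backends
print(as_numpy(y))             # as_numpy always returns a host-side ndarray
```
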
/datasets.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class Dataset:
5 | def __init__(self, train: bool = True, transform=None, target_transform=None):
6 | self.train = train
7 | self.transform = transform
8 | self.target_transform = target_transform
9 | if self.transform is None:
10 | self.transform = lambda x: x
11 | if self.target_transform is None:
12 | self.target_transform = lambda x: x
13 | self.data = None
14 | self.label = None
15 | self.prepare()
16 |
17 | def __getitem__(self, item):
18 | assert np.isscalar(item)
19 | if self.label is None:
20 | return self.transform(self.data[item]), None
21 | else:
22 | return self.transform(self.data[item]), self.target_transform(self.label[item])
23 |
24 | def __len__(self):
25 | return len(self.data)
26 |
27 | def prepare(self):
28 | raise NotImplementedError()
29 |
--------------------------------------------------------------------------------
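
A toy subclass sketch showing the `prepare`/`transform` contract of `Dataset` (invented data, for illustration only):

```python
# Hypothetical Dataset subclass; the data and labels are made up for the example.
import numpy as np
from datasets import Dataset

class ToyDataset(Dataset):
    def prepare(self):
        self.data = np.random.rand(100, 4).astype(np.float32)
        self.label = (self.data.sum(axis=1) > 2.0).astype(np.int32)

ds = ToyDataset(transform=lambda x: x * 2.0)   # transform is applied per item in __getitem__
x0, t0 = ds[0]
print(len(ds), x0.shape, int(t0))
```
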
/README.md:
--------------------------------------------------------------------------------
1 | # LLaMa3-Numpy-trainable
2 | A trainable LLaMa3 reimplemented with NumPy
3 | ## Documentation
4 | For a better reading experience, please see the Feishu (Lark) document:
5 | https://aw8o2u3n233.feishu.cn/wiki/Pc7swzMMZiYnP5krcrzcHkQUn1a?from=from_copylink
6 | ## References
7 | Originally provided by the course assignment:
8 | https://github.com/naklecha/llama3-from-scratch
9 |
10 | NumPy implementation of llama3 (not trainable):
11 | https://github.com/likejazz/llama3.np
12 |
13 | Baby llama:
14 | https://github.com/DLLXW/baby-llama2-chinese
15 |
16 | Atom7b:
17 | https://github.com/LlamaFamily/Llama-Chinese
18 |
19 | RMSNorm paper:
20 | https://arxiv.org/abs/1910.07467
21 |
22 | SwiGLU paper (llama3's FFN):
23 | https://arxiv.org/abs/2002.05202
24 |
25 | Attention paper:
26 | https://arxiv.org/abs/1706.03762
27 |
28 | Blog post introducing attention:
29 | https://spaces.ac.cn/archives/4765
30 |
31 | Introduction to attention:
32 | https://armanasq.github.io/nlp/self-attention/
33 |
34 | RoPE paper (RoFormer):
35 | https://arxiv.org/abs/2104.09864
36 |
37 | RoPE explained by its original author:
38 | https://spaces.ac.cn/archives/8265
39 |
40 | RoPE introduction and implementation:
41 | https://blog.eleuther.ai/rotary-embeddings/
42 |
43 | Differences caused by the two RoPE implementations:
44 | https://github.com/huggingface/transformers/issues/25199
45 |
46 | On weight sharing (tied embeddings):
47 | https://spaces.ac.cn/archives/9698
48 |
49 | "Deep Learning from Scratch 3" (『ゼロから作る Deep Learning ❸』, O'Reilly Japan, 2020):
50 | https://github.com/oreilly-japan/deep-learning-from-scratch-3
51 |
52 | The Stack implementation by a reader of the book above:
53 | https://github.com/laksjdjf/dezero-diffusion/blob/main/modules/unet.py
54 |
--------------------------------------------------------------------------------
/models.py:
--------------------------------------------------------------------------------
1 | import layers
2 | from layers import Layer
3 | from functions import Function, sigmoid
4 | from tools import plot_dot_graph
5 |
6 |
7 | class Model(Layer):
8 | def plot(self, *inputs, to_file='model.png'):
9 | y = self.forward(*inputs)
10 | return plot_dot_graph(y, verbose=True, to_file=to_file)
11 |
12 |
13 | class MLP(Model):
14 | def __init__(self, fc_output_sizes: tuple[int, ...], activation: Function = sigmoid):
15 | super(MLP, self).__init__()
16 | self.activation = activation
17 | self.layers = []
18 |
19 | for i, out_size in enumerate(fc_output_sizes):
20 | layer = layers.Linear(out_size)
21 | setattr(self, 'l' + str(i), layer)
22 | self.layers.append(layer)
23 |
24 | def forward(self, x):
25 | for l in self.layers[:-1]:
26 | x = self.activation(l(x))
27 | return self.layers[-1](x)
28 |
29 |
30 | class Sequential(Model):
31 | def __init__(self, *layers):
32 | super().__init__()
33 | self.layers = []
34 | for i, layer in enumerate(layers):
35 | setattr(self, 'l' + str(i), layer)
36 | self.layers.append(layer)
37 |
38 | def forward(self, x):
39 | for layer in self.layers:
40 | x = layer(x)
41 | return x
42 |
--------------------------------------------------------------------------------
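
A small end-to-end sketch with `MLP`: one manual gradient step on random data (assumes the sibling modules above are importable):

```python
# Hypothetical MLP usage; data is random and the single SGD step is written out by hand.
import numpy as np
from models import MLP
from functions import mean_squared_error

np.random.seed(0)
x = np.random.rand(8, 3).astype(np.float32)
t = np.random.rand(8, 1).astype(np.float32)

model = MLP((16, 1))                  # hidden width 16, scalar output, sigmoid activation
loss = mean_squared_error(model(x), t)
model.cleargrads()
loss.backward()
for p in model.params():
    p.data -= 0.1 * p.grad.data       # plain SGD update on every parameter
print(loss)
```
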
/dataloaders.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import math
3 | from datasets import Dataset
4 |
5 |
6 | class DataLoader:
7 | def __init__(self, dataset: Dataset, batch_size: int, shuffle: bool = True):
8 | self.dataset = dataset
9 | self.batch_size = batch_size
10 | self.shuffle = shuffle
11 | self.data_size = len(dataset)
12 | self.max_iter = math.ceil(self.data_size / self.batch_size)
13 |
14 | self.reset()
15 |
16 | def reset(self):
17 | self.iteration = 0
18 | if self.shuffle:
19 | self.index = np.random.permutation(len(self.dataset))
20 | else:
21 | self.index = np.arange(len(self.dataset))
22 |
23 | def __iter__(self):
24 | return self
25 |
26 | def __next__(self):
27 | if self.iteration >= self.max_iter:
28 | self.reset()
29 | raise StopIteration()
30 |
31 | i, batch_size = self.iteration, self.batch_size
32 | batch_index = self.index[i * batch_size:(i + 1) * batch_size]
33 |         batch = [self.dataset[idx] for idx in batch_index]
34 | x = np.array([example[0] for example in batch])
35 | t = np.array([example[1] for example in batch])
36 | self.iteration += 1
37 | return x, t
38 |
39 | def next(self):
40 | return self.__next__()
41 |
--------------------------------------------------------------------------------
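
Iteration sketch: `DataLoader` shuffles indices per epoch and yields `(x, t)` mini-batches (toy dataset invented for the example):

```python
# Hypothetical DataLoader round-trip over a tiny dataset.
import numpy as np
from datasets import Dataset
from dataloaders import DataLoader

class ToyDataset(Dataset):
    def prepare(self):
        self.data = np.arange(20, dtype=np.float32).reshape(10, 2)
        self.label = np.arange(10, dtype=np.int32)

loader = DataLoader(ToyDataset(), batch_size=4, shuffle=True)
for epoch in range(2):
    for x, t in loader:               # the loader resets itself when an epoch ends
        print(epoch, x.shape, t.shape)
```
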
/optimizers.py:
--------------------------------------------------------------------------------
1 | from typing import Union, Callable
2 |
3 | import numpy as np
4 |
5 | from models import Model
6 | from layers import Layer
7 | from variable import Parameter
8 |
9 |
10 | class Optimizer:
11 | def __init__(self):
12 | self.target = None
13 | self.hooks: list[Callable] = []
14 |
15 | def setup(self, target: Union[Layer, Model]):
16 | self.target: Union[Layer, Model] = target
17 | return self
18 |
19 | def update(self):
20 | params = [p for p in self.target.params() if p.grad is not None]
21 |
22 | for f in self.hooks:
23 | f(params)
24 |
25 | for param in params:
26 | self.update_one(param)
27 |
28 | def update_one(self, param: Parameter):
29 | raise NotImplementedError
30 |
31 | def add_hook(self, f: Callable):
32 | self.hooks.append(f)
33 |
34 |
35 | class SGD(Optimizer):
36 | def __init__(self, lr=0.01):
37 | super(SGD, self).__init__()
38 | self.lr = lr
39 |
40 | def update_one(self, param: Parameter):
41 | param.data -= self.lr * param.grad.data
42 |
43 |
44 | class MomentumSGD(Optimizer):
45 | def __init__(self, lr=0.01, momentum=0.9):
46 | super(MomentumSGD, self).__init__()
47 | self.lr = lr
48 | self.momentum = momentum
49 | self.vs = {}
50 |
51 | def update_one(self, param: Parameter):
52 | v_key = id(param)
53 | if v_key not in self.vs:
54 | self.vs[v_key] = np.zeros_like(param.data)
55 |
56 | v = self.vs[v_key]
57 | v *= self.momentum
58 | v -= self.lr * param.grad.data
59 | param.data += v
60 |
--------------------------------------------------------------------------------
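
Sketch of the intended workflow: `setup()` binds the optimizer to a model, and `cleargrads()`/`backward()`/`update()` form one training step (toy regression data, invented for the example):

```python
# Hypothetical training loop wiring Optimizer + MLP together on made-up data.
import numpy as np
from models import MLP
from optimizers import MomentumSGD
from functions import mean_squared_error

np.random.seed(0)
x = np.random.rand(32, 2).astype(np.float32)
t = x.sum(axis=1, keepdims=True).astype(np.float32)

model = MLP((8, 1))
optimizer = MomentumSGD(lr=0.1).setup(model)   # setup() returns the optimizer itself

for step in range(100):
    loss = mean_squared_error(model(x), t)
    model.cleargrads()
    loss.backward()
    optimizer.update()                          # updates every parameter that has a grad
print(loss)
```
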
/tools.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import graphviz
3 | from typing import Union, TYPE_CHECKING
4 | from variable import Variable
5 |
6 | if TYPE_CHECKING:
7 | from functions import Function
8 |
9 |
10 | def as_variable(obj: Union[np.ndarray, Variable]) -> Variable:
11 | if isinstance(obj, Variable):
12 | return obj
13 | return Variable(obj)
14 |
15 |
16 | def as_array(x, array_module=np):
17 | if np.isscalar(x):
18 | return array_module.array(x)
19 | return x
20 |
21 |
22 | def _dot_var(v: Variable, verbose: bool = False) -> str:
23 | dot_var = '{} [label="{}", color=orange, style=filled]\n'
24 | name = '' if v.name is None else v.name
25 | if verbose and v.data is not None:
26 | if v.name is not None:
27 | name += ': '
28 | name += str(v.shape) + ' ' + str(v.dtype)
29 | return dot_var.format(id(v), name)
30 |
31 |
32 | def _dot_func(f: "Function") -> str:
33 | dot_func = '{} [label="{}", color=lightblue, style=filled, shape=box]\n'
34 | txt = dot_func.format(id(f), f.__class__.__name__)
35 |
36 | dot_edge = '{} -> {}\n'
37 | for x in f.inputs:
38 | txt += dot_edge.format(id(x), id(f))
39 | for y in f.outputs:
40 | txt += dot_edge.format(id(f), id(y()))
41 | return txt
42 |
43 |
44 | def get_dot_graph(output: Variable, verbose: bool = True) -> str:
45 | txt = ''
46 | funcs: list[Function] = []
47 | seen_set: set[Function] = set()
48 |
49 | def add_func(f):
50 | if f not in seen_set:
51 | funcs.append(f)
52 | seen_set.add(f)
53 |
54 | add_func(output.creator)
55 | txt += _dot_var(output, verbose)
56 | while funcs:
57 | f: "Function" = funcs.pop()
58 | txt += _dot_func(f)
59 | xs = f.inputs
60 | for x in xs:
61 | txt += _dot_var(x, verbose)
62 | if x.creator is not None:
63 | add_func(x.creator)
64 | return 'digraph g {\n' + txt + '}'
65 |
66 |
67 | def plot_dot_graph(output: Variable, verbose: bool = True, to_file: str = 'graph.png'):
68 | dot_graph = get_dot_graph(output, verbose)
69 | dot = graphviz.Source(dot_graph)
70 |     dot.render(to_file.rsplit('.', 1)[0], format=to_file.rsplit('.', 1)[-1], cleanup=True, view=True)  # save to to_file (format taken from its extension) instead of a temporary file
71 |
72 |
73 | def sum_to(x: np.ndarray, shape) -> np.ndarray:
74 | ndim = len(shape)
75 | lead = x.ndim - ndim
76 | lead_axis = tuple(range(lead))
77 |
78 | axis = tuple([i + lead for i, sx in enumerate(shape) if sx == 1])
79 | y = x.sum(lead_axis + axis, keepdims=True)
80 | if lead > 0:
81 | y = y.squeeze(lead_axis)
82 | return y
83 |
84 |
85 | def reshape_sum_backward(gy: Variable, x_shape: tuple[int, ...], axis: Union[tuple[int, ...], int, None],
86 | keepdims: bool) -> Variable:
87 | ndim = len(x_shape)
88 | tupled_axis = axis
89 | if axis is None:
90 | tupled_axis = None
91 | elif not isinstance(axis, tuple):
92 | tupled_axis = (axis,)
93 |
94 | if not (ndim == 0 or tupled_axis is None or keepdims):
95 | actual_axis = [a if a >= 0 else a + ndim for a in tupled_axis]
96 | shape = list(gy.shape)
97 | for a in sorted(actual_axis):
98 | shape.insert(a, 1)
99 | else:
100 | shape = gy.shape
101 |
102 | gy = gy.reshape(shape) # reshape
103 | return gy
104 |
--------------------------------------------------------------------------------
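
A quick check of `sum_to`, which undoes NumPy broadcasting by summing over the broadcast axes (values chosen arbitrarily):

```python
# Hypothetical demonstration of tools.sum_to on a (2, 3, 4) array of ones.
import numpy as np
from tools import sum_to

x = np.ones((2, 3, 4))
print(sum_to(x, (3, 4)).shape)     # (3, 4)    -- the leading axis is summed away
print(sum_to(x, (2, 1, 4)).shape)  # (2, 1, 4) -- axis 1 is summed but kept with size 1
print(sum_to(x, (1, 3, 1)))        # every entry equals 2 * 4 = 8.0
```
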
/layers.py:
--------------------------------------------------------------------------------
1 | import weakref
2 | import os
3 | import numpy as np
4 | from typing import Union
5 | from variable import Variable, Parameter
6 | from functions import linear
7 |
8 |
9 | class Layer:
10 | def __init__(self):
11 | self._params: set[str] = set()
12 |
13 | def __setattr__(self, key, value):
14 | if isinstance(value, (Parameter, Layer)):
15 | self._params.add(key)
16 | super(Layer, self).__setattr__(key, value)
17 |
18 | def __call__(self, *inputs: Union[Variable, np.ndarray]) -> Union[list[Variable], Variable]:
19 | outputs = self.forward(*inputs)
20 | if not isinstance(outputs, tuple):
21 | outputs = (outputs,)
22 | self.inputs = [weakref.ref(x) for x in inputs]
23 | self.outputs = [weakref.ref(y) for y in outputs]
24 | return outputs if len(outputs) > 1 else outputs[0]
25 |
26 | def forward(self, *inputs):
27 | raise NotImplementedError()
28 |
29 | def params(self):
30 | for name in self._params:
31 | obj = self.__dict__[name]
32 | if isinstance(obj, Layer):
33 | yield from obj.params()
34 | else:
35 | yield obj
36 |
37 | def cleargrads(self):
38 | for param in self.params():
39 | param.cleargrad()
40 |
41 | def to_cpu(self):
42 | for param in self.params():
43 | param.to_cpu()
44 |
45 | def to_gpu(self):
46 | for param in self.params():
47 | param.to_gpu()
48 |
49 | def _flatten_params(self, params_dict, parent_key=""):
50 | for name in self._params:
51 | obj = self.__dict__[name]
52 | key = parent_key + '/' + name if parent_key else name
53 |
54 | if isinstance(obj, Layer):
55 | obj._flatten_params(params_dict, key)
56 | else:
57 | params_dict[key] = obj
58 |
59 | def save_weights(self, path):
60 | self.to_cpu()
61 | params_dict = {}
62 | self._flatten_params(params_dict)
63 | array_dict = {key: param.data for key, param in params_dict.items()
64 | if param is not None}
65 | try:
66 | np.savez_compressed(path, **array_dict)
67 | except (Exception, KeyboardInterrupt) as e:
68 | if os.path.exists(path):
69 | os.remove(path)
70 | raise
71 |
72 | def load_weights(self, path):
73 | npz = np.load(path)
74 | params_dict = {}
75 | self._flatten_params(params_dict)
76 | for key, param in params_dict.items():
77 | param.data = npz[key]
78 | print(f'{key} loaded')
79 |
80 |
81 | class Linear(Layer):
82 | def __init__(self, out_size: int, nobias: bool = False, dtype=np.float32, in_size: int = None):
83 | super(Linear, self).__init__()
84 | self.in_size = in_size
85 | self.out_size = out_size
86 | self.dtype = dtype
87 |
88 | self.W = Parameter(None, name='W')
89 | if self.in_size is not None:
90 | self._init_W()
91 |
92 | if nobias:
93 | self.b = None
94 | else:
95 | self.b = Parameter(np.zeros(out_size, dtype=dtype), name='b')
96 |
97 | def _init_W(self):
98 | I, O = self.in_size, self.out_size
99 | W_data = np.random.randn(I, O).astype(self.dtype) * np.sqrt(1 / I)
100 | self.W = Parameter(W_data, name='W')
101 |
102 | def forward(self, x):
103 | if self.W.data is None:
104 | self.in_size = x.shape[1]
105 | self._init_W()
106 | y = linear(x, self.W, self.b)
107 | return y
108 |
109 |
110 | class Embedding(Layer):
111 | def __init__(self, in_size, out_size):
112 | super().__init__()
113 | self.W = Parameter(np.random.randn(in_size, out_size), name='W')
114 |
115 | def forward(self, x):
116 | y = self.W[x]
117 | return y
118 |
--------------------------------------------------------------------------------
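
Sketch of `Linear`'s lazy weight initialisation and the flattened save/load format (writes a small local file named `toy_weights.npz`; the layer sizes are invented):

```python
# Hypothetical two-layer net built from layers.Linear; shows lazy init and weight (de)serialisation.
import numpy as np
from layers import Layer, Linear

class TwoLayerNet(Layer):
    def __init__(self):
        super().__init__()
        self.l1 = Linear(5)      # in_size is inferred from the first forward pass
        self.l2 = Linear(1)

    def forward(self, x):
        return self.l2(self.l1(x))

net = TwoLayerNet()
y = net(np.random.rand(4, 3).astype(np.float32))
print(net.l1.W.shape)            # (3, 5) once the lazy init has run

net.save_weights('toy_weights.npz')   # keys are flattened, e.g. 'l1/W', 'l1/b'
net.load_weights('toy_weights.npz')
```
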
/variable.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from typing import TYPE_CHECKING
3 | from config import using_config, Config
4 |
5 | try:
6 | import cupy
7 |
8 | array_types = (np.ndarray, cupy.ndarray)
9 | except ImportError:
10 |     array_types = (np.ndarray,)
11 | if TYPE_CHECKING:
12 | from functions import Function
13 |
14 |
15 | class Variable:
16 | __array_priority__ = 200
17 |
18 | def __init__(self, data: np.ndarray, name=None):
19 | if data is not None:
20 | if not isinstance(data, array_types):
21 | raise TypeError('{} is not supported'.format(type(data)))
22 |
23 | self.data = data
24 | self.grad = None
25 | self.creator = None
26 | self.name = name
27 | self.generation = 0
28 |
29 | def set_creator(self, func: "Function"):
30 | self.creator: "Function" = func
31 | self.generation = func.generation + 1
32 |
33 | def backward(self, retain_grad: bool = False, create_graph=False):
34 | from cuda import get_array_module
35 | if self.grad is None:
36 | xp = get_array_module(self.data)
37 | self.grad = Variable(xp.ones_like(self.data))
38 |
39 | funcs: list[Function] = []
40 | seen_set: set[Function] = set()
41 |
42 | def add_func(f):
43 | if f not in seen_set:
44 | funcs.append(f)
45 | seen_set.add(f)
46 | funcs.sort(key=lambda x: x.generation)
47 |
48 | add_func(self.creator)
49 |
50 | while funcs:
51 | f: "Function" = funcs.pop()
52 | xs = f.inputs
53 | gys = [output().grad for output in f.outputs]
54 |
55 | with using_config('enable_backprop', create_graph):
56 | gxs = f.backward(*gys)
57 | if not isinstance(gxs, tuple):
58 | gxs = (gxs,)
59 |
60 | for x, gx in zip(xs, gxs):
61 | if x.grad is not None:
62 | x.grad = x.grad + gx
63 | else:
64 | x.grad = gx
65 |
66 | if x.creator is not None:
67 | add_func(x.creator)
68 | if not retain_grad:
69 | for y in f.outputs:
70 | y().grad = None
71 |
72 | def cleargrad(self):
73 | self.grad = None
74 |
75 | def to_cpu(self):
76 | from cuda import as_numpy
77 | if self.data is not None:
78 | self.data = as_numpy(self.data)
79 |
80 | def to_gpu(self):
81 | from cuda import as_cupy
82 | if self.data is not None:
83 | self.data = as_cupy(self.data)
84 |
85 | @property
86 | def shape(self):
87 | return self.data.shape
88 |
89 | @property
90 | def size(self):
91 | return self.data.size
92 |
93 | @property
94 | def ndim(self):
95 | return self.data.ndim
96 |
97 | @property
98 | def dtype(self):
99 | return self.data.dtype
100 |
101 | def __len__(self):
102 | return len(self.data)
103 |
104 | def __repr__(self):
105 | if self.data is None:
106 | return 'Variable(None)'
107 | p = str(self.data).replace('\n', '\n' + ' ' * 9)
108 | return 'Variable(' + p + ')'
109 |
110 | def __add__(self, other):
111 | from functions import add
112 | return add(self, other)
113 |
114 | def __radd__(self, other):
115 | from functions import add
116 | return add(self, other)
117 |
118 | def __mul__(self, other):
119 | from functions import mul
120 | return mul(self, other)
121 |
122 | def __rmul__(self, other):
123 | from functions import mul
124 | return mul(self, other)
125 |
126 | def __neg__(self):
127 | from functions import neg
128 | return neg(self)
129 |
130 | def __sub__(self, other):
131 | from functions import sub
132 | return sub(self, other)
133 |
134 | def __rsub__(self, other):
135 | from functions import rsub
136 | return rsub(self, other)
137 |
138 | def __truediv__(self, other):
139 | from functions import div
140 | return div(self, other)
141 |
142 | def __rtruediv__(self, other):
143 | from functions import rdiv
144 | return rdiv(self, other)
145 |
146 | def __pow__(self, power, modulo=None):
147 | from functions import pow
148 | return pow(self, power)
149 |
150 | def __getitem__(self, item):
151 | from functions import get_item
152 | return get_item(self, item)
153 |
154 | def reshape(self, *shape):
155 | from functions import reshape
156 | if len(shape) == 1 and isinstance(shape[0], (tuple, list)):
157 | shape = shape[0]
158 | return reshape(self, shape)
159 |
160 | # def transpose(self):
161 | # from functions import transpose
162 | # return transpose(self)
163 | #
164 | # @property
165 | # def T(self):
166 | # from functions import transpose
167 | # return transpose(self)
168 |
169 | def transpose(self, *axes):
170 | from functions import transpose
171 | if len(axes) == 0:
172 | axes = None
173 | elif len(axes) == 1:
174 | if isinstance(axes[0], (tuple, list)) or axes[0] is None:
175 | axes = axes[0]
176 | return transpose(self, axes)
177 |
178 | @property
179 | def T(self):
180 | from functions import transpose
181 | return transpose(self)
182 |
183 | def sum(self, axis=None, keepdims=False):
184 | from functions import sum
185 | return sum(self, axis=axis, keepdims=keepdims)
186 |
187 |
188 | class Parameter(Variable):
189 | pass
190 |
191 |
192 | if __name__ == '__main__':
193 | # a1 = Variable(np.array([1.0, 3.0]))
194 | # a2 = Variable(np.array([1.0, 2.0]))
195 | # print(a1 + a2)
196 | pass
197 |
--------------------------------------------------------------------------------
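
A tiny autodiff sketch using the operator overloads above: build y = x² + 3x and backpropagate (input value chosen arbitrarily):

```python
# Hypothetical end-to-end use of Variable: forward with overloaded operators, then backward().
import numpy as np
from variable import Variable

x = Variable(np.array(2.0))
y = x ** 2 + 3 * x          # Pow, Mul and Add Functions are recorded behind the scenes
y.backward()

print(y)        # Variable(10.0)
print(x.grad)   # Variable(7.0), i.e. dy/dx = 2x + 3 at x = 2
```
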
/functions.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from typing import Union
3 | from variable import Variable
4 | from config import Config
5 | from tools import as_variable, as_array, plot_dot_graph, reshape_sum_backward
6 | from tools import sum_to as raw_sum_to
7 | from cuda import get_array_module
8 | import weakref
9 |
10 |
11 | class Function:
12 | def __call__(self, *inputs: Union[np.ndarray, Variable]) -> Union[list[Variable], Variable]:
13 | inputs: list[Variable] = [as_variable(x) for x in inputs]
14 |
15 | xs = [x.data for x in inputs]
16 |
17 | ys = self.forward(*xs)
18 | if not isinstance(ys, tuple):
19 | ys = (ys,)
20 | outputs = [Variable(as_array(y)) for y in ys]
21 |
22 | if Config.enable_backprop:
23 | self.generation = max([x.generation for x in inputs])
24 | for output in outputs:
25 | output.set_creator(self)
26 | self.outputs = [weakref.ref(output) for output in outputs]
27 | self.inputs = inputs
28 |
29 | return outputs if len(outputs) > 1 else outputs[0]
30 |
31 | def forward(self, *xs: np.ndarray) -> tuple[np.ndarray]:
32 | raise NotImplementedError()
33 |
34 | def backward(self, gys: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]:
35 | raise NotImplementedError()
36 |
37 |
38 | class Square(Function):
39 | def forward(self, x: np.ndarray) -> np.ndarray:
40 | return x ** 2
41 |
42 | def backward(self, gy: np.ndarray) -> np.ndarray:
43 | x = self.inputs[0]
44 | gx = 2 * x * gy
45 | return gx
46 |
47 |
48 | class Exp(Function):
49 | def forward(self, x: np.ndarray) -> np.ndarray:
50 | xp = get_array_module(x)
51 | return xp.exp(x)
52 |
53 | def backward(self, gy: np.ndarray) -> np.ndarray:
54 | y = self.outputs[0]()
55 | gx = y * gy
56 | return gx
57 |
58 |
59 | def square(x: Variable):
60 | return Square()(x)
61 |
62 |
63 | def exp(x: Variable):
64 | return Exp()(x)
65 |
66 |
67 | # class XtoX(Function):
68 | # # y = x^x
69 | # # lny = xlnx
70 | # # y'/y = 1+lnx
71 | # # y' = y(1+lnx)
72 | # def forward(self, x: np.ndarray) -> tuple[np.ndarray]:
73 | # return x ** x
74 | #
75 | # def backward(self, gy: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]:
76 | # y = self.outputs[0]()
77 | # x = self.inputs[0]
78 | # xp = get_array_module(x)
79 | # gx = gy * (y * (1 + xp.log(x)))
80 | # return gx
81 |
82 |
83 | class Log(Function):
84 | def forward(self, x: np.ndarray) -> tuple[np.ndarray]:
85 | xp = get_array_module(x)
86 | return xp.log(x)
87 |
88 | def backward(self, gy: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]:
89 | x = self.inputs[0]
90 | gx = gy / x
91 | return gx
92 |
93 |
94 | def log(x):
95 | return Log()(x)
96 |
97 |
98 | class Add(Function):
99 | def forward(self, x0: np.ndarray, x1: np.ndarray) -> np.ndarray:
100 | self.x0_shape, self.x1_shape = x0.shape, x1.shape
101 | y = x0 + x1
102 | return y
103 |
104 | def backward(self, gy: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]:
105 | gx0, gx1 = gy, gy
106 | if self.x0_shape != self.x1_shape:
107 | gx0 = sum_to(gx0, self.x0_shape)
108 | gx1 = sum_to(gx1, self.x1_shape)
109 | return gx0, gx1
110 |
111 |
112 | def add(x, y):
113 | y = as_array(y, get_array_module(x))
114 | return Add()(x, y)
115 |
116 |
117 | class Mul(Function):
118 | def forward(self, x0: np.ndarray, x1: np.ndarray) -> tuple[np.ndarray]:
119 | self.x0_shape, self.x1_shape = x0.shape, x1.shape
120 | y = x0 * x1
121 | return y
122 |
123 | def backward(self, gy: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]:
124 | x0, x1 = self.inputs
125 | gx0, gx1 = gy * x1, gy * x0
126 | if self.x0_shape != self.x1_shape:
127 | gx0 = sum_to(gx0, self.x0_shape)
128 | gx1 = sum_to(gx1, self.x1_shape)
129 | return gx0, gx1
130 |
131 |
132 | def mul(x, y):
133 | y = as_array(y, get_array_module(x))
134 | return Mul()(x, y)
135 |
136 |
137 | class Neg(Function):
138 | def forward(self, x: np.ndarray) -> tuple[np.ndarray]:
139 | return -x
140 |
141 | def backward(self, gy: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]:
142 | return -gy
143 |
144 |
145 | def neg(x):
146 | return Neg()(x)
147 |
148 |
149 | class Sub(Function):
150 | def forward(self, x0: np.ndarray, x1: np.ndarray) -> tuple[np.ndarray]:
151 | self.x0_shape, self.x1_shape = x0.shape, x1.shape
152 | y = x0 - x1
153 | return y
154 |
155 | def backward(self, gy: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]:
156 | gx0, gx1 = gy, -gy
157 | if self.x0_shape != self.x1_shape:
158 | gx0 = sum_to(gx0, self.x0_shape)
159 | gx1 = sum_to(gx1, self.x1_shape)
160 | return gx0, gx1
161 |
162 |
163 | def sub(x, y):
164 | y = as_array(y, get_array_module(x))
165 | return Sub()(x, y)
166 |
167 |
168 | def rsub(x, y):
169 | y = as_array(y, get_array_module(x))
170 | return Sub()(y, x)
171 |
172 |
173 | class Div(Function):
174 | def forward(self, x0: np.ndarray, x1: np.ndarray) -> tuple[np.ndarray]:
175 | self.x0_shape, self.x1_shape = x0.shape, x1.shape
176 | y = x0 / x1
177 | return y
178 |
179 | def backward(self, gy: np.ndarray) -> Union[tuple[Variable, ...], Variable]:
180 | x0, x1 = self.inputs
181 | gx0 = gy / x1
182 | gx1 = gy * (-x0 / x1 ** 2)
183 | if self.x0_shape != self.x1_shape:
184 | gx0 = sum_to(gx0, self.x0_shape)
185 | gx1 = sum_to(gx1, self.x1_shape)
186 | return gx0, gx1
187 |
188 |
189 | def div(x, y):
190 | y = as_array(y, get_array_module(x))
191 | return Div()(x, y)
192 |
193 |
194 | def rdiv(x, y):
195 | y = as_array(y, get_array_module(x))
196 | return Div()(y, x)
197 |
198 |
199 | class Pow(Function):
200 | def __init__(self, c):
201 | self.c = c
202 |
203 | def forward(self, x: np.ndarray) -> tuple[np.ndarray]:
204 | y = x ** self.c
205 | return y
206 |
207 | def backward(self, gy: np.ndarray) -> Union[tuple[Variable, ...], Variable]:
208 | x = self.inputs[0]
209 | gx = self.c * x ** (self.c - 1) * gy
210 | return gx
211 |
212 |
213 | def pow(x, c):
214 | return Pow(c)(x)
215 |
216 |
217 | class Sin(Function):
218 | def forward(self, x: np.ndarray) -> tuple[np.ndarray]:
219 | xp = get_array_module(x)
220 | y = xp.sin(x)
221 | return y
222 |
223 | def backward(self, gy: np.ndarray) -> Union[tuple[Variable, ...], Variable]:
224 | x = self.inputs[0]
225 | gx = gy * cos(x)
226 | return gx
227 |
228 |
229 | def sin(x):
230 | return Sin()(x)
231 |
232 |
233 | class Cos(Function):
234 | def forward(self, x: np.ndarray) -> tuple[np.ndarray]:
235 | xp = get_array_module(x)
236 | y = xp.cos(x)
237 | return y
238 |
239 | def backward(self, gy: np.ndarray) -> Union[tuple[Variable, ...], Variable]:
240 | x = self.inputs[0]
241 | gx = gy * -sin(x)
242 | return gx
243 |
244 |
245 | def cos(x):
246 | return Cos()(x)
247 |
248 |
249 | class Tanh(Function):
250 | def forward(self, x: np.ndarray) -> tuple[np.ndarray]:
251 | xp = get_array_module(x)
252 | y = xp.tanh(x)
253 | return y
254 |
255 | def backward(self, gy: np.ndarray) -> Union[tuple[Variable, ...], Variable]:
256 | y = self.outputs[0]()
257 | gx = gy * (1 - y * y)
258 | return gx
259 |
260 |
261 | def tanh(x):
262 | return Tanh()(x)
263 |
264 |
265 | class Reshape(Function):
266 | def __init__(self, shape):
267 | self.shape = shape
268 |
269 | def forward(self, x: np.ndarray) -> tuple[np.ndarray]:
270 | self.x_shape = x.shape
271 | y = x.reshape(self.shape)
272 | return y
273 |
274 | def backward(self, gy: np.ndarray) -> Union[tuple[Variable, ...], Variable]:
275 | return reshape(gy, self.x_shape)
276 |
277 |
278 | def reshape(x, shape):
279 | if x.shape == shape:
280 | return as_variable(x)
281 | return Reshape(shape)(x)
282 |
283 |
284 | # class Transpose(Function):
285 | # def forward(self, x: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]:
286 | # y = np.transpose(x)
287 | # return y
288 | #
289 | # def backward(self, gy: np.ndarray) -> Union[tuple[Variable, ...], Variable]:
290 | # return transpose(gy)
291 | #
292 | #
293 | # def transpose(x):
294 | # return Transpose()(x)
295 |
296 | class Transpose(Function):
297 | def __init__(self, axes=None):
298 | self.axes = axes
299 |
300 | def forward(self, x):
301 | y = x.transpose(self.axes)
302 | return y
303 |
304 | def backward(self, gy):
305 | if self.axes is None:
306 | return transpose(gy)
307 |
308 | axes_len = len(self.axes)
309 | inv_axes = tuple(np.argsort([ax % axes_len for ax in self.axes]))
310 | return transpose(gy, inv_axes)
311 |
312 |
313 | def transpose(x, axes=None):
314 | return Transpose(axes)(x)
315 |
316 |
317 | class Sum(Function):
318 | def __init__(self, axis: Union[tuple[int, ...], int, None], keepdims: bool):
319 | self.axis = axis
320 | self.keepdims = keepdims
321 |
322 | def forward(self, x: np.ndarray) -> tuple[np.ndarray]:
323 | self.x_shape = x.shape
324 | y = x.sum(axis=self.axis, keepdims=self.keepdims)
325 | return y
326 |
327 | def backward(self, gy: np.ndarray) -> Union[tuple[Variable, ...], Variable]:
328 | gy = reshape_sum_backward(gy, self.x_shape, self.axis, self.keepdims)
329 | gx = broadcast_to(gy, self.x_shape)
330 | return gx
331 |
332 |
333 | def sum(x, axis=None, keepdims=False):
334 | return Sum(axis=axis, keepdims=keepdims)(x)
335 |
336 |
337 | class BroadcastTo(Function):
338 | def __init__(self, shape):
339 | self.shape = shape
340 |
341 | def forward(self, x: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]:
342 | self.x_shape = x.shape
343 | xp = get_array_module(x)
344 | y = xp.broadcast_to(x, self.shape)
345 | return y
346 |
347 | def backward(self, gy: np.ndarray) -> Union[tuple[Variable, ...], Variable]:
348 | gx = sum_to(gy, self.x_shape)
349 | return gx
350 |
351 |
352 | def broadcast_to(x, shape):
353 | if x.shape == shape:
354 | return as_variable(x)
355 | return BroadcastTo(shape)(x)
356 |
357 |
358 | class SumTo(Function):
359 | def __init__(self, shape):
360 | self.shape = shape
361 |
362 | def forward(self, x: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]:
363 | self.x_shape = x.shape
364 | y = raw_sum_to(x, self.shape)
365 | return y
366 |
367 | def backward(self, gy: np.ndarray) -> Union[tuple[Variable, ...], Variable]:
368 | gx = broadcast_to(gy, self.x_shape)
369 | return gx
370 |
371 |
372 | def sum_to(x, shape):
373 | if x.shape == shape:
374 | return as_variable(x)
375 | return SumTo(shape)(x)
376 |
377 |
378 | class MatMul(Function):
379 | def forward(self, x: np.ndarray, W: np.ndarray) -> tuple[np.ndarray]:
380 | if x.ndim <= 2 and W.ndim <= 2:
381 | y = x.dot(W)
382 | else:
383 | y = x @ W
384 | return y
385 |
386 | def backward(self, gy: np.ndarray) -> Union[tuple[Variable, ...], Variable]:
387 | x, W = self.inputs
388 | gx = matmul(gy, W.transpose(([i for i in range(W.ndim - 2)] + [-1, -2])))
389 | gW = matmul(x.transpose(([i for i in range(x.ndim - 2)] + [-1, -2])), gy)
390 |         return sum_to(gx, x.shape), sum_to(gW, W.shape)  # collapse any broadcast/batch dims back to the input shapes
391 |
392 |
393 | def matmul(x, W):
394 | return MatMul()(x, W)
395 |
396 |
397 | class MeanSquaredError(Function):
398 | def forward(self, x0: np.ndarray, x1: np.ndarray) -> np.ndarray:
399 | diff = x0 - x1
400 | y = (diff ** 2).sum() / len(diff)
401 | return y
402 |
403 | def backward(self, gy: np.ndarray) -> Union[tuple[Variable, ...], Variable]:
404 | x0, x1 = self.inputs
405 | diff: Variable = x0 - x1
406 | gx0: Variable = gy * diff * (2. / len(diff))
407 | gx1: Variable = -gx0
408 | return gx0, gx1
409 |
410 |
411 | def mean_squared_error(x, y):
412 | return MeanSquaredError()(x, y)
413 |
414 |
415 | class Linear(Function):
416 | def forward(self, x: np.ndarray, W: np.ndarray, b: np.ndarray) -> tuple[np.ndarray]:
417 | y = x.dot(W)
418 | if b is not None:
419 | y += b
420 |
421 | return y
422 |
423 | def backward(self, gy: np.ndarray) -> Union[tuple[Variable, ...], Variable]:
424 | x, W, b = self.inputs
425 | gb = None if b.data is None else sum_to(gy, b.shape)
426 | gx = matmul(gy, W.transpose(([i for i in range(W.ndim - 2)] + [-1, -2])))
427 | gW = matmul(x.transpose(([i for i in range(x.ndim - 2)] + [-1, -2])), gy)
428 |         return gx, sum_to(gW, W.shape), gb  # sum over leading batch dims so gW matches W's shape
429 |
430 |
431 | def linear(x, W, b=None):
432 | return Linear()(x, W, b)
433 |
434 |
435 | class Sigmoid(Function):
436 | def forward(self, x: np.ndarray) -> tuple[np.ndarray]:
437 | # y = 1 / (1 + exp(-x))
438 | xp = get_array_module(x)
439 | y = xp.tanh(x * 0.5) * 0.5 + 0.5
440 | return y
441 |
442 | def backward(self, gy: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]:
443 | y = self.outputs[0]()
444 | gx = gy * y * (1 - y)
445 | return gx
446 |
447 |
448 | def sigmoid(x):
449 | return Sigmoid()(x)
450 |
451 |
452 | class GetItem(Function):
453 | def __init__(self, slices):
454 | self.slices = slices
455 |
456 | def forward(self, x: np.ndarray) -> tuple[np.ndarray]:
457 | y = x[self.slices]
458 | return y
459 |
460 | def backward(self, gy: np.ndarray) -> Union[tuple[Variable, ...], Variable]:
461 | x = self.inputs[0]
462 | f = GetItemGrad(self.slices, x.shape)
463 | return f(gy)
464 |
465 |
466 | def get_item(x, slices):
467 | return GetItem(slices)(x)
468 |
469 |
470 | class GetItemGrad(Function):
471 | def __init__(self, slices, in_shape):
472 | self.slices = slices
473 | self.in_shape = in_shape
474 |
475 | def forward(self, gy: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]:
476 | xp = get_array_module(gy)
477 |         gx = xp.zeros(self.in_shape, dtype=gy.dtype)
478 | if xp is np:
479 | np.add.at(gx, self.slices, gy)
480 | else:
481 | xp.scatter_add(gx, self.slices, gy)
482 | return gx
483 |
484 | def backward(self, ggx: np.ndarray) -> Union[tuple[Variable, ...], Variable]:
485 | return get_item(ggx, self.slices)
486 |
487 |
488 | class Softmax(Function):
489 | def __init__(self, axis=1):
490 | self.axis = axis
491 |
492 | def forward(self, x: np.ndarray) -> tuple[np.ndarray]:
493 | xp = get_array_module(x)
494 | y = x - x.max(axis=self.axis, keepdims=True)
495 | y = xp.exp(y)
496 | y /= y.sum(axis=self.axis, keepdims=True)
497 | return y
498 |
499 | def backward(self, gy: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]:
500 | y = self.outputs[0]()
501 | gx = y * gy
502 | sumdx = gx.sum(axis=self.axis, keepdims=True)
503 | gx -= y * sumdx
504 | return gx
505 |
506 |
507 | def softmax(x, axis=1):
508 | return Softmax(axis=axis)(x)
509 |
510 |
511 | class Cat(Function):
512 | def __init__(self, axis: int = 0):
513 | self.axis = axis
514 |
515 | def forward(self, *xs: np.ndarray) -> np.ndarray:
516 | xp = get_array_module(xs[0])
517 | z = xp.concatenate(xs, axis=self.axis)
518 | return z
519 |
520 | def backward(self, gy: Variable) -> Union[tuple[Variable, ...], Variable]:
521 | inputs = self.inputs
522 | gx = []
523 | start_idx = 0
524 | for x in inputs:
525 | end_idx = start_idx + x.shape[self.axis]
526 | indices = [slice(None)] * gy.ndim
527 | indices[self.axis] = slice(start_idx, end_idx)
528 | gx.append(gy[tuple(indices)])
529 | start_idx = end_idx
530 |
531 | return tuple(gx)
532 |
533 |
534 | def cat(inputs, axis=0):
535 | return Cat(axis=axis)(*inputs)
536 |
537 |
538 | class Clip(Function):
539 | def __init__(self, x_min, x_max):
540 | self.x_min = x_min
541 | self.x_max = x_max
542 |
543 | def forward(self, x: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]:
544 | xp = get_array_module(x)
545 | y = xp.clip(x, self.x_min, self.x_max)
546 | return y
547 |
548 | def backward(self, gy: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]:
549 | x = self.inputs[0]
550 | mask = (x.data >= self.x_min) * (x.data <= self.x_max)
551 | gx = gy * mask
552 | return gx
553 |
554 |
555 | def clip(x, x_min, x_max):
556 | return Clip(x_min, x_max)(x)
557 |
558 |
559 | def softmax_cross_entropy_simple(x, t):
560 | x, t = as_variable(x), as_variable(t)
561 | N = x.shape[0]
562 |
563 | p = softmax(x)
564 | p = clip(p, 1e-15, 1.0)
565 | log_p = log(p)
566 | tlog_p = log_p[np.arange(N), t.data]
567 | y = -1 * sum(tlog_p) / N
568 | return y
569 |
570 |
571 | def accuracy(y, t):
572 | y, t = as_variable(y), as_variable(t)
573 |
574 | pred = y.data.argmax(axis=1).reshape(t.shape)
575 | result = (pred == t.data)
576 | acc = result.mean()
577 |
578 | return Variable(as_array(acc))
579 |
580 |
581 | def dropout(x, dropout_ratio=0.5):
582 | x = as_variable(x)
583 |
584 | if Config.train:
585 | xp = get_array_module(x)
586 | mask = xp.random.rand(*x.shape) > dropout_ratio
587 | scale = xp.array(1.0 - dropout_ratio).astype(x.dtype)
588 | y = x * mask / scale
589 | return y
590 | else:
591 | return x
592 |
593 |
594 | class Stack(Function):
595 | def __init__(self, axis: int = 0):
596 | self.axis = axis
597 |
598 | def forward(self, *xs: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]:
599 | xp = get_array_module(xs[0])
600 | self.x_shape = xs[0].shape
601 | self.x_num = len(xs)
602 | y = xp.stack(xs, axis=self.axis)
603 | return y
604 |
605 | def backward(self, gy: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]:
606 | gx = []
607 | for i in range(self.x_num):
608 | indices = [slice(None)] * gy.ndim
609 | indices[self.axis] = slice(i, i + 1)
610 | gx.append(gy[tuple(indices)].reshape(self.x_shape))
611 | return tuple(gx)
612 |
613 |
614 | def stack(inputs, axis=0):
615 | return Stack(axis=axis)(*inputs)
616 |
617 |
618 | if __name__ == '__main__':
619 | # def goldstein(x, y):
620 | # z = (1 + (x + y + 1) ** 2 * (19 - 14 * x + 3 * x ** 2 - 14 * y + 6 * x * y + 3 * y ** 2)) * \
621 | # (30 + (2 * x - 3 * y) ** 2 * (18 - 32 * x + 12 * x ** 2 + 48 * y - 36 * x * y + 27 * y ** 2))
622 | # return z
623 | #
624 | #
625 | # x = Variable(np.array(1.0))
626 | # y = Variable(np.array(1.0))
627 | # z = goldstein(x, y)
628 | # z.backward()
629 | # print(z)
630 | # print(x.grad, y.grad)
631 | #
632 | # plot_dot_graph(z, verbose=False)
633 |
634 | # x = Variable(np.array(1.0))
635 | # y = tanh(x)
636 | # x.name = 'x'
637 | # y.name = 'y'
638 | # y.backward(create_graph=True)
639 | # iters = 6
640 | #
641 | # for i in range(iters):
642 | # gx = x.grad
643 | # x.cleargrad()
644 | # gx.backward(create_graph=True)
645 | #
646 | # gx = x.grad
647 | # gx.name = 'gx' + str(iters + 1)
648 | # plot_dot_graph(gx, verbose=False)
649 |
650 | # np.random.seed(0)
651 | # x = np.random.rand(100, 1)
652 | # y = 5 + 2 * x + np.random.rand(100, 1)
653 | #
654 | # W = Variable(np.zeros((1, 1)))
655 | # b = Variable(np.zeros(1))
656 | #
657 | #
658 | # def predict(x):
659 | # y = matmul(x, W) + b
660 | # return y
661 | #
662 | #
663 | # def mean_squared_error(x0, x1):
664 | # diff = x0 - x1
665 | # return sum(diff ** 2) / len(diff)
666 | #
667 | #
668 | # lr = 0.1
669 | # iters = 100
670 | #
671 | # for i in range(iters):
672 | # y_pred = predict(x)
673 | # loss = mean_squared_error(y, y_pred)
674 | # W.cleargrad()
675 | # b.cleargrad()
676 | # loss.backward()
677 | # W.data -= lr * W.grad.data
678 | # b.data -= lr * b.grad.data
679 | # print(W, b, loss)
680 | # a = Variable(np.array([i for i in range(20)]).reshape(4, 5))
681 | # b = Variable(np.array([i for i in range(20, 40)]).reshape(4, 5))
682 | # c = Variable(np.array([i for i in range(40, 60)]).reshape(4, 5))
683 | #
684 | # aaa = 2
685 | # raw_shape = a.shape
686 | # print(raw_shape[:aaa] + (1,) + raw_shape[aaa:])
687 | #
688 | # a1 = a.reshape(raw_shape[:aaa] + (1,) + raw_shape[aaa:])
689 | # b1 = b.reshape(raw_shape[:aaa] + (1,) + raw_shape[aaa:])
690 | # c1 = c.reshape(raw_shape[:aaa] + (1,) + raw_shape[aaa:])
691 | #
692 | # d = cat((a1, b1, c1), axis=aaa)
693 | #
694 | # d1 = d[1:, :, :] * 3
695 | # d2 = d[:1, :, :] * 5
696 | #
697 | # dd = cat((d1, d2), axis=0)
698 | #
699 | # dd.backward()
700 | # print(dd)
701 | # print(dd.shape)
702 | # print(a.grad)
703 | # print(b.grad)
704 | # print(c.grad)
705 | #
706 | # a2 = Variable(np.array([i for i in range(20)]).reshape(4, 5))
707 | # b2 = Variable(np.array([i for i in range(20, 40)]).reshape(4, 5))
708 | # c2 = Variable(np.array([i for i in range(40, 60)]).reshape(4, 5))
709 | #
710 | # dd2 = Stack(axis=aaa)(a2, b2, c2)
711 | #
712 | # d11 = dd2[1:, :, :] * 3
713 | # d22 = dd2[:1, :, :] * 5
714 | #
715 | # ddd = cat((d11, d22), axis=0)
716 | # ddd.backward()
717 | #
718 | # print(ddd)
719 | # print(a2.grad)
720 | # print(b2.grad)
721 | # print(c2.grad)
722 |
723 | pass
724 |
--------------------------------------------------------------------------------
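
A classification sketch combining several of the functions above: softmax cross-entropy, its backward pass, and `accuracy` (toy logits and targets, invented for the example):

```python
# Hypothetical classification example for functions.py; the logits and targets are made up.
import numpy as np
from variable import Variable
from functions import softmax_cross_entropy_simple, accuracy, softmax

logits = Variable(np.array([[2.0, 0.5, -1.0],
                            [0.1, 1.5,  0.3]]))
targets = Variable(np.array([0, 1]))

loss = softmax_cross_entropy_simple(logits, targets)
loss.backward()                         # gradients flow back into logits.grad

print(loss)                             # scalar Variable
print(accuracy(logits, targets))        # Variable(1.0): both rows are classified correctly
print(softmax(logits).data.sum(axis=1)) # each row of probabilities sums to 1
```
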
/main.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | import numpy as np
4 | import json
5 | from typing import Union, Optional, Callable, Literal
6 | from config import Config, no_grad, test_mode
7 | import optimizers
8 | from variable import Variable
9 | from optimizers import SGD
10 | from models import MLP, Model, Sequential
11 | from layers import Linear, Parameter, Layer, Embedding
12 | from functions import mean_squared_error, sigmoid, matmul, Function, cat, softmax, dropout, stack
13 | from datasets import Dataset
14 | from dataloaders import DataLoader
15 | from dataclasses import dataclass
16 |
17 | # Chinese stories: https://github.com/chenyangMl/llama2.c-zh
18 | # Chinese medical data: https://huggingface.co/datasets/shibing624/medical
19 |
20 |
21 | import os
22 |
23 |
24 | # make only one GPU visible
25 | # os.environ['CUDA_VISIBLE_DEVICES'] = '2'
26 |
27 |
28 | class Tokenizer:
29 | def __init__(self, model_path: str):
30 | with open(model_path, "r", encoding="utf-8") as f:
31 | model = json.load(f)
32 | self.vocab = model["tokens"]
33 | self.scores = model["scores"]
34 | self.pad_id = 0
35 | self.bos_id = 1
36 | self.eos_id = 2
37 | self.n_words = len(self.vocab)
38 | special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"]
39 | self.special_tokens = {}
40 | self.index_special_tokens = {}
41 | for token in special_tokens:
42 | self.special_tokens[token] = self.n_words
43 | self.index_special_tokens[self.n_words] = token
44 | self.n_words += 1
45 |
46 | def str_lookup(self, token: str) -> int:
47 | try:
48 | index = self.vocab.index(token)
49 | return index
50 | except ValueError as err:
51 | return -1
52 |
53 | def encode(
54 | self,
55 | text: str,
56 | add_bos: bool = True,
57 | add_eos: bool = False,
58 | add_prefix: bool = True,
59 | add_new_bos: bool = False,
60 | ) -> list[int]:
61 | tokens = []
62 | for pos, char in enumerate(text):
63 | id = self.str_lookup(char)
64 | if id >= 0:
65 | tokens.append(id)
66 | else:
67 | tokens = tokens + [(i + 3) for i in char.encode()]
68 | while True:
69 | best_score = -1e10
70 | best_id = -1
71 | best_idx = -1
72 |
73 | for i in range(len(tokens) - 1):
74 | # Check if we can merge the pair (tokens[i], tokens[i+1])
75 | string = self.vocab[tokens[i]] + self.vocab[tokens[i + 1]]
76 | id = self.str_lookup(string)
77 | if id != -1 and self.scores[id] > best_score:
78 | best_score = self.scores[id]
79 | best_id = id
80 | best_idx = i
81 |
82 | if best_idx == -1:
83 | break
84 |
85 | # Merge the consecutive pair (best_idx, best_idx+1) into new token best_id
86 | tokens[best_idx] = best_id
87 | # Delete token at position best_idx+1, shift the entire sequence back 1
88 | tokens = tokens[0: best_idx + 1] + tokens[best_idx + 2:]
89 | if add_bos:
90 | tokens.insert(0, self.bos_id)
91 | if add_eos:
92 | tokens.append(self.eos_id)
93 | if add_prefix:
94 | tokens.insert(0, self.special_tokens['sop'])
95 | tokens.insert(0, self.special_tokens['[gMASK]'])
96 | if add_new_bos:
97 | tokens.append(self.bos_id)
98 | return tokens
99 |
100 | def decode(self, ids: list[int]) -> str:
101 | res = []
102 | for i in ids:
103 | token = self.vocab[i]
104 | res.append(token)
105 | text = "".join(res)
106 | text = text.strip("").strip("")
107 | return text
108 |
109 |
110 | class SelfAttention(Model):
111 | def __init__(self,
112 | args: 'LLaMaArgs',
113 | rope_apply: Callable):
114 | super(SelfAttention, self).__init__()
115 |
116 | assert args.num_heads * args.head_dim == args.hidden_size
117 | assert args.num_heads % args.num_key_value_heads == 0
118 | assert args.head_dim % 2 == 0
119 |
120 | self.max_len = args.max_len
121 | self.max_batch_size = args.max_batch_size
122 | self.enable_kv_cache = args.enable_kv_cache
123 | self.use_gpu = args.use_gpu
124 |
125 | self.hidden_size = args.hidden_size
126 | self.num_heads = args.num_heads
127 | self.head_dim = args.head_dim
128 | self.num_key_value_heads = args.num_key_value_heads
129 | self.attention_bias = args.attention_bias
130 | self.dropout_ratio = args.dropout_ratio
131 |
132 | self.dropout_on = args.dropout_ratio != 0
133 | self.kv_repeat_num = self.num_heads // self.num_key_value_heads
134 |
135 | self.rope_apply = rope_apply
136 |
137 | self.q_proj = Linear(in_size=self.hidden_size, out_size=self.num_heads * self.head_dim,
138 |                              nobias=not self.attention_bias)
139 |
140 | self.k_proj = Linear(in_size=self.hidden_size, out_size=self.num_key_value_heads * self.head_dim,
141 |                              nobias=not self.attention_bias)
142 |
143 | self.v_proj = Linear(in_size=self.hidden_size, out_size=self.num_key_value_heads * self.head_dim,
144 |                              nobias=not self.attention_bias)
145 |
146 |         self.o_proj = Linear(in_size=self.hidden_size, out_size=self.hidden_size, nobias=not self.attention_bias)
147 |
148 | if self.enable_kv_cache:
149 | self.k_cache = Variable(np.zeros([self.max_batch_size, self.num_key_value_heads, 0, self.head_dim]))
150 | self.v_cache = Variable(np.zeros([self.max_batch_size, self.num_key_value_heads, 0, self.head_dim]))
151 | if self.use_gpu:
152 | self.k_cache.to_gpu()
153 | self.v_cache.to_gpu()
154 |
155 | def forward(self, x, cos_pos, sin_pos):
156 | batch_size = x.shape[0]
157 | length = x.shape[1]
158 | # embed_dim = x.shape[2]
159 |
160 | q = self.q_proj(x)
161 | k = self.k_proj(x)
162 | v = self.v_proj(x)
163 | # [batch_size, length, hidden_size]
164 |
165 | q = q.reshape(batch_size, length, self.num_heads, self.head_dim).transpose(0, 2, 1, 3)
166 | k = k.reshape(batch_size, length, self.num_key_value_heads, self.head_dim).transpose(0, 2, 1, 3)
167 | v = v.reshape(batch_size, length, self.num_key_value_heads, self.head_dim).transpose(0, 2, 1, 3)
168 | # [batch_size, length, num_heads, head_dim]
169 | # [batch_size, num_heads, length, head_dim]
170 |
171 | # q,k rope finish
172 | # q = apply_RoPE(q, cos_pos, sin_pos)
173 | # k = apply_RoPE(k, cos_pos, sin_pos)
174 | q = self.rope_apply(q, cos_pos, sin_pos)
175 | k = self.rope_apply(k, cos_pos, sin_pos)
176 |
177 | if self.enable_kv_cache:
178 | start_pos = self.k_cache.shape[2]
179 | else:
180 | start_pos = 0
181 |
182 | if self.enable_kv_cache:
183 | self.k_cache = cat((self.k_cache, k), axis=2)
184 | self.v_cache = cat((self.v_cache, v), axis=2)
185 | k = self.k_cache
186 | v = self.v_cache
187 |
188 | # print(k[0, 0])
189 | # print(v[0, 0])
190 |
191 |         # if the KV head count differs from the query head count, repeat K/V before the matmul (num_heads // num_key_value_heads times)
192 | if self.num_heads != self.num_key_value_heads:
193 | k = k[:, np.arange(self.num_key_value_heads).repeat(self.kv_repeat_num), :, :]
194 | v = v[:, np.arange(self.num_key_value_heads).repeat(self.kv_repeat_num), :, :]
195 |
196 | attention_weight = matmul(q, k.transpose(0, 1, 3, 2)) / np.sqrt(self.head_dim)
197 |
198 | mask = np.full((length, length), -np.inf)
199 | mask = np.triu(mask, k=1)
200 | mask = np.concatenate((np.zeros((length, start_pos)), mask), axis=1)
201 |
202 | if self.use_gpu:
203 | from cuda import as_cupy
204 | mask = as_cupy(mask)
205 |
206 | attention_weight = attention_weight + mask
207 |
208 | attention_weight = softmax(attention_weight, axis=-1)
209 |
210 | if self.dropout_on:
211 | attention_weight = dropout(attention_weight, self.dropout_ratio)
212 |
213 | output = matmul(attention_weight, v) # (bzs, num_heads, length, head_dim)
214 | output = output.transpose(0, 2, 1, 3).reshape(batch_size, length, self.hidden_size)
215 | # (bzs, length, embed_dim)
216 | output = self.o_proj(output)
217 |
218 | return output
219 |
220 |
221 | class SiLU(Function):
222 | def forward(self, x: np.ndarray) -> tuple[np.ndarray]:
223 | self.sigmoid = 1 / (1 + np.exp(-x))
224 | y = x * self.sigmoid
225 | return y
226 |
227 | def backward(self, gy: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]:
228 | y = self.outputs[0]()
229 | gx = gy * (y + self.sigmoid * (1 - y))
230 | # y'=(xs)'=s+xs(1-s)=s+xs-xss=xs+s(1-xs)=y+s(1-y)
231 | return gx
232 |
233 |
234 | def silu(x):
235 | return SiLU()(x)
236 |
237 |
238 | class SwiGLUFeedForwardNetwork(Model):
239 | def __init__(self, hidden_size: int, intermediate_size: int, use_bias: bool = False):
240 | super(SwiGLUFeedForwardNetwork, self).__init__()
241 |         self.fc_gate = Linear(in_size=hidden_size, out_size=intermediate_size, nobias=not use_bias)
242 |         self.fc_up = Linear(in_size=hidden_size, out_size=intermediate_size, nobias=not use_bias)
243 |         self.fc_down = Linear(in_size=intermediate_size, out_size=hidden_size, nobias=not use_bias)
244 |
245 | def forward(self, x):
246 | x1 = self.fc_up(x)
247 | x = silu(self.fc_gate(x))
248 | x = x * x1
249 | x = self.fc_down(x)
250 | return x
251 |
252 |
253 | # class RMSNorm(Layer):
254 | # def __init__(self, hidden_size: int, eps: float = 1e-6):
255 | # super(RMSNorm, self).__init__()
256 | # self.weight = Parameter(np.ones(hidden_size), 'weight')
257 | # self.epsilon = eps
258 | #
259 | # def forward(self, x):
260 | # x_shape = x.shape
261 | # x = x * ((x ** 2).sum(axis=x.ndim - 1) / x_shape[-1] + self.epsilon).reshape(*(x_shape[:-1] + (1,))) ** (-1 / 2)
262 | # x = self.weight * x
263 | # return x
264 |
265 | class RoPELlama:
266 | def __init__(self,
267 | max_len: int,
268 | output_dim: int,
269 | rope_theta: float = 10000.0):
270 | self.max_len = max_len
271 | self.output_dim = output_dim
272 | self.rope_theta = rope_theta
273 |
274 | def apply(q: Variable, cos_pos: np.ndarray, sin_pos: np.ndarray):
275 | q2 = stack((-q[..., 1::2], q[..., ::2]), axis=-1)
276 | q2 = q2.reshape(q.shape)
277 | q = q * cos_pos + q2 * sin_pos
278 | return q
279 |
280 | self.apply = apply
281 |
282 | def get_cos_sin(self):
283 | position = np.arange(0, self.max_len, dtype=np.float32)[..., np.newaxis]
284 | ids = np.arange(0, self.output_dim // 2, dtype=np.float32)
285 | theta = self.rope_theta ** (-2 * ids / self.output_dim)
286 | embeddings = position * theta
287 | # (max_len, output_dim//2, 2)
288 | embeddings = np.stack([np.sin(embeddings), np.cos(embeddings)], axis=-1)
289 | # (bs, head, max_len, output_dim//2, 2)
290 | embeddings = np.tile(embeddings,
291 |                              (1, 1, *([1] * len(embeddings.shape))))  # add singleton batch/head dims; all tile counts are 1, so nothing is actually repeated
292 |         # (bs, head, max_len, output_dim)
293 |         # after the reshape, even positions hold sin and odd positions hold cos
294 |         embeddings = np.reshape(embeddings, (1, 1, self.max_len, self.output_dim))
295 |         cos_pos = embeddings[..., 1::2].repeat(2, axis=-1)  # take the odd columns (cos) and duplicate each of them
296 |         sin_pos = embeddings[..., ::2].repeat(2, axis=-1)  # take the even columns (sin) and duplicate each of them
297 | return cos_pos, sin_pos
298 |
299 |
300 | class RoPEHF:
301 | def __init__(self,
302 | max_len: int,
303 | output_dim: int,
304 | rope_theta: float = 500000.0):
305 | self.max_len = max_len
306 | self.output_dim = output_dim
307 | self.rope_theta = rope_theta
308 |
309 | def apply(q: Variable, cos_pos: np.ndarray, sin_pos: np.ndarray):
310 | q2 = cat((-q[..., q.shape[-1] // 2:], q[..., : q.shape[-1] // 2]), axis=-1)
311 | q = q * cos_pos + q2 * sin_pos
312 | return q
313 |
314 | self.apply = apply
315 |
316 | def get_cos_sin(self):
317 | # HF
318 | position = np.arange(0, self.max_len, dtype=np.float32)[..., np.newaxis]
319 | ids = np.arange(0, self.output_dim // 2, dtype=np.float32)
320 | theta = self.rope_theta ** (-2 * ids / self.output_dim)
321 | embeddings = position * theta
322 | embeddings = np.concatenate((embeddings, embeddings), axis=-1)[np.newaxis, np.newaxis, :, :]
323 | cos_pos = np.cos(embeddings)
324 | sin_pos = np.sin(embeddings)
325 | return cos_pos, sin_pos
326 |
327 |
328 | # def sinusoidal_position_embedding(batch_size: int,
329 | # nums_head: int,
330 | # max_len: int,
331 | # output_dim: int,
332 | # rope_theta: float = 10000.0):
333 | # # (max_len, 1)
334 | # position = np.arange(0, max_len, dtype=np.float32)[..., np.newaxis]
335 | # # (output_dim//2)
336 | #     ids = np.arange(0, output_dim // 2, dtype=np.float32)  # i in the formula; i ranges over [0, d/2)
337 | # theta = rope_theta ** (-2 * ids / output_dim)
338 | #
339 | # # (max_len, output_dim//2)
340 | #     embeddings = position * theta  # this is pos / (10000^(2i/d)) from the formula
341 | #
342 | # # (max_len, output_dim//2, 2)
343 | # embeddings = np.stack([np.sin(embeddings), np.cos(embeddings)], axis=-1)
344 | #
345 | # # (bs, head, max_len, output_dim//2, 2)
346 | #
347 | #     embeddings = np.tile(embeddings, (batch_size, nums_head, *([1] * len(embeddings.shape))))  # repeat along the batch and head dims; all other tile counts are 1
348 | #
349 | # # (bs, head, max_len, output_dim)
350 | # # reshape后就是:偶数sin, 奇数cos了
351 | #     # after the reshape, even positions hold sin and odd positions hold cos
352 | # embeddings = np.reshape(embeddings, (batch_size, nums_head, max_len, output_dim))
353 | # return embeddings
354 | #
355 | #
356 | # def apply_RoPE(q: Variable, cos_pos: np.ndarray, sin_pos: np.ndarray):
357 | # q2 = stack((-q[..., 1::2], q[..., ::2]), axis=-1)
358 | #     q2 = q2.reshape(q.shape)  # after the reshape the signs alternate with the original entries
359 | # q = q * cos_pos + q2 * sin_pos
360 | # return q
361 |
362 |
363 | class TransformerDecoderBlock(Model):
364 | def __init__(self,
365 | args: 'LLaMaArgs',
366 | rope_apply: Callable):
367 | super(TransformerDecoderBlock, self).__init__()
368 |
369 | self.max_len = args.max_len
370 | self.max_batch_size = args.max_batch_size
371 | self.enable_kv_cache = args.enable_kv_cache
372 |
373 | self.hidden_size = args.hidden_size
374 |
375 | self.num_heads = args.num_heads
376 | self.head_dim = args.head_dim
377 | self.num_key_value_heads = args.num_key_value_heads
378 | self.attention_bias = args.attention_bias
379 | self.dropout_ratio = args.dropout_ratio
380 |
381 | self.ffn_intermediate_size = args.ffn_intermediate_size
382 | self.ffn_bias = args.ffn_bias
383 |
384 | self.rms_eps = args.rms_eps
385 |
386 | self.multi_head_self_attention = SelfAttention(args=args, rope_apply=rope_apply)
387 | self.ffn = SwiGLUFeedForwardNetwork(hidden_size=self.hidden_size, intermediate_size=self.ffn_intermediate_size,
388 | use_bias=self.ffn_bias)
389 |
390 | self.rms_norm_1 = RMSNorm(hidden_size=self.hidden_size, eps=self.rms_eps)
391 | self.rms_norm_2 = RMSNorm(hidden_size=self.hidden_size, eps=self.rms_eps)
392 |
393 | def forward(self, x, cos_pos, sin_pos):
394 | x = self.multi_head_self_attention(self.rms_norm_1(x), cos_pos, sin_pos) + x
395 | x = self.ffn(self.rms_norm_2(x)) + x
396 | return x
397 |
398 |
399 | class LLaMa(Model):
400 | def __init__(self,
401 | args: 'LLaMaArgs'):
402 | super(LLaMa, self).__init__()
403 |
404 | self.max_len = args.max_len
405 | self.max_batch_size = args.max_batch_size
406 | self.enable_kv_cache = args.enable_kv_cache
407 | self.use_gpu = args.use_gpu
408 |
409 | self.vocab_size = args.vocab_size
410 | self.num_layers = args.num_layers
411 | self.hidden_size = args.hidden_size
412 |
413 | self.num_heads = args.num_heads
414 | self.head_dim = args.head_dim
415 | self.num_key_value_heads = args.num_key_value_heads
416 | self.attention_bias = args.attention_bias
417 | self.rope_theta = args.rope_theta
418 | self.dropout_ratio = args.dropout_ratio
419 |
420 | self.ffn_intermediate_size = args.ffn_intermediate_size
421 | self.ffn_bias = args.ffn_bias
422 |
423 | self.rms_eps = args.rms_eps
424 |
425 | self.embedding = Embedding(in_size=self.vocab_size, out_size=self.hidden_size)
426 |
427 | self.rope_type = args.rope_type
428 | if self.rope_type == 'Llama':
429 | self.rope = RoPELlama(max_len=self.max_len,
430 | output_dim=self.head_dim,
431 | rope_theta=self.rope_theta)
432 | else:
433 | self.rope = RoPEHF(max_len=self.max_len,
434 | output_dim=self.head_dim,
435 | rope_theta=self.rope_theta)
436 |
437 | self.transformers = Sequential(*[
438 | TransformerDecoderBlock(args=args, rope_apply=self.rope.apply) for _ in range(self.num_layers)])
439 |
440 | self.last_rms = RMSNorm(hidden_size=self.hidden_size, eps=self.rms_eps)
441 | self.linear = Linear(in_size=self.hidden_size, out_size=self.vocab_size, nobias=True)
442 |
443 | self.weight_share = args.weight_share
444 |
445 | if self.weight_share:
446 | self.linear.W = self.embedding.W.T
447 |
448 | self.cos_pos, self.sin_pos = self.rope.get_cos_sin()
449 |
450 | if self.use_gpu:
451 | from cuda import as_cupy
452 | self.cos_pos = as_cupy(self.cos_pos)
453 | self.sin_pos = as_cupy(self.sin_pos)
454 |
455 | def forward(self, x):
456 | if self.enable_kv_cache:
457 | start_pos = self.transformers.layers[0].multi_head_self_attention.k_cache.shape[2]
458 | else:
459 | start_pos = 0
460 | now_len = x.shape[1]
461 | if start_pos + now_len >= self.max_len:
462 |             raise RuntimeError('kv cache is full')
463 | x = self.embedding(x)
464 | for layer in self.transformers.layers:
465 | x = layer(x, self.cos_pos[:, :, start_pos:(start_pos + now_len), :],
466 | self.sin_pos[:, :, start_pos:(start_pos + now_len), :])
467 | x = self.last_rms(x)
468 | x = self.linear(x[:, -1, :])
469 | # return softmax(x, 2)
470 | return x
471 |
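    # clean_kv_cache resets each layer's key/value cache to zero-length tensors,
    # so the next forward() call starts again from position 0.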
472 | def clean_kv_cache(self):
473 | if self.enable_kv_cache:
474 | for i in self.transformers.layers:
475 | if self.use_gpu:
476 | import cupy as cp
477 | i.multi_head_self_attention.k_cache = Variable(
478 | cp.zeros((self.max_batch_size, self.num_key_value_heads, 0, self.head_dim)))
479 | i.multi_head_self_attention.v_cache = Variable(
480 | cp.zeros((self.max_batch_size, self.num_key_value_heads, 0, self.head_dim)))
481 | else:
482 |                     i.multi_head_self_attention.k_cache = Variable(
483 |                         np.zeros((self.max_batch_size, self.num_key_value_heads, 0, self.head_dim)))
484 |                     i.multi_head_self_attention.v_cache = Variable(
485 |                         np.zeros((self.max_batch_size, self.num_key_value_heads, 0, self.head_dim)))
486 | print('kv cache cleaned')
487 | else:
488 | print('kv cache is not enabled')
489 |
490 | def generate(self, token: np.ndarray, max_gen: int, temperature: float, top_k: int, eos_id: int = 2):
491 | token_batch, token_len = token.shape
492 | assert token_batch == 1
493 | if token_len > self.max_len:
494 | token = token[:, (token_len - self.max_len):]
495 | token_len = self.max_len
496 |
497 | new_char = 0
498 | for i in range(max_gen):
499 | if self.enable_kv_cache:
500 | if i == 0:
501 | r = self(token)
502 | else:
503 | r = self(np.array([[new_char]]))
504 | else:
505 | r = self(token)
506 | r.to_cpu()
507 | if temperature == 0:
508 | new_char = np.argmax(r.data)
509 | new_char = int(new_char)
510 | else:
511 | new_r = r.data / temperature
512 |                 r_top_k = np.argsort(-new_r)[:, top_k - 1]  # index of the k-th largest logit
513 |                 new_r[new_r < new_r[:, r_top_k]] = -np.inf  # keep only the top_k highest logits
514 | probs = softmax(new_r).data.astype(np.float64)
515 | probs = probs / probs.sum()
516 | new_char = np.argmax(np.random.multinomial(n=1, pvals=probs[0]))
517 | new_char = int(new_char)
518 |
519 | token = np.concatenate((token, np.array([[new_char]])), axis=1)
520 | # print(tokenizer.decode([new_char]), end='')
521 | yield new_char
522 | if new_char == eos_id:
523 | break
524 | return token
525 |
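    # The sampling step above, in isolation (a sketch with made-up logits and
    # temperature/top_k values, not part of the model):
    #
    #   logits = np.array([[2.0, 1.0, 0.5, 0.1]])
    #   scaled = logits / 0.8                        # temperature
    #   kth = np.argsort(-scaled)[:, 2 - 1]          # top_k = 2: index of the 2nd-largest logit
    #   scaled[scaled < scaled[:, kth]] = -np.inf    # drop everything below it
    #   p = np.exp(scaled - scaled.max())
    #   p = p / p.sum()
    #   next_id = int(np.argmax(np.random.multinomial(n=1, pvals=p[0])))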
526 |     def chat(self, prompt: str, tokenizer: Tokenizer, max_gen: int = 500, temperature: float = 1.0, top_k: int = 100,
527 |              eos_id: int = 2):
528 |         tokens = tokenizer.encode(prompt, add_eos=False, add_new_bos=True, add_bos=False, add_prefix=False)
529 |         # tokens = tokenizer.encode(prompt, add_eos=False, add_new_bos=False, add_bos=True, add_prefix=False)
530 | tokens = np.array(tokens)[np.newaxis, ...]
531 | gen = ''
532 |         for i in self.generate(tokens, max_gen, temperature, top_k, eos_id):
533 | if i == tokenizer.eos_id:
534 | print('')
535 | new_char = tokenizer.decode([i])
536 | gen += new_char
537 | print(new_char, end='')
538 | return gen
539 |
540 |
541 | class RMSNormFunction(Function):
542 | def __init__(self, eps: float = 1e-6):
543 | self.epsilon = eps
544 |
545 |     def forward(self, x: np.ndarray, w: np.ndarray) -> np.ndarray:
546 | self.rms_inv = ((x ** 2).sum(axis=x.ndim - 1, keepdims=True) / x.shape[-1] + self.epsilon) ** (-1 / 2)
547 | self.rms_x = x * self.rms_inv
548 | y = self.rms_x * w
549 | return y
550 |
551 | def backward(self, gy: np.ndarray) -> Union[tuple[np.ndarray, ...], np.ndarray]:
552 | x, w = self.inputs
553 |         gw = (gy * self.rms_x).sum(axis=tuple(range(x.ndim - 1)))
554 | gx = gy * w * self.rms_inv - x * (self.rms_inv ** 3) * (
555 | (gy * w * x).sum(axis=x.ndim - 1, keepdims=True) / x.shape[-1])
556 | return gx, gw
557 |
558 |
559 | def rms_norm(x, w, eps=1e-6):
560 | return RMSNormFunction(eps=eps)(x, w)
561 |
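# For reference, the math implemented by RMSNormFunction, with
# r = (mean(x ** 2, last axis) + eps) ** (-1 / 2):
#   forward : y = w * x * r
#   backward: dL/dw = sum over the leading axes of gy * x * r
#             dL/dx = gy * w * r - x * r ** 3 * mean(gy * w * x, last axis)
# A commented-out finite-difference check, assuming (as elsewhere in this
# framework) that Variable.backward() seeds the output gradient with ones:
#
#   x = Variable(np.random.randn(2, 5)); w = Variable(np.ones(5))
#   y = rms_norm(x, w); y.backward()     # x.grad approximates d(sum y)/dx
#   h = 1e-5
#   xp, xm = x.data.copy(), x.data.copy()
#   xp[0, 3] += h; xm[0, 3] -= h
#   num = (rms_norm(Variable(xp), w).data.sum()
#          - rms_norm(Variable(xm), w).data.sum()) / (2 * h)
#   # num should match x.grad[0, 3] (or x.grad.data[0, 3], depending on whether
#   # gradients are stored as Variables) to a few decimal places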
562 |
563 | class RMSNorm(Layer):
564 | def __init__(self, hidden_size: int, eps: float = 1e-6):
565 | super(RMSNorm, self).__init__()
566 | self.weight = Parameter(np.ones(hidden_size), 'weight')
567 | self.epsilon = eps
568 |
569 | def forward(self, x):
570 | return rms_norm(x, self.weight, eps=self.epsilon)
571 |
572 |
573 | @dataclass
574 | class LLaMaArgs:
575 | vocab_size: int = 64783
576 | num_layers: int = 12
577 | hidden_size: int = 1024
578 | num_heads: int = 8
579 | head_dim: int = 128
580 | num_key_value_heads: int = 8
581 | attention_bias: bool = False
582 | weight_share: bool = True
583 | rope_type: Literal['Llama', 'HF'] = 'Llama'
584 | rope_theta: float = 10000.0
585 | enable_kv_cache: bool = True
586 | ffn_intermediate_size: int = 2752
587 | ffn_bias: bool = False
588 | max_len: int = 1024
589 | rms_eps: float = 1e-5
590 | dropout_ratio: float = 0.0
591 | max_batch_size: int = 1
592 | use_gpu: bool = True
593 |
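# Both presets below keep num_heads * head_dim equal to hidden_size
# (8 * 128 = 1024 and 32 * 128 = 4096) and set num_key_value_heads == num_heads,
# i.e. plain multi-head attention rather than a grouped-query configuration.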
594 |
595 | baby_llama_zh = LLaMaArgs(
596 | vocab_size=64783,
597 | num_layers=12,
598 | hidden_size=1024,
599 | num_heads=8,
600 | head_dim=128,
601 | num_key_value_heads=8,
602 | attention_bias=False,
603 | weight_share=True,
604 | rope_type='Llama',
605 | rope_theta=10000.0,
606 | enable_kv_cache=True,
607 | ffn_intermediate_size=2752,
608 | ffn_bias=False,
609 | max_len=1024,
610 | rms_eps=1e-5,
611 | dropout_ratio=0.0,
612 | max_batch_size=1,
613 | use_gpu=True,
614 | )
615 |
616 | atom_7b = LLaMaArgs(
617 | vocab_size=65000,
618 | num_layers=32,
619 | hidden_size=4096,
620 | num_heads=32,
621 | head_dim=128,
622 | num_key_value_heads=32,
623 | attention_bias=False,
624 | weight_share=False,
625 | rope_type='HF',
626 | rope_theta=500000.0,
627 | enable_kv_cache=True,
628 | ffn_intermediate_size=11008,
629 | ffn_bias=False,
630 | max_len=4096,
631 | rms_eps=1e-5,
632 | dropout_ratio=0.0,
633 | max_batch_size=1,
634 | use_gpu=True,
635 | )
636 |
637 |
638 | class Timer:
639 | def __init__(self, name: str):
640 | self.name = name
641 |
642 | def __enter__(self):
643 | self.s = time.time()
644 |
645 | def __exit__(self, exc_type, exc_val, exc_tb):
646 | self.e = time.time()
647 | print(f'{self.name} cost {self.e - self.s} seconds')
648 |
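# Usage: "with Timer('some step'): ..." prints the wall-clock seconds spent inside the block on exit.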
649 |
650 | if __name__ == '__main__':
651 | np.random.seed(114514)
652 |
653 | model_dict_atom_7b = {
654 | 'args': atom_7b,
655 | 'weights_path': 'Atom7b.npz',
656 | 'tokenizer_path': 'tokenizer_atom7b.model.np',
657 | }
658 | model_dict_baby_llama_zh = {
659 | 'args': baby_llama_zh,
660 | 'weights_path': 'WEIGHTS.npz',
661 | 'tokenizer_path': 'tokenizer_chatglm2.model.np',
662 | }
663 |
664 | model_dict = model_dict_baby_llama_zh
665 |
666 | with no_grad(), test_mode():
667 | tokenizer = Tokenizer(model_path=model_dict['tokenizer_path'])
668 |
669 |         with Timer('model init'):  # ~771 s
670 | m = LLaMa(args=model_dict['args'])
671 |
672 |         with Timer('weights load'):  # ~235 s
673 | m.load_weights(model_dict['weights_path'])
674 |
675 | if model_dict['args'].use_gpu:
676 |             with Timer('to gpu'):  # ~6 s
677 | m.to_gpu()
678 |
679 | # i = np.array([[1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007]])
680 | # print(m(i))
681 |
682 | # m1 = LLaMa(args=model_args)
683 | # m1.load_weights('WEIGHTS.npz')
684 | # m1.to_gpu()
685 | # m2 = LLaMa(args=model_args)
686 | # m2.load_weights('WEIGHTS.npz')
687 | # m2.to_gpu()
688 | # the_str = '写一篇700字以上的有关大语言模型的议论文'
689 | # print(the_str)
690 | # for i in range(100):
691 | # r1 = m1.chat(the_str, tokenizer)
692 | # r2 = m1.chat(r1, tokenizer)
693 | # the_str = r2
694 |
695 | # https://github.com/AI-Study-Han/Mini-Llama2-Chinese/tree/main
696 |
697 | # the_str = '什么是大语言模型'
698 | # print(the_str)
699 | # m.chat(the_str, tokenizer)
700 |
701 | for _ in range(100):
702 | test_str = input()
703 | if test_str == '\\clean':
704 | m.clean_kv_cache()
705 | continue
706 | if test_str == '\\stop':
707 | break
708 | m.chat(test_str, tokenizer)
709 |
--------------------------------------------------------------------------------