├── .gitignore ├── imgs ├── ad1d.png ├── ad2d.png ├── rnn.png ├── mlp_cnn.png ├── dropout_bn.png └── transformer.png ├── pytest.ini ├── .gitattributes ├── llm ├── clip │ ├── picture.png │ ├── tokenizer.py │ ├── model.py │ └── infer.py └── llama │ ├── tokenizer.py │ ├── infer.py │ └── model.py ├── pydynet ├── nn │ ├── __init__.py │ ├── parameter.py │ ├── modules │ │ ├── dropout.py │ │ ├── loss.py │ │ ├── __init__.py │ │ ├── activation.py │ │ ├── pool.py │ │ ├── linear.py │ │ ├── conv.py │ │ ├── module.py │ │ ├── norm.py │ │ └── rnn.py │ ├── init.py │ └── functional.py ├── optim │ ├── __init__.py │ ├── lr_scheduler.py │ └── optimizer.py ├── core │ ├── __init__.py │ └── function.py ├── __init__.py ├── autograd.py ├── special.py ├── cuda.py └── data.py ├── requirements.txt ├── tests ├── test_backward.py └── test_tensor_basic.py ├── setup.py ├── LICENSE ├── .github └── workflows │ └── python-publish.yml ├── examples ├── pydynet │ ├── autograd1d.py │ ├── autograd2d.py │ ├── ts_prediction.py │ ├── mnist.py │ ├── dropout_bn.py │ └── transformer.py └── pytorch │ ├── ts_prediction.py │ ├── mnist.py │ ├── dropout_bn.py │ └── transformer.py ├── cnREADME.md └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | __pycache__/ 3 | data -------------------------------------------------------------------------------- /imgs/ad1d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WeltXing/PyDyNet/HEAD/imgs/ad1d.png -------------------------------------------------------------------------------- /imgs/ad2d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WeltXing/PyDyNet/HEAD/imgs/ad2d.png -------------------------------------------------------------------------------- /imgs/rnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WeltXing/PyDyNet/HEAD/imgs/rnn.png -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | ignore::UserWarning:pydynet -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /imgs/mlp_cnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WeltXing/PyDyNet/HEAD/imgs/mlp_cnn.png -------------------------------------------------------------------------------- /imgs/dropout_bn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WeltXing/PyDyNet/HEAD/imgs/dropout_bn.png -------------------------------------------------------------------------------- /imgs/transformer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WeltXing/PyDyNet/HEAD/imgs/transformer.png -------------------------------------------------------------------------------- /llm/clip/picture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WeltXing/PyDyNet/HEAD/llm/clip/picture.png -------------------------------------------------------------------------------- /pydynet/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from .modules import * 2 | from .parameter import Parameter 3 | from . import init -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=2.0.0 2 | cupy-cuda12x # For Cuda 12.x; refer to https://docs.cupy.dev/en/stable/install.html 3 | -------------------------------------------------------------------------------- /tests/test_backward.py: -------------------------------------------------------------------------------- 1 | import sys, pytest, random 2 | import numpy as np 3 | 4 | sys.path.append('../pydynet') 5 | 6 | np.random.seed(0) 7 | random.seed(0) 8 | 9 | type_list = [np.float16, np.float32, np.float64] 10 | -------------------------------------------------------------------------------- /pydynet/optim/__init__.py: -------------------------------------------------------------------------------- 1 | from .optimizer import ( 2 | Optimizer, 3 | SGD, 4 | Adagrad, 5 | Adadelta, 6 | Adam, 7 | ) 8 | from .lr_scheduler import ( 9 | _LRScheduler, 10 | ExponentialLR, 11 | StepLR, 12 | MultiStepLR, 13 | CosineAnnealingLR, 14 | ) 15 | -------------------------------------------------------------------------------- /pydynet/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .tensor import (Tensor, add, sub, mul, div, pow, matmul, abs, sum, mean, 2 | min, max, min, argmax, argmin, maximum, minimum, exp, log, 3 | sign, reshape, transpose, swapaxes, concat, sigmoid, tanh) 4 | from .function import sqrt, square, vsplit, hsplit, dsplit, split, unsqueeze, squeeze 5 | -------------------------------------------------------------------------------- /pydynet/nn/parameter.py: -------------------------------------------------------------------------------- 1 | from ..core import Tensor 2 | 3 | 4 | class Parameter(Tensor): 5 | 6 | def __init__(self, data: Tensor, requires_grad: bool = True) -> None: 7 | super().__init__( 8 | data=data.data, 9 | dtype=data.dtype, 10 | device=data.device, 11 | copy=False, 12 | requires_grad=requires_grad, 13 | ) 14 | 15 | def __repr__(self) -> str: 16 | return "Parameter : \n{}".format(self.data) + (",\ndevice={}".format( 17 | self.device) if self.device.device != "cpu" else "") 18 | -------------------------------------------------------------------------------- /pydynet/nn/modules/dropout.py: -------------------------------------------------------------------------------- 1 | from .module import Module 2 | from ...core import Tensor 3 | from ...special import rand 4 | 5 | 6 | class Dropout(Module): 7 | 8 | def __init__(self, p: float = 0.5) -> None: 9 | super().__init__() 10 | assert p >= 0 and p < 1 11 | self.p = p 12 | 13 | def forward(self, x) -> Tensor: 14 | if self._train: 15 | mask = rand(*x.shape, device=x.device) < 1 - self.p 16 | return x * mask.astype(x.dtype) / (1 - self.p) 17 | return x 18 | 19 | def __repr__(self) -> str: 20 | return "{}(p={})".format(self.__class__.__name__, self.p) 21 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | setuptools.setup( 4 | name='pydynet', 5 | version='1.0', 6 | description= 7 | 'PyDyNet: Neuron Network (MLP, CNN, RNN, Transformer, ...) implementation using Numpy with Autodiff', 8 | author="Cun-Yuan Xing", 9 | author_email="xingcy@lamda.nju.edu.cn", 10 | maintainer="Cun-Yuan Xing", 11 | maintainer_email="xingcy@lamad.nju.edu.cn", 12 | packages=[ 13 | 'pydynet', 'pydynet/optim', 'pydynet/nn', 'pydynet/nn/modules', 14 | 'pydynet/core' 15 | ], 16 | license='MIT License', 17 | install_requires=['numpy>=2.0.0'], 18 | long_description=open('README.md', encoding='utf-8').read(), 19 | long_description_content_type="text/markdown", 20 | url='https://github.com/WeltXing/PyDyNet', 21 | ) 22 | -------------------------------------------------------------------------------- /pydynet/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import (Tensor, add, sub, mul, div, pow, matmul, abs, sum, mean, 2 | min, max, min, argmax, argmin, maximum, minimum, exp, log, 3 | sign, reshape, transpose, swapaxes, concat, sigmoid, tanh, 4 | sqrt, square, vsplit, hsplit, dsplit, split, unsqueeze, 5 | squeeze) 6 | from .special import zeros, ones, rand, randn, empty, uniform 7 | from .cuda import Device 8 | from .autograd import enable_grad, no_grad 9 | 10 | __all__ = [ 11 | "Tensor", "add", "sub", "mul", "div", "pow", "matmul", "abs", "sum", 12 | "mean", "min", "max", "argmax", "argmin", "maximum", "minimum", "exp", 13 | "log", "sign", "reshape", "transpose", "swapaxes", "concat", 'sigmoid', 14 | 'tanh', "sqrt", "square", "vsplit", "hsplit", "dsplit", "split", 15 | "unsqueeze", "squeeze", "zeros", "ones", "rand", "randn", "empty", 16 | "uniform", "Device", "enable_grad", "no_grad" 17 | ] 18 | -------------------------------------------------------------------------------- /pydynet/nn/modules/loss.py: -------------------------------------------------------------------------------- 1 | from .module import Module 2 | from .. import functional as F 3 | from ...core import Tensor 4 | 5 | 6 | class Loss(Module): 7 | '''损失函数基类''' 8 | 9 | def __init__(self, reduction='mean') -> None: 10 | super().__init__() 11 | self.reduction = reduction 12 | assert self.reduction in {'mean', 'sum'} 13 | 14 | def forward(self, y_pred: Tensor, y_true: Tensor) -> Tensor: 15 | raise NotImplementedError 16 | 17 | 18 | class MSELoss(Loss): 19 | 20 | def forward(self, y_pred: Tensor, y_true: Tensor) -> Tensor: 21 | return F.mse_loss(y_pred, y_true, reduction=self.reduction) 22 | 23 | 24 | class NLLLoss(Loss): 25 | 26 | def forward(self, y_pred: Tensor, y_true: Tensor) -> Tensor: 27 | return F.nll_loss(y_pred, y_true, reduction=self.reduction) 28 | 29 | 30 | class CrossEntropyLoss(Loss): 31 | 32 | def forward(self, y_pred: Tensor, y_true: Tensor) -> Tensor: 33 | return F.cross_entropy_loss(y_pred, y_true, reduction=self.reduction) 34 | -------------------------------------------------------------------------------- /pydynet/nn/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .activation import Sigmoid, Tanh, ReLU, LeakyReLU, Softmax 2 | from .norm import BatchNorm1d, BatchNorm2d, LayerNorm, RMSNorm 3 | from .conv import Conv1d, Conv2d 4 | from .pool import MaxPool1d, MaxPool2d, AvgPool1d, AvgPool2d 5 | from .dropout import Dropout 6 | from .linear import Linear, Embedding 7 | from .loss import MSELoss, NLLLoss, CrossEntropyLoss 8 | from .module import Module, Sequential, ModuleList 9 | from .rnn import RNN, LSTM, GRU, RNNCell, LSTMCell, GRUCell 10 | 11 | __all__ = [ 12 | "Sigmoid", 13 | "Tanh", 14 | "ReLU", 15 | "LeakyReLU", 16 | "Softmax", 17 | "BatchNorm1d", 18 | "BatchNorm2d", 19 | "LayerNorm", 20 | "RMSNorm", 21 | "Conv1d", 22 | "Conv2d", 23 | "MaxPool1d", 24 | "MaxPool2d", 25 | "AvgPool1d", 26 | "AvgPool2d", 27 | "Dropout", 28 | "Linear", 29 | "Embedding", 30 | "MSELoss", 31 | "NLLLoss", 32 | "CrossEntropyLoss", 33 | "Module", 34 | "Sequential", 35 | "ModuleList", 36 | "RNN", 37 | "LSTM", 38 | "GRU", 39 | "RNNCell", 40 | "LSTMCell", 41 | "GRUCell", 42 | ] 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Welt Xing 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /pydynet/autograd.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | grad_enable = True 4 | 5 | 6 | def is_grad_enable(): 7 | return grad_enable 8 | 9 | 10 | def set_grad_enabled(mode: bool): 11 | global grad_enable 12 | grad_enable = mode 13 | 14 | 15 | class no_grad: 16 | 17 | def __enter__(self) -> None: 18 | self.prev = is_grad_enable() 19 | set_grad_enabled(False) 20 | 21 | def __exit__(self, exc_type, exc_value, traceback) -> None: 22 | set_grad_enabled(self.prev) 23 | 24 | def __call__(self, func): 25 | 26 | @functools.wraps(func) 27 | def decorate_context(*args, **kwargs): 28 | with __class__(): 29 | return func(*args, **kwargs) 30 | 31 | return decorate_context 32 | 33 | 34 | class enable_grad: 35 | 36 | def __enter__(self) -> None: 37 | self.prev = is_grad_enable() 38 | set_grad_enabled(True) 39 | 40 | def __exit__(self, exc_type, exc_value, traceback) -> None: 41 | set_grad_enabled(self.prev) 42 | 43 | def __call__(self, func): 44 | 45 | @functools.wraps(func) 46 | def decorate_context(*args, **kwargs): 47 | with __class__(): 48 | return func(*args, **kwargs) 49 | 50 | return decorate_context 51 | -------------------------------------------------------------------------------- /pydynet/nn/modules/activation.py: -------------------------------------------------------------------------------- 1 | from .module import Module 2 | from .. import functional as F 3 | from ...core import Tensor 4 | 5 | 6 | class Sigmoid(Module): 7 | '''激活函数层 : Sigmoid''' 8 | 9 | def forward(self, x) -> Tensor: 10 | return F.sigmoid(x) 11 | 12 | def __repr__(self) -> str: 13 | return "{}()".format(self.__class__.__name__) 14 | 15 | 16 | class Tanh(Module): 17 | '''激活函数层 : Tanh''' 18 | 19 | def forward(self, x) -> Tensor: 20 | return F.tanh(x) 21 | 22 | def __repr__(self) -> str: 23 | return "{}()".format(self.__class__.__name__) 24 | 25 | 26 | class ReLU(Module): 27 | '''激活函数层 : ReLU''' 28 | 29 | def forward(self, x) -> Tensor: 30 | return F.relu(x) 31 | 32 | def __repr__(self) -> str: 33 | return "{}()".format(self.__class__.__name__) 34 | 35 | 36 | class LeakyReLU(Module): 37 | ''' 38 | 激活函数层 : LeakyReLU 39 | 40 | Parameter 41 | --------- 42 | alpha : float 43 | 负输入对应的斜率. 44 | ''' 45 | 46 | def __init__(self, alpha: float = 0.1) -> None: 47 | super().__init__() 48 | self.alpha = float(alpha) 49 | 50 | def forward(self, x) -> Tensor: 51 | return F.leaky_relu(x, self.alpha) 52 | 53 | def __repr__(self) -> str: 54 | return "{}(alpha={})".format(self.__class__.__name__, self.alpha) 55 | 56 | 57 | class Softmax(Module): 58 | ''' 59 | 激活函数层 : softmax 60 | 61 | Parameter 62 | --------- 63 | axis : Optional[Tuple[int]], default=None 64 | 沿着axis计算softmax. 65 | ''' 66 | 67 | def __init__(self, axis=None) -> None: 68 | super().__init__() 69 | self.axis = axis 70 | 71 | def forward(self, x) -> Tensor: 72 | return F.softmax(x, self.axis) 73 | 74 | def __repr__(self) -> str: 75 | return "{}(axis={})".format(self.__class__.__name__, self.axis) 76 | -------------------------------------------------------------------------------- /llm/llama/tokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import json 3 | 4 | 5 | class Tokenizer: 6 | def __init__(self, model_path: str): 7 | with open(model_path, "r", encoding="utf-8") as f: 8 | model = json.load(f) 9 | self.vocab = model["tokens"] 10 | self.scores = model["scores"] 11 | self.bos_id = 1 12 | self.eos_id = 2 13 | 14 | def str_lookup(self, token: str) -> int: 15 | try: 16 | index = self.vocab.index(token) 17 | return index 18 | except ValueError as err: 19 | return -1 20 | 21 | def encode( 22 | self, 23 | text: str, 24 | add_bos: bool = True, 25 | add_eos: bool = False, 26 | ) -> List[int]: 27 | tokens = [] 28 | for pos, char in enumerate(text): 29 | id = self.str_lookup(char) 30 | if id >= 0: 31 | tokens.append(id) 32 | while True: 33 | best_score = -1e10 34 | best_id = -1 35 | best_idx = -1 36 | 37 | for i in range(len(tokens) - 1): 38 | # Check if we can merge the pair (tokens[i], tokens[i+1]) 39 | string = self.vocab[tokens[i]] + self.vocab[tokens[i + 1]] 40 | id = self.str_lookup(string) 41 | if id != -1 and self.scores[id] > best_score: 42 | best_score = self.scores[id] 43 | best_id = id 44 | best_idx = i 45 | 46 | if best_idx == -1: 47 | break 48 | 49 | # Merge the consecutive pair (best_idx, best_idx+1) into new token best_id 50 | tokens[best_idx] = best_id 51 | # Delete token at position best_idx+1, shift the entire sequence back 1 52 | tokens = tokens[0: best_idx + 1] + tokens[best_idx + 2:] 53 | if add_bos: 54 | tokens.insert(0, self.bos_id) 55 | if add_eos: 56 | tokens.append(self.eos_id) 57 | return tokens 58 | 59 | def decode(self, ids: List[int]) -> str: 60 | res = [] 61 | for i in ids: 62 | token = self.vocab[i] 63 | res.append(token) 64 | text = "".join(res) 65 | text = text.strip("").strip("") 66 | return text 67 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package to PyPI when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: PyDyNet 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | release-build: 20 | runs-on: ubuntu-latest 21 | 22 | steps: 23 | - uses: actions/checkout@v4 24 | 25 | - uses: actions/setup-python@v5 26 | with: 27 | python-version: "3.x" 28 | 29 | - name: Build release distributions 30 | run: | 31 | # NOTE: put your own distribution build steps here. 32 | python -m pip install build 33 | python -m build 34 | 35 | - name: Upload distributions 36 | uses: actions/upload-artifact@v4 37 | with: 38 | name: release-dists 39 | path: dist/ 40 | 41 | pypi-publish: 42 | runs-on: ubuntu-latest 43 | needs: 44 | - release-build 45 | permissions: 46 | # IMPORTANT: this permission is mandatory for trusted publishing 47 | id-token: write 48 | 49 | # Dedicated environments with protections for publishing are strongly recommended. 50 | # For more information, see: https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment#deployment-protection-rules 51 | environment: 52 | name: pypi 53 | # OPTIONAL: uncomment and update to include your PyPI project URL in the deployment status: 54 | # url: https://pypi.org/p/YOURPROJECT 55 | # 56 | # ALTERNATIVE: if your GitHub Release name is the PyPI project version string 57 | # ALTERNATIVE: exactly, uncomment the following line instead: 58 | # url: https://pypi.org/project/YOURPROJECT/${{ github.event.release.name }} 59 | 60 | steps: 61 | - name: Retrieve release distributions 62 | uses: actions/download-artifact@v4 63 | with: 64 | name: release-dists 65 | path: dist/ 66 | 67 | - name: Publish release distributions to PyPI 68 | uses: pypa/gh-action-pypi-publish@release/v1 69 | with: 70 | packages-dir: dist/ 71 | -------------------------------------------------------------------------------- /examples/pydynet/autograd1d.py: -------------------------------------------------------------------------------- 1 | import pydynet as pdn 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | device = f'cuda:{pdn.cuda.device_count() - 1}' if pdn.cuda.is_available( 6 | ) else 'cpu' 7 | 8 | 9 | def auto_grad(x: float, lr: float, n_iter: int): 10 | x_list = [x] 11 | x: pdn.Tensor = pdn.Tensor(float(x), requires_grad=True, device=device) 12 | 13 | for _ in range(n_iter): 14 | x.zero_grad() 15 | y = pdn.log((x - 7)**2 + 6) 16 | y.backward() 17 | 18 | with x.device: 19 | x.data -= lr * x.grad 20 | x_list.append(x.item()) 21 | 22 | return x_list 23 | 24 | 25 | def manual_grad(x: float, lr: float, n_iter: int): 26 | x_list = [x] 27 | for _ in range(n_iter): 28 | grad = 2 * (x - 7) / ((x - 7)**2 + 6) 29 | x -= lr * grad 30 | 31 | x_list.append(x) 32 | 33 | return x_list 34 | 35 | 36 | x_ = np.linspace(0, 10, 101) 37 | f = np.log((x_ - 7)**2 + 6) 38 | 39 | x1 = np.array(auto_grad(1., 1.5, 20)) 40 | x2 = np.array(manual_grad(1., 1.5, 20)) 41 | y1 = np.log((x1 - 7)**2 + 6) 42 | y2 = np.log((x2 - 7)**2 + 6) 43 | 44 | plt.figure(figsize=(9, 3)) 45 | 46 | plt.rcParams['font.sans-serif'] = ['Times New Roman'] 47 | plt.rcParams['mathtext.fontset'] = 'stix' 48 | plt.rcParams['xtick.direction'] = 'in' 49 | plt.rcParams['ytick.direction'] = 'in' 50 | plt.rcParams['axes.linewidth'] = 0.5 51 | 52 | plt.subplot(1, 2, 1) 53 | plt.grid() 54 | 55 | plt.xlim(0, 10) 56 | plt.ylim(1.5, 4) 57 | plt.plot(x_, f, label=r"$f(x)=\log((x-7)^2+10)$", color='blue', lw=.7) 58 | plt.scatter(x1, 59 | y1, 60 | color='red', 61 | marker='^', 62 | s=50, 63 | zorder=10, 64 | label='Gradient descent with lr=1.5') 65 | 66 | plt.yticks([1.5, 2, 2.5, 3, 3.5, 4], size=13) 67 | plt.xticks([2, 4, 6, 8, 10], size=13) 68 | plt.title("Gradient descent by AutoGrad") 69 | plt.legend() 70 | 71 | plt.subplot(1, 2, 2) 72 | 73 | plt.grid() 74 | 75 | plt.xlim(0, 10) 76 | plt.ylim(1.5, 4) 77 | plt.plot(x_, f, label=r"$f(x)=\log((x-7)^2+10)$", color='blue', lw=.7) 78 | plt.scatter(x1, 79 | y1, 80 | color='green', 81 | marker='*', 82 | s=50, 83 | zorder=10, 84 | label='Gradient descent with lr=1.5') 85 | plt.yticks([1.5, 2, 2.5, 3, 3.5, 4], size=13) 86 | plt.xticks([2, 4, 6, 8, 10], size=13) 87 | plt.title("Gradient descent by Manual calculation") 88 | plt.legend() 89 | 90 | plt.savefig("imgs/ad1d.png") 91 | -------------------------------------------------------------------------------- /examples/pydynet/autograd2d.py: -------------------------------------------------------------------------------- 1 | import pydynet as pdn 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | np.random.seed(42) 6 | 7 | device = f'cuda:{pdn.cuda.device_count() - 1}' if pdn.cuda.is_available( 8 | ) else 'cpu' 9 | 10 | x = np.random.randn(2) 11 | A = pdn.Tensor([ 12 | [3, 1.], 13 | [1, 2.], 14 | ]).to(device) 15 | b = pdn.Tensor([-1., 1]).to(device) 16 | 17 | 18 | def auto_grad(x, lr: float, n_iter: float): 19 | Xs, ys = [], [] 20 | x = pdn.Tensor(x, requires_grad=True, device=device) 21 | 22 | for _ in range(n_iter): 23 | obj = x @ A @ x / 2 + b @ x 24 | obj.backward() 25 | 26 | Xs.append(x.numpy()) 27 | ys.append(obj.item()) 28 | with x.device: 29 | x.data -= lr * x.grad 30 | x.zero_grad() 31 | 32 | Xs, ys = np.array(Xs), np.array(ys) 33 | return Xs[:, 0], Xs[:, 1], ys 34 | 35 | 36 | def manual_grad(x, lr: float, n_iter: float): 37 | Xs, ys = [], [] 38 | 39 | for _ in range(n_iter): 40 | obj = x @ A @ x / 2 + b @ x 41 | 42 | Xs.append(x.copy()) 43 | ys.append(obj.item()) 44 | 45 | grad = A.numpy() @ x + b.numpy() 46 | x -= lr * grad 47 | 48 | Xs, ys = np.array(Xs), np.array(ys) 49 | return Xs[:, 0], Xs[:, 1], ys 50 | 51 | 52 | plt.rcParams['font.sans-serif'] = ['Times New Roman'] 53 | plt.rcParams['mathtext.fontset'] = 'stix' 54 | 55 | fig = plt.figure(figsize=(8, 4)) 56 | ax1 = fig.add_subplot(1, 2, 1, projection='3d') 57 | ax1.plot3D( 58 | *auto_grad(x, .1, 30), 59 | color='red', 60 | lw=0.7, 61 | label=r'$f(x)=\frac{1}{2}x^\top Ax+b^\top x$', 62 | marker='^', 63 | markersize=6, 64 | ) 65 | 66 | ax1.tick_params(direction='in') 67 | ax1.set_xlim(.45, .60) 68 | ax1.set_ylim(-.8, 0) 69 | ax1.set_zlim(-.8, -.3) 70 | ax1.set_xticks([.45, .5, .55, .6]) 71 | ax1.set_yticks([-.8, -.6, -.4, -.2, 0]) 72 | 73 | plt.title('Gradient descent by AutoGrad') 74 | plt.legend(prop={'size': 11}) 75 | 76 | ax1 = fig.add_subplot(1, 2, 2, projection='3d') 77 | ax1.plot3D( 78 | *manual_grad(x, .1, 30), 79 | color='blue', 80 | lw=0.7, 81 | label=r'$f(x)=\frac{1}{2}x^\top Ax+b^\top x$', 82 | marker='^', 83 | markersize=6, 84 | ) 85 | 86 | ax1.tick_params(direction='in') 87 | ax1.set_xlim(.45, .60) 88 | ax1.set_ylim(-.8, 0) 89 | ax1.set_zlim(-.8, -.3) 90 | ax1.set_xticks([.45, .5, .55, .6]) 91 | ax1.set_yticks([-.8, -.6, -.4, -.2, 0]) 92 | 93 | plt.title('Gradient descent by Manual calculation') 94 | plt.legend(prop={'size': 11}) 95 | 96 | plt.savefig("imgs/ad2d.png") 97 | -------------------------------------------------------------------------------- /pydynet/nn/modules/pool.py: -------------------------------------------------------------------------------- 1 | from .module import Module 2 | from .. import functional as F 3 | 4 | 5 | class MaxPool1d(Module): 6 | 7 | def __init__(self, kernel_size: int, stride: int, padding: int) -> None: 8 | super().__init__() 9 | self.kernel_size = kernel_size 10 | self.stride = stride 11 | self.padding = padding 12 | 13 | def forward(self, x): 14 | return F.max_pool1d(x, self.kernel_size, self.stride, self.padding) 15 | 16 | def __repr__(self) -> str: 17 | return "{}(kernel_size={}, stride={}, padding={})".format( 18 | self.__class__.__name__, 19 | self.kernel_size, 20 | self.stride, 21 | self.padding, 22 | ) 23 | 24 | 25 | class AvgPool1d(Module): 26 | 27 | def __init__(self, kernel_size: int, stride: int, padding: int) -> None: 28 | super().__init__() 29 | self.kernel_size = kernel_size 30 | self.stride = stride 31 | self.padding = padding 32 | 33 | def forward(self, x): 34 | return F.avg_pool1d(x, self.kernel_size, self.stride, self.padding) 35 | 36 | def __repr__(self) -> str: 37 | return "{}(kernel_size={}, stride={}, padding={})".format( 38 | self.__class__.__name__, 39 | self.kernel_size, 40 | self.stride, 41 | self.padding, 42 | ) 43 | 44 | 45 | class MaxPool2d(Module): 46 | 47 | def __init__(self, kernel_size: int, stride: int, padding: int) -> None: 48 | super().__init__() 49 | self.kernel_size = kernel_size 50 | self.stride = stride 51 | self.padding = padding 52 | 53 | def forward(self, x): 54 | return F.max_pool2d(x, self.kernel_size, self.stride, self.padding) 55 | 56 | def __repr__(self) -> str: 57 | return "{}(kernel_size={}, stride={}, padding={})".format( 58 | self.__class__.__name__, 59 | self.kernel_size, 60 | self.stride, 61 | self.padding, 62 | ) 63 | 64 | 65 | class AvgPool2d(Module): 66 | 67 | def __init__(self, kernel_size: int, stride: int, padding: int) -> None: 68 | super().__init__() 69 | self.kernel_size = kernel_size 70 | self.stride = stride 71 | self.padding = padding 72 | 73 | def forward(self, x): 74 | return F.avg_pool2d(x, self.kernel_size, self.stride, self.padding) 75 | 76 | def __repr__(self) -> str: 77 | return "{}(kernel_size={}, stride={}, padding={})".format( 78 | self.__class__.__name__, 79 | self.kernel_size, 80 | self.stride, 81 | self.padding, 82 | ) 83 | -------------------------------------------------------------------------------- /pydynet/special.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .core import Tensor 3 | 4 | 5 | # 一些包装的特殊矩阵 6 | def zeros(shape, dtype=None, device=None, requires_grad=False): 7 | '''全0张量 8 | 9 | Parameters 10 | ---------- 11 | shape : 12 | 张量形状 13 | require_grad : bool, default=False 14 | 是否需要求导 15 | ''' 16 | return Tensor(np.zeros(shape), 17 | dtype=dtype, 18 | device=device, 19 | requires_grad=requires_grad) 20 | 21 | 22 | def ones(shape, dtype=None, device=None, requires_grad=False): 23 | '''全1张量 24 | 25 | Parameters 26 | ---------- 27 | shape : 28 | 张量形状 29 | require_grad : bool, default=False 30 | 是否需要求导 31 | ''' 32 | return Tensor(np.ones(shape), 33 | dtype=dtype, 34 | device=device, 35 | requires_grad=requires_grad) 36 | 37 | 38 | def randn(*shape, dtype=None, device=None, requires_grad=False): 39 | '''0-1正态分布张量 40 | 41 | Parameters 42 | ---------- 43 | *shape : 44 | 张量形状 45 | require_grad : bool, default=False 46 | 是否需要求导 47 | ''' 48 | return Tensor(np.random.randn(*shape), 49 | dtype=dtype, 50 | device=device, 51 | requires_grad=requires_grad) 52 | 53 | 54 | def rand(*shape, dtype=None, device=None, requires_grad=False): 55 | '''[0, 1)均匀分布张量 56 | 57 | Parameters 58 | ---------- 59 | *shape : 60 | 张量形状 61 | require_grad : bool, default=False 62 | 是否需要求导 63 | ''' 64 | return Tensor(np.random.rand(*shape), 65 | dtype=dtype, 66 | device=device, 67 | requires_grad=requires_grad) 68 | 69 | 70 | def uniform(low: float, 71 | high: float, 72 | shape=None, 73 | dtype=None, 74 | device=None, 75 | requires_grad=False): 76 | '''均匀分布张量 77 | 78 | Parameters 79 | ---------- 80 | low : float 81 | 均匀分布下界; 82 | high : float 83 | 均匀分布下界; 84 | *shape : 85 | 张量形状 86 | require_grad : bool, default=False 87 | 是否需要求导 88 | ''' 89 | return Tensor(np.random.uniform(low, high, size=shape), 90 | dtype=dtype, 91 | device=device, 92 | requires_grad=requires_grad) 93 | 94 | 95 | def empty(shape, dtype=None, device=None, requires_grad=False): 96 | return Tensor(np.empty(shape, dtype=dtype), 97 | dtype=dtype, 98 | device=device, 99 | requires_grad=requires_grad) 100 | -------------------------------------------------------------------------------- /pydynet/nn/modules/linear.py: -------------------------------------------------------------------------------- 1 | from .module import Module 2 | from ..parameter import Parameter 3 | from .. import init, functional as F 4 | from ...core import Tensor 5 | from ...special import empty 6 | from ...cuda import Device 7 | from ...autograd import no_grad 8 | 9 | import math 10 | 11 | 12 | class Linear(Module): 13 | 14 | def __init__( 15 | self, 16 | in_features: int, 17 | out_features: int, 18 | bias: bool = True, 19 | device=None, 20 | dtype=None, 21 | ) -> None: 22 | super().__init__() 23 | self.in_features = in_features 24 | self.out_features = out_features 25 | kwargs = {"device": Device(device), "dtype": dtype} 26 | self.weight = Parameter( 27 | empty((self.in_features, self.out_features), **kwargs)) 28 | self.bias = Parameter(empty(self.out_features, ** 29 | kwargs)) if bias else None 30 | self.reset_paramters() 31 | 32 | def reset_paramters(self): 33 | init.kaiming_uniform_(self.weight, a=math.sqrt(5)) 34 | if self.bias is not None: 35 | fan_in, _ = init._calculate_fan(self.weight) 36 | bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 37 | init.uniform_(self.bias, -bound, bound) 38 | 39 | def forward(self, x: Tensor): 40 | return F.linear(x, self.weight, self.bias) 41 | 42 | def __repr__(self) -> str: 43 | return "Linear(in_features={}, out_features={}, bias={})".format( 44 | self.in_features, self.out_features, self.bias is not None) 45 | 46 | 47 | class Embedding(Module): 48 | 49 | def __init__( 50 | self, 51 | num_embeddings: int, 52 | embedding_dim: int, 53 | padding_idx=None, 54 | device=None, 55 | dtype=None, 56 | ) -> None: 57 | super().__init__() 58 | self.num_embedding = num_embeddings 59 | self.embedding_dim = embedding_dim 60 | self.padding_idx = padding_idx 61 | 62 | kwargs = {"device": Device(device), "dtype": dtype} 63 | self.weight = Parameter( 64 | empty((self.num_embedding, self.embedding_dim), **kwargs)) 65 | 66 | def forward(self, x: Tensor): 67 | return F.embedding(x, self.weight, self.padding_idx) 68 | 69 | def reset_parameters(self) -> None: 70 | init.normal_(self.weight) 71 | self._fill_padding_idx_with_zero() 72 | 73 | def _fill_padding_idx_with_zero(self) -> None: 74 | if self.padding_idx is not None: 75 | with no_grad(): 76 | self.weight[self.padding_idx].data = self.weight.xp.zeros( 77 | self.weight[self.padding_idx].shape, 78 | dtype=self.weight.dtype, 79 | ) 80 | -------------------------------------------------------------------------------- /pydynet/nn/init.py: -------------------------------------------------------------------------------- 1 | from ..core import Tensor 2 | from ..autograd import no_grad 3 | import math 4 | 5 | 6 | def calculate_gain(nonlinearity: str, param: float = None) -> float: 7 | return { 8 | "linear": 1, 9 | "conv1d": 1, 10 | "conv2d": 1, 11 | "sigmoid": 1, 12 | "tanh": 5 / 3, 13 | "relu": math.sqrt(2.), 14 | "leaky_relu": 15 | math.sqrt(2. / (1 + (param if param != None else 0.01)**2)) 16 | }[nonlinearity] 17 | 18 | 19 | def _calculate_fan(tensor: Tensor): 20 | assert tensor.ndim >= 2 21 | fan_in, fan_out = tensor.shape[:2] 22 | if tensor.ndim > 2: 23 | receptive_field_size = math.prod(tensor.shape[2:]) 24 | fan_in *= receptive_field_size 25 | fan_out *= receptive_field_size 26 | return fan_in, fan_out 27 | 28 | 29 | @no_grad() 30 | def uniform_(tensor: Tensor, a=0., b=1.) -> Tensor: 31 | tensor.data[...] = tensor.xp.random.uniform(a, b, tensor.shape) 32 | return tensor 33 | 34 | 35 | @no_grad() 36 | def normal_(tensor: Tensor, mean=0., std=1.) -> Tensor: 37 | tensor.data[...] = tensor.xp.random.normal(mean, std, size=tensor.shape) 38 | return tensor 39 | 40 | 41 | @no_grad() 42 | def constant_(tensor: Tensor, val: float) -> Tensor: 43 | tensor.data[...] = val 44 | return tensor 45 | 46 | 47 | def ones_(tensor: Tensor) -> Tensor: 48 | return constant_(tensor, 1.) 49 | 50 | 51 | def zeros_(tensor: Tensor) -> Tensor: 52 | return constant_(tensor, 0.) 53 | 54 | 55 | def xavier_uniform_(tensor: Tensor, gain: float = 1.) -> Tensor: 56 | fan_in, fan_out = _calculate_fan(tensor) 57 | bound = gain * math.sqrt(6. / (fan_in + fan_out)) 58 | return uniform_(tensor, -bound, bound) 59 | 60 | 61 | def xavier_normal_(tensor: Tensor, gain: float = 1.) -> Tensor: 62 | fan_in, fan_out = _calculate_fan(tensor) 63 | std = gain * math.sqrt(2 / (fan_in + fan_out)) 64 | return normal_(tensor, std=std) 65 | 66 | 67 | def kaiming_uniform_(tensor: Tensor, 68 | a: float = 0., 69 | mode='fan_in', 70 | nonlinearity='relu') -> Tensor: 71 | fan_in, fan_out = _calculate_fan(tensor) 72 | fan = { 73 | "fan_in": fan_in, 74 | "fan_out": fan_out, 75 | }[mode] 76 | gain = calculate_gain(nonlinearity, a) 77 | bound = gain * math.sqrt(3. / fan) 78 | return uniform_(tensor, -bound, bound) 79 | 80 | 81 | def kaiming_normal_(tensor: Tensor, 82 | a: float = 0., 83 | mode='fan_in', 84 | nonlinearity='relu'): 85 | fan_in, fan_out = _calculate_fan(tensor) 86 | fan = { 87 | "fan_in": fan_in, 88 | "fan_out": fan_out, 89 | }[mode] 90 | gain = calculate_gain(nonlinearity, a) 91 | std = gain / math.sqrt(fan) 92 | return normal_(tensor, std=std) 93 | -------------------------------------------------------------------------------- /pydynet/cuda.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import warnings 3 | 4 | try: 5 | import cupy as cp 6 | cuda_available: bool = True 7 | except ModuleNotFoundError: 8 | warnings.warn( 9 | "Cupy is not installed. You can install it with:\n" 10 | " pip install cupy-cuda12x # or appropriate version for your CUDA", 11 | category=UserWarning) 12 | cuda_available: bool = False 13 | cp = object() 14 | 15 | 16 | def is_available() -> bool: 17 | return cuda_available 18 | 19 | 20 | def device_count() -> int: 21 | if is_available(): 22 | return cp.cuda.runtime.getDeviceCount() 23 | else: 24 | return 0 25 | 26 | 27 | def current_device() -> int: 28 | return cp.cuda.runtime.getDevice() 29 | 30 | 31 | def set_device(device: int) -> None: 32 | return cp.cuda.runtime.setDevice(device) 33 | 34 | 35 | class Device: 36 | 37 | def __init__(self, device=None) -> None: 38 | if isinstance(device, str): 39 | if device == "cpu": 40 | self.device = "cpu" 41 | elif device[:4] == "cuda": 42 | self.device = "cuda" 43 | if len(device) == 4: 44 | device += ':0' 45 | 46 | cuda_id = device.split(':')[-1] 47 | if not cuda_id.isdigit(): 48 | raise ValueError(f'Wrong cuda id \"{cuda_id}\"!') 49 | 50 | self.device_id = int(cuda_id) 51 | else: 52 | raise ValueError(f"Unknown device \"{device}\"!") 53 | 54 | elif isinstance(device, int): 55 | self.device = "cuda" 56 | self.device_id = device 57 | 58 | elif device is None: 59 | self.device = "cpu" 60 | 61 | elif isinstance(device, Device): 62 | self.device = device.device 63 | if self.device != "cpu": 64 | self.device_id = device.device_id 65 | 66 | if self.device == "cuda": 67 | if not is_available(): 68 | raise RuntimeError( 69 | "Cuda device is not supported on this system.") 70 | self.device = cp.cuda.Device(self.device_id) 71 | assert self.device == "cpu" or is_available() 72 | 73 | def __repr__(self) -> str: 74 | if self.device == "cpu": 75 | return "Device(type='cpu')" 76 | else: 77 | return "Device(type='cuda', index={})".format(self.device_id) 78 | 79 | def __eq__(self, device) -> bool: 80 | if not isinstance(device, Device): 81 | device = Device(device) 82 | if self.device == "cpu": 83 | return device.device == "cpu" 84 | else: 85 | if device.device == "cpu": 86 | return False 87 | return self.device == device.device 88 | 89 | @property 90 | def xp(self): 91 | return np if self.device == "cpu" else cp 92 | 93 | def __enter__(self): 94 | if self.device != "cpu" and self.device_id != current_device(): 95 | return self.device.__enter__() 96 | 97 | def __exit__(self, type, value, trace): 98 | if self.device != "cpu" and self.device_id != current_device(): 99 | return self.device.__exit__(type, value, trace) 100 | -------------------------------------------------------------------------------- /pydynet/data.py: -------------------------------------------------------------------------------- 1 | from numpy.random import permutation 2 | 3 | 4 | class Dataset: 5 | 6 | def __init__(self) -> None: 7 | pass 8 | 9 | def __getitem__(self, index): 10 | raise NotImplementedError 11 | 12 | def __len__(self): 13 | raise NotImplementedError 14 | 15 | 16 | class Sampler: 17 | 18 | def __init__(self, dataset: Dataset) -> None: 19 | pass 20 | 21 | def __iter__(self): 22 | raise NotImplementedError 23 | 24 | 25 | class SequentialSampler(Sampler): 26 | 27 | def __init__(self, dataset: Dataset) -> None: 28 | self.dataset = dataset 29 | 30 | def __iter__(self): 31 | return iter(range(len(self.dataset))) 32 | 33 | def __len__(self) -> int: 34 | return len(self.dataset) 35 | 36 | 37 | class RandomSampler(Sampler): 38 | 39 | def __init__(self, dataset: Dataset) -> None: 40 | self.dataset = dataset 41 | 42 | def __iter__(self): 43 | yield from permutation(len(self.dataset)).tolist() 44 | 45 | def __len__(self): 46 | return len(self.dataset) 47 | 48 | 49 | class BatchSampler(Sampler): 50 | 51 | def __init__(self, sampler: Sampler, batch_size: int, 52 | drop_last: bool) -> None: 53 | self.sampler = sampler 54 | self.batch_size = batch_size 55 | self.drop_last = drop_last 56 | 57 | def __iter__(self): 58 | batch = [] 59 | for idx in self.sampler: 60 | batch.append(idx) 61 | if len(batch) == self.batch_size: 62 | yield batch 63 | batch = [] 64 | if len(batch) > 0 and not self.drop_last: 65 | yield batch 66 | 67 | def __len__(self): 68 | if self.drop_last: 69 | return len(self.sampler) // self.batch_size 70 | return (len(self.sampler) + self.batch_size - 1) // self.batch_size 71 | 72 | 73 | class _DataLoaderIter: 74 | 75 | def __init__(self, loader) -> None: 76 | self.loader = loader 77 | self.sample_iter = iter(self.loader.batch_sampler) 78 | 79 | def __next__(self): 80 | index = next(self.sample_iter) 81 | return self.loader.dataset[index] 82 | 83 | 84 | class DataLoader: 85 | 86 | def __init__(self, 87 | dataset: Dataset, 88 | batch_size: int = 1, 89 | shuffle: bool = False, 90 | drop_last: bool = False) -> None: 91 | self.dataset = dataset 92 | self.batch_size = batch_size 93 | self.shuffle = shuffle 94 | self.drop_last = drop_last 95 | 96 | if shuffle: 97 | self.sampler = RandomSampler(dataset) 98 | else: 99 | self.sampler = SequentialSampler(dataset) 100 | 101 | self.batch_sampler = BatchSampler(self.sampler, batch_size, drop_last) 102 | self.batch_size = batch_size 103 | self.drop_last = drop_last 104 | 105 | def __iter__(self): 106 | return _DataLoaderIter(self) 107 | 108 | 109 | def data_loader(X, y, batch_size: int, shuffle: bool = False) -> list: 110 | 111 | class TrainSet(Dataset): 112 | 113 | def __init__(self, X, y) -> None: 114 | self.data = X 115 | self.target = y 116 | 117 | def __getitem__(self, index): 118 | return self.data[index], self.target[index] 119 | 120 | def __len__(self): 121 | return len(self.data) 122 | 123 | return DataLoader(TrainSet(X, y), batch_size, shuffle) 124 | -------------------------------------------------------------------------------- /llm/llama/infer.py: -------------------------------------------------------------------------------- 1 | import sys, time, argparse 2 | from .tokenizer import Tokenizer 3 | from .model import Llama 4 | 5 | import pydynet as pdn 6 | import numpy as np 7 | 8 | 9 | @pdn.no_grad() 10 | def load_model(llama: Llama, model_path: str) -> Llama: 11 | weight = np.load(model_path) 12 | 13 | llama.tok_embedding.weight.data[...] = weight['model.embed_tokens.weight'] 14 | llama.lm_head.weight.data[...] = weight['lm_head.weight'].T 15 | 16 | for i in range(llama.n_layers): 17 | ( 18 | llama.layers[i].attention.Q.weight.data[...], 19 | llama.layers[i].attention.K.weight.data[...], 20 | llama.layers[i].attention.V.weight.data[...], 21 | llama.layers[i].attention.O.weight.data[...], 22 | llama.layers[i].ffn.up.weight.data[...], 23 | llama.layers[i].ffn.gate.weight.data[...], 24 | llama.layers[i].ffn.down.weight[...], 25 | llama.layers[i].input_norm.weight.data[...], 26 | llama.layers[i].post_attn_norm.weight.data[...], 27 | ) = ( 28 | weight[f'model.layers.{i}.self_attn.q_proj.weight'].T, 29 | weight[f'model.layers.{i}.self_attn.k_proj.weight'].T, 30 | weight[f'model.layers.{i}.self_attn.v_proj.weight'].T, 31 | weight[f'model.layers.{i}.self_attn.o_proj.weight'].T, 32 | weight[f'model.layers.{i}.mlp.up_proj.weight'].T, 33 | weight[f'model.layers.{i}.mlp.gate_proj.weight'].T, 34 | weight[f'model.layers.{i}.mlp.down_proj.weight'].T, 35 | weight[f'model.layers.{i}.input_layernorm.weight'], 36 | weight[f'model.layers.{i}.post_attention_layernorm.weight'], 37 | ) 38 | 39 | llama.norm.weight.data[...] = weight['model.norm.weight'] 40 | 41 | return llama 42 | 43 | 44 | if __name__ == '__main__': 45 | parser = argparse.ArgumentParser( 46 | description="Prompt input, e.g. There was a boy") 47 | parser.add_argument("--prompt", type=str, default='There was a boy') 48 | parser.add_argument("--cuda", action='store_true') 49 | args = parser.parse_args() 50 | 51 | dim: int = 288 # D 52 | n_layers: int = 6 53 | n_heads: int = 6 54 | vocab_size: int = 32000 # VS 55 | max_seq_len: int = 1024 # M 56 | max_new_tokens: int = 1024 57 | max_batch_size: int = 1 58 | datatype = np.float32 59 | 60 | tokenizer = Tokenizer("llm/llama/data/tokenizer.model.np") 61 | model = load_model( 62 | Llama(vocab_size, 63 | dim, 64 | n_heads, 65 | 768, 66 | max_seq_len, 67 | max_batch_size, 68 | n_layers, 69 | dtype=datatype), "llm/llama/data/stories15M.model.npz") 70 | 71 | # If cuda is available 72 | if args.cuda and pdn.cuda.is_available(): 73 | model: Llama = model.to('cuda:2') 74 | 75 | model.eval() 76 | with pdn.no_grad(): 77 | print(f"\n{args.prompt}", end="") 78 | input_ids = np.array([tokenizer.encode(args.prompt)]) 79 | 80 | _, L = input_ids.shape 81 | start = time.time() 82 | for id in model.generate(input_ids, max_new_tokens): 83 | L += 1 84 | output_id = id[0].numpy().tolist() 85 | 86 | if output_id[-1] in [tokenizer.eos_id, tokenizer.bos_id]: 87 | break 88 | print(tokenizer.decode(output_id), end="") 89 | sys.stdout.flush() 90 | elapsed = time.time() - start 91 | print( 92 | f"\n\nToken count: {L}, elapsed: {elapsed:.2f}s, {round(L / elapsed)} tokens/s" 93 | ) 94 | -------------------------------------------------------------------------------- /examples/pytorch/ts_prediction.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.model_selection import train_test_split 4 | from tqdm import tqdm 5 | 6 | import torch 7 | import pydynet.nn as nn 8 | from pydynet.optim import Adam 9 | 10 | 11 | def windowize(y, input_len, horizon=1, stride=1, step=1): 12 | 13 | y = np.asarray(y) 14 | max_i = len(y) - (input_len + horizon) * step + step 15 | idx_inputs = [] 16 | idx_targets = [] 17 | for i in range(0, max_i, stride): 18 | inp_idx = i + np.arange(0, input_len * step, step) 19 | tgt_idx = i + input_len * step + np.arange(0, horizon * step, step) 20 | idx_inputs.append(inp_idx) 21 | idx_targets.append(tgt_idx) 22 | X = y[np.array(idx_inputs)] 23 | Y = y[np.array(idx_targets)] 24 | return ( 25 | torch.tensor(X[..., np.newaxis]), 26 | torch.tensor(Y), 27 | ) 28 | 29 | 30 | TIME_STEP = 40 # rnn 时序步长数 31 | INPUT_SIZE = 1 # rnn 的输入维度 32 | H_SIZE = 32 # rnn 隐藏单元个数 33 | EPOCHS = 50 # 总共训练次数 34 | h_state = None # 隐藏层状态 35 | 36 | 37 | def f(t): 38 | return np.sin(np.pi * t) + 0.5 * np.cos(2 * np.pi * t) 39 | 40 | 41 | steps = np.arange(0, 100, .05) 42 | X, Y = windowize(f(steps), input_len=TIME_STEP, horizon=1, stride=1, step=1) 43 | 44 | X_train, X_test, Y_train, Y_test = train_test_split( 45 | X, 46 | Y, 47 | test_size=0.2, 48 | random_state=42, 49 | ) 50 | 51 | 52 | class RNN(nn.Module): 53 | 54 | def __init__(self): 55 | super(RNN, self).__init__() 56 | self.rnn = nn.GRU( 57 | input_size=INPUT_SIZE, 58 | hidden_size=H_SIZE, 59 | num_layers=1, 60 | batch_first=True, 61 | dtype=np.float32, 62 | ) 63 | self.out = nn.Linear(H_SIZE, 1) 64 | 65 | def forward(self, x, h_state): 66 | _, h_state = self.rnn(x, h_state) 67 | out = self.out(h_state[:, self.rnn.num_layers - 1, :]) 68 | return out 69 | 70 | 71 | rnn = RNN() 72 | optimizer = Adam(rnn.parameters(), lr=0.01) 73 | criterion = nn.MSELoss() 74 | 75 | loss_list = [] 76 | 77 | plt.rcParams['font.sans-serif'] = ['Times New Roman'] 78 | plt.rcParams['mathtext.fontset'] = 'stix' 79 | plt.rcParams['xtick.direction'] = 'in' 80 | plt.rcParams['ytick.direction'] = 'in' 81 | plt.rcParams['axes.linewidth'] = 0.5 82 | 83 | bar = tqdm(range(EPOCHS)) 84 | visual_steps = np.arange(0, 10, .05) 85 | visual_X, visual_Y = windowize(f(visual_steps), 86 | TIME_STEP, 87 | horizon=1, 88 | stride=1, 89 | step=1) 90 | 91 | for step in bar: 92 | 93 | rnn.train() 94 | prediction = rnn(X_train, h_state) 95 | train_loss = criterion(prediction, Y_train) 96 | 97 | optimizer.zero_grad() 98 | train_loss.backward() 99 | optimizer.step() 100 | 101 | plt.figure(figsize=(5, 3)) 102 | plt.grid() 103 | 104 | rnn.eval() 105 | with torch.no_grad(): 106 | test_loss = criterion(rnn(X_test, h_state), Y_test) 107 | 108 | plt.plot(visual_steps[TIME_STEP:], 109 | visual_Y.numpy(), 110 | 'r-', 111 | lw=0.7, 112 | label=r'$f(x)=\sin(\pi x)+\cos(2\pi x)/2$') 113 | plt.plot( 114 | visual_steps[TIME_STEP:], 115 | rnn(visual_X, h_state).numpy(), 116 | 'b-.', 117 | lw=0.7, 118 | label='Prediction', 119 | ) 120 | 121 | plt.xticks([4, 6, 8, 10]) 122 | plt.yticks([-1.6, -.8, 0, .8]) 123 | 124 | plt.legend(loc=1) 125 | plt.ylim(-1.6, 0.8) 126 | plt.xlim(visual_steps[TIME_STEP], 10) 127 | plt.title('Prediction with GRU') 128 | plt.tight_layout() 129 | plt.savefig("imgs/rnn.png") 130 | plt.close() 131 | 132 | bar.set_postfix( 133 | train_loss="{:.5f}".format(train_loss.item()), 134 | test_loss="{:.5f}".format(test_loss.item()), 135 | ) 136 | -------------------------------------------------------------------------------- /pydynet/nn/modules/conv.py: -------------------------------------------------------------------------------- 1 | from .module import Module 2 | from ..parameter import Parameter 3 | from .. import init 4 | from .. import functional as F 5 | from ...special import empty 6 | from ...cuda import Device 7 | 8 | import math 9 | 10 | 11 | class Conv1d(Module): 12 | 13 | def __init__( 14 | self, 15 | in_channels: int, 16 | out_channels: int, 17 | kernel_size: int, 18 | stride: int = 1, 19 | padding: int = 0, 20 | bias: bool = True, 21 | device=None, 22 | dtype=None, 23 | ) -> None: 24 | super().__init__() 25 | kwargs = {"device": Device(device), "dtype": dtype} 26 | self.in_channels = in_channels 27 | self.out_channels = out_channels 28 | self.kernel_size = kernel_size 29 | self.padding = padding 30 | self.stride = stride 31 | self.weight = Parameter( 32 | empty((self.out_channels, self.in_channels, self.kernel_size), 33 | **kwargs)) 34 | self.bias = Parameter(empty( 35 | (1, self.out_channels, 1), **kwargs)) if bias else None 36 | self.reset_parameters() 37 | 38 | def reset_parameters(self): 39 | init.kaiming_uniform_(self.weight, a=math.sqrt(5)) 40 | if self.bias is not None: 41 | fan_in, _ = init._calculate_fan(self.weight) 42 | if fan_in != 0: 43 | bound = 1 / math.sqrt(fan_in) 44 | init.uniform_(self.bias, -bound, bound) 45 | 46 | def forward(self, x): 47 | conv1d = F.conv1d(x, self.weight, self.padding, self.stride) 48 | if self.bias is not None: 49 | return conv1d + self.bias 50 | return conv1d 51 | 52 | def __repr__(self) -> str: 53 | return "{}(in_channels={}, out_channels={}, kernel_size={}, padding={}, stride={}, bias={})".format( 54 | self.__class__.__name__, 55 | self.in_channels, 56 | self.out_channels, 57 | self.kernel_size, 58 | self.padding, 59 | self.stride, 60 | self.bias is not None, 61 | ) 62 | 63 | 64 | class Conv2d(Module): 65 | 66 | def __init__( 67 | self, 68 | in_channels: int, 69 | out_channels: int, 70 | kernel_size: int, 71 | stride: int = 1, 72 | padding: int = 0, 73 | bias: bool = True, 74 | device=None, 75 | dtype=None, 76 | ) -> None: 77 | super().__init__() 78 | kwargs = {"device": Device(device), "dtype": dtype} 79 | self.in_channels = in_channels 80 | self.out_channels = out_channels 81 | self.kernel_size = kernel_size 82 | self.padding = padding 83 | self.stride = stride 84 | self.weight = Parameter( 85 | empty((self.out_channels, self.in_channels, self.kernel_size, 86 | self.kernel_size), **kwargs)) 87 | self.bias = Parameter(empty( 88 | (1, self.out_channels, 1, 1), **kwargs)) if bias else None 89 | self.reset_parameters() 90 | 91 | def reset_parameters(self): 92 | init.kaiming_uniform_(self.weight, a=math.sqrt(5)) 93 | if self.bias is not None: 94 | fan_in, _ = init._calculate_fan(self.weight) 95 | if fan_in != 0: 96 | bound = 1 / math.sqrt(fan_in) 97 | init.uniform_(self.bias, -bound, bound) 98 | 99 | def forward(self, x): 100 | conv2d = F.conv2d(x, self.weight, self.padding, self.stride) 101 | if self.bias is not None: 102 | return conv2d + self.bias 103 | return conv2d 104 | 105 | def __repr__(self) -> str: 106 | return "{}(in_channels={}, out_channels={}, kernel_size={}, padding={}, stride={}, bias={})".format( 107 | self.__class__.__name__, 108 | self.in_channels, 109 | self.out_channels, 110 | self.kernel_size, 111 | self.padding, 112 | self.stride, 113 | self.bias is not None, 114 | ) 115 | -------------------------------------------------------------------------------- /examples/pydynet/ts_prediction.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.model_selection import train_test_split 4 | from tqdm import tqdm 5 | 6 | import pydynet as pdn 7 | from pydynet import Tensor 8 | import pydynet.nn as nn 9 | from pydynet.optim import Adam 10 | 11 | 12 | def windowize(y, input_len, horizon=1, stride=1, step=1): 13 | 14 | y = np.asarray(y) 15 | max_i = len(y) - (input_len + horizon) * step + step 16 | idx_inputs = [] 17 | idx_targets = [] 18 | for i in range(0, max_i, stride): 19 | inp_idx = i + np.arange(0, input_len * step, step) 20 | tgt_idx = i + input_len * step + np.arange(0, horizon * step, step) 21 | idx_inputs.append(inp_idx) 22 | idx_targets.append(tgt_idx) 23 | X = y[np.array(idx_inputs)] 24 | Y = y[np.array(idx_targets)] 25 | return ( 26 | Tensor(X[..., np.newaxis], dtype=np.float32), 27 | Tensor(Y, dtype=np.float32), 28 | ) 29 | 30 | 31 | TIME_STEP = 40 # rnn 时序步长数 32 | INPUT_SIZE = 1 # rnn 的输入维度 33 | H_SIZE = 32 # rnn 隐藏单元个数 34 | EPOCHS = 50 # 总共训练次数 35 | h_state = None # 隐藏层状态 36 | 37 | 38 | def f(t): 39 | return np.sin(np.pi * t) + 0.5 * np.cos(2 * np.pi * t) 40 | 41 | 42 | steps = np.arange(0, 100, .05) 43 | X, Y = windowize(f(steps), input_len=TIME_STEP, horizon=1, stride=1, step=1) 44 | 45 | X_train, X_test, Y_train, Y_test = train_test_split( 46 | X, 47 | Y, 48 | test_size=0.2, 49 | random_state=42, 50 | ) 51 | 52 | 53 | class RNN(nn.Module): 54 | 55 | def __init__(self): 56 | super(RNN, self).__init__() 57 | self.rnn = nn.GRU( 58 | input_size=INPUT_SIZE, 59 | hidden_size=H_SIZE, 60 | num_layers=1, 61 | batch_first=True, 62 | dtype=np.float32, 63 | ) 64 | self.out = nn.Linear(H_SIZE, 1, dtype=np.float32) 65 | 66 | def forward(self, x, h_state): 67 | _, h_state = self.rnn(x, h_state) 68 | out = self.out(h_state[:, self.rnn.num_layers - 1, :]) 69 | return out 70 | 71 | 72 | rnn = RNN() 73 | optimizer = Adam(rnn.parameters(), lr=0.01) 74 | criterion = nn.MSELoss() 75 | 76 | loss_list = [] 77 | 78 | plt.rcParams['font.sans-serif'] = ['Times New Roman'] 79 | plt.rcParams['mathtext.fontset'] = 'stix' 80 | plt.rcParams['xtick.direction'] = 'in' 81 | plt.rcParams['ytick.direction'] = 'in' 82 | plt.rcParams['axes.linewidth'] = 0.5 83 | 84 | bar = tqdm(range(EPOCHS)) 85 | visual_steps = np.arange(0, 10, .05) 86 | visual_X, visual_Y = windowize(f(visual_steps), 87 | TIME_STEP, 88 | horizon=1, 89 | stride=1, 90 | step=1) 91 | 92 | for step in bar: 93 | 94 | rnn.train() 95 | prediction = rnn(X_train, h_state) 96 | train_loss = criterion(prediction, Y_train) 97 | 98 | optimizer.zero_grad() 99 | train_loss.backward() 100 | optimizer.step() 101 | 102 | plt.figure(figsize=(5, 3)) 103 | plt.grid() 104 | 105 | rnn.eval() 106 | with pdn.no_grad(): 107 | test_loss = criterion(rnn(X_test, h_state), Y_test) 108 | 109 | plt.plot(visual_steps[TIME_STEP:], 110 | visual_Y.numpy(), 111 | 'r-', 112 | lw=0.7, 113 | label=r'$f(x)=\sin(\pi x)+\cos(2\pi x)/2$') 114 | plt.plot( 115 | visual_steps[TIME_STEP:], 116 | rnn(visual_X, h_state).numpy(), 117 | 'b-.', 118 | lw=0.7, 119 | label='Prediction', 120 | ) 121 | 122 | plt.xticks([4, 6, 8, 10]) 123 | plt.yticks([-1.6, -.8, 0, .8]) 124 | 125 | plt.legend(loc=1) 126 | plt.ylim(-1.6, 0.8) 127 | plt.xlim(visual_steps[TIME_STEP], 10) 128 | plt.title('Prediction with GRU') 129 | plt.tight_layout() 130 | plt.savefig("imgs/rnn.png") 131 | plt.close() 132 | 133 | bar.set_postfix( 134 | train_loss="{:.5f}".format(train_loss.item()), 135 | test_loss="{:.5f}".format(test_loss.item()), 136 | ) 137 | -------------------------------------------------------------------------------- /pydynet/nn/modules/module.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | from ..parameter import Parameter 4 | from ...core import Tensor 5 | from ...autograd import set_grad_enabled 6 | from ...cuda import Device, current_device 7 | 8 | 9 | class Module: 10 | 11 | def __init__(self) -> None: 12 | self._train = True 13 | self.device = Device("cpu") 14 | self._parameters = OrderedDict() 15 | 16 | def __call__(self, *x) -> Tensor: 17 | return self.forward(*x) 18 | 19 | def __setattr__(self, __name: str, __value) -> None: 20 | self.__dict__[__name] = __value 21 | if isinstance(__value, Parameter): 22 | self._parameters[__name] = __value 23 | if isinstance(__value, Module): 24 | for key in __value._parameters: 25 | self._parameters[__name + "." + key] = __value._parameters[key] 26 | 27 | def __repr__(self) -> str: 28 | module_list = [ 29 | module for module in self.__dict__.items() 30 | if isinstance(module[1], Module) 31 | ] 32 | return "{}(\n{}\n)".format( 33 | self.__class__.__name__, 34 | "\n".join([ 35 | "{:>10} : {}".format(module_name, module) 36 | for module_name, module in module_list 37 | ]), 38 | ) 39 | 40 | def parameters(self): 41 | for param in self._parameters.values(): 42 | if param.requires_grad: 43 | yield param 44 | 45 | def train(self, mode: bool = True): 46 | set_grad_enabled(mode) 47 | self.set_module_state(mode) 48 | 49 | def set_module_state(self, mode: bool): 50 | self._train = mode 51 | for module in self.__dict__.values(): 52 | if isinstance(module, Module): 53 | module.set_module_state(mode) 54 | 55 | def forward(self, x: Tensor) -> Tensor: 56 | raise NotImplementedError 57 | 58 | def eval(self): 59 | return self.train(False) 60 | 61 | def to(self, device): 62 | if not isinstance(device, Device): 63 | device = Device(device) 64 | if self.device != device: 65 | self.move(device) 66 | return self 67 | 68 | def move(self, device): 69 | self.device = device 70 | for module in self.__dict__.values(): 71 | if isinstance(module, Module): 72 | module.move(device) 73 | if isinstance(module, Parameter): 74 | module.to(device) 75 | 76 | def cuda(self): 77 | return self.to(current_device()) 78 | 79 | def cpu(self): 80 | return self.to('cpu') 81 | 82 | 83 | class Sequential(Module): 84 | 85 | def __init__(self, *args) -> None: 86 | super().__init__() 87 | self.module_list = [] 88 | if len(args) == 1 and isinstance(args[0], OrderedDict): 89 | for name, module in args[0].items(): 90 | self.__setattr__(name, module) 91 | self.module_list.append(module) 92 | else: 93 | for idx, module in enumerate(args): 94 | self.__setattr__(str(idx), module) 95 | self.module_list.append(module) 96 | 97 | def forward(self, x: Tensor) -> Tensor: 98 | for module in self.module_list: 99 | x = module(x) 100 | return x 101 | 102 | def __len__(self): 103 | return len(self.module_list) 104 | 105 | 106 | class ModuleList(Module): 107 | 108 | def __init__(self, module_list: list) -> None: 109 | super().__init__() 110 | self.module_list = module_list 111 | 112 | for idx, module in enumerate(module_list): 113 | self.__setattr__(str(idx), module) 114 | 115 | def __getitem__(self, index): 116 | return self.module_list[index] 117 | 118 | def __len__(self): 119 | return len(self.module_list) 120 | 121 | def append(self, module): 122 | self.module_list.append(module) 123 | self.__setattr__(str(len(self.module_list) - 1), module) 124 | 125 | def index(self, module): 126 | return self.module_list.index(module) 127 | -------------------------------------------------------------------------------- /tests/test_tensor_basic.py: -------------------------------------------------------------------------------- 1 | import sys, pytest, random 2 | import numpy as np 3 | from itertools import product 4 | 5 | sys.path.append('../pydynet') 6 | 7 | import pydynet as pdn 8 | 9 | np.random.seed(0) 10 | random.seed(0) 11 | 12 | type_list = [np.float16, np.float32, np.float64] 13 | 14 | 15 | def matmul_shape_pair(max_dim=4, max_size=5): 16 | ndim = random.randint(0, max_dim) 17 | 18 | shape1 = [] 19 | shape2 = [] 20 | for _ in range(ndim): 21 | if random.random() < 0.5: 22 | # 50% 概率设置为 1, 确保广播可能 23 | s1, s2 = random.choice([(1, random.randint(1, max_size)), 24 | (random.randint(1, max_size), 1)]) 25 | else: 26 | # 否则两边相同 27 | val = random.randint(1, max_size) 28 | s1, s2 = val, val 29 | shape1.append(s1) 30 | shape2.append(s2) 31 | shape1, shape2 = tuple(shape1), tuple(shape2) 32 | 33 | m = random.randint(1, max_size) 34 | n = random.randint(1, max_size) 35 | p = random.randint(1, max_size) 36 | 37 | shape1 = shape1 + (m, n) 38 | shape2 = shape2 + (n, p) 39 | 40 | shape1 = shape1[random.randint(0, len(shape1) - 2):] 41 | 42 | return shape1, shape2 43 | 44 | 45 | def broadcastable_shape_pair(max_dim=4, max_size=5): 46 | ndim = random.randint(0, max_dim) # 随机维数 47 | shape1 = [] 48 | shape2 = [] 49 | for _ in range(ndim): 50 | if random.random() < 0.5: 51 | # 50% 概率设置为 1, 确保广播可能 52 | s1, s2 = random.choice([(1, random.randint(1, max_size)), 53 | (random.randint(1, max_size), 1)]) 54 | else: 55 | # 否则两边相同 56 | val = random.randint(1, max_size) 57 | s1, s2 = val, val 58 | shape1.append(s1) 59 | shape2.append(s2) 60 | shape1, shape2 = tuple(shape1), tuple(shape2) 61 | 62 | # 随机缺失维度 63 | shape1 = shape1[random.randint(0, len(shape1)):] 64 | return shape1, shape2 65 | 66 | 67 | def array_pair_generator(pair_gen_func, 68 | max_dim=4, 69 | max_size=5, 70 | n_iter=4, 71 | seed=None): 72 | rng = np.random.default_rng(seed) 73 | count = 0 74 | while n_iter is None or count < n_iter: 75 | shape1, shape2 = pair_gen_func(max_dim, max_size) 76 | a = rng.standard_normal(size=shape1).astype(rng.choice(type_list)) 77 | b = rng.standard_normal(size=shape2).astype(rng.choice(type_list)) 78 | yield a, b 79 | count += 1 80 | 81 | 82 | test_list = array_pair_generator(broadcastable_shape_pair, 4, 5, 8, seed=42) 83 | func_list = [(pdn.add, np.add), (pdn.sub, np.subtract), (pdn.mul, np.multiply), 84 | (pdn.div, np.divide), (pdn.pow, np.power), 85 | (pdn.maximum, np.maximum), (pdn.minimum, np.minimum)] 86 | test_list = [(*array, *funcs) 87 | for (array, funcs) in product(test_list, func_list)] 88 | 89 | 90 | @pytest.mark.parametrize("operand1, operand2, pdn_func, np_func", test_list) 91 | @pytest.mark.filterwarnings("ignore:invalid value") 92 | @pytest.mark.filterwarnings("ignore:divide by zero") 93 | def test_binary_operator(operand1: np.ndarray, operand2: np.ndarray, 94 | pdn_func: callable, np_func: callable): 95 | pdn_operand1, pdn_operand2 = pdn.Tensor(operand1), pdn.Tensor(operand2) 96 | pdn_output: pdn.Tensor = pdn_func(pdn_operand1, pdn_operand2) 97 | np_output: np.ndarray = np_func(operand1, operand2) 98 | assert pdn_output.shape == np_output.shape 99 | assert pdn_output.dtype == np_output.dtype 100 | assert np.allclose(pdn_output.data, np_output, equal_nan=True) 101 | 102 | 103 | test_list = array_pair_generator(matmul_shape_pair, 4, 5, 8, seed=42) 104 | 105 | 106 | @pytest.mark.parametrize("operand1, operand2", test_list) 107 | def test_matmul(operand1: np.ndarray, operand2: np.ndarray): 108 | pdn_operand1, pdn_operand2 = pdn.Tensor(operand1), pdn.Tensor(operand2) 109 | pdn_output: pdn.Tensor = pdn.matmul(pdn_operand1, pdn_operand2) 110 | np_output: np.ndarray = np.matmul(operand1, operand2) 111 | assert pdn_output.shape == np_output.shape 112 | assert pdn_output.dtype == np_output.dtype 113 | assert np.allclose(pdn_output.data, np_output, equal_nan=True) 114 | 115 | -------------------------------------------------------------------------------- /cnREADME.md: -------------------------------------------------------------------------------- 1 | # PyDyNet:NumPy-based Dynamic Deep Learning Framework 2 | 3 | **PyDyNet已被多个技术公众号和社区分享**:[居然用Numpy实现了一个深度学习框架](https://segmentfault.com/a/1190000042108301). 4 | 5 | [![Downloads](https://pepy.tech/badge/pydynet)](https://pepy.tech/project/pydynet) 6 | [![Downloads](https://static.pepy.tech/personalized-badge/pydynet?period=month&units=international_system&left_color=grey&right_color=orange&left_text=downloads/month)](https://pepy.tech/project/pydynet) 7 | ![x](https://img.shields.io/pypi/l/pydynet) 8 | ![x](https://img.shields.io/pypi/implementation/numpy) 9 | ![x](https://img.shields.io/github/stars/Kaslanarian/PyDyNet?style=social) 10 | ![x](https://img.shields.io/github/forks/Kaslanarian/PyDyNet?style=social) 11 | 12 | ## Towards Large Language Model 13 | 14 | **2025.8.12**: 实现了纯推理的llama3 (6-layer Transformer, vocab-size=32000). 参考了[这里](https://github.com/likejazz/llama3.np)的NumPy实现和数据集. 将数据集下载到`llama`文件夹即可运行: 15 | 16 | ```bash 17 | >>> python -m llama.infer 18 | There was a boy named Timmy. He loved to play with hi toy and run around outside. One day, Timmy' mom asked him to help her with the laundry. Timmy didn't want to help because he wanted to play. But hi mom said, "Timmy, you need to help me. It' important to help out." 19 | Timmy didn't want to help, but he knew he had to. So, he put on hi shoe and went outside to help hi mom. A they were folding the clothe, Timmy saw a big pile of laundry on the floor. He wanted to help, so he started to pick it up. But then, he accidentally knocked over a pile of clothe and they fell on him. Timmy wa okay, but he felt bad. 20 | Hi mom saw what happened and said, "Timmy, you need to be more careful. You could have hurt yourself." Timmy felt bad and said sorry. Hi mom hugged him and said, "It' okay, accident happen. Let' clean up the laundry together." Timmy learned that it' important to be careful and help out when you need it. 21 | 22 | Token count: 262, elapsed: 0.87s, 300 tokens/s 23 | ``` 24 | 25 | ## Overview 26 | 27 | PyDyNet也是纯NumPy(0.0.7版本后加入CuPy,其用法和NumPy一致)实现的神经网络,语法受PyTorch的启发,大致结构如下: 28 | 29 | ```mermaid 30 | graph LR 31 | N(numpy/cupy.ndarray)--Backend--> A(Tensor) --> ds(Dataset) ---> Data(DataLoader)---> Mission 32 | A --Eager execution--> B(Basic operators:
add, exp, etc) 33 | B -.Autograd-.-> A 34 | 35 | B --> CO(Complex
operators) 36 | --> f(Function:
img2col, etc) 37 | --> M(Basic Module:
Linear, etc) 38 | --> CM(Advanced Module: CNN, RNN, Transformer, etc) 39 | --> Mission(Learning task) 40 | A --> GD(Optimizer:
SGD, Adam, etc) ---> LS(lr_scheduler:
StepLR, etc)---> Mission 41 | ``` 42 | 43 | 虚线表示用户可以通过`no_grad`来关闭自动微分功能. 44 | 45 | ## Install 46 | 47 | ```bash 48 | git clone https://github.com/Kaslanarian/PyDyNet 49 | cd PyDyNet 50 | python setup.py install 51 | ``` 52 | 53 | ## Example 54 | 55 | [examples/pydynet](./examples/pydynet)中是一些例子,[examples/pytorch](./examples/pytorch)给出等价的pytorch实现. 运行`python examples.pydynet.xxx`即可: 56 | 57 | ### AutoDiff 58 | 59 | [autodiff1d.py](examples/pydynet/autodiff1d.py)利用自动微分,对一个一维凸函数进行梯度下降: 60 | 61 | ad1 62 | 63 | 以及一个多元凸函数的例子: [autodiff2d.py](examples/pydynet/autodiff2d.py) 64 | 65 | ad2 66 | 67 | ### MLP & LeNet 68 | 69 | [mlp_cnn.py](examples/pydynet/mnist.py)使用MLP和LeNet对MNIST进行分类. 训练准确率和测试准确率: 70 | 71 | dnn 72 | 73 | ### Dropout & BN 74 | 75 | [mlp_dropout_bn.py](examples/pydynet/dropout_bn.py)使用三种网络对`fetch_olivetti_faces`人脸(64×64)数据集进行分类并进行性能对比: 76 | 77 | 1. 三层MLP; 78 | 2. 三层MLP + Dropout; 79 | 3. 三层MLP + BatchNormalization. 80 | 81 | 学习效果对比: 82 | 83 | cnn 84 | 85 | ### RNN 86 | 87 | [ts_prediction](examples/pydynet/ts_prediction.py)中是一个用GRU做时序预测例子: 88 | 89 | RNN 90 | 91 | ### Transformer 92 | 93 | [transformer.py](examples/pydynet/transformer.py)中是一个用Transformer训练文本分类模型的例子. 训练结果: 94 | 95 | transformer 96 | 97 | > 数据集 (CoLA) 链接: 98 | 99 | ## cuda加速 100 | 101 | 在训练batch size为256, 测试batch size为1024情况下,模型在CPU和GPU上的训练速度比较: 102 | 103 | | Network structure | Dataset | CPU time (s) per epoch | GPU time (s) per epoch | 104 | | :-----------------: | :---------------: | :--------------------: | :--------------------: | 105 | | 3-layer MLP | MNIST (80000×574) | 7.256±0.138 | 1.203±.0181 | 106 | | LeNet | MNIST (80000×574) | 239.664±2.108 | 2.841±0.026 | 107 | | 1-layer Transformer (dim=512, head=4) | CoLA (8551×45×64) | 17.503±0.251 | 1.075±0.002 | 108 | 109 | 设备: Nvidia GeForce RTX 4090. 110 | -------------------------------------------------------------------------------- /pydynet/optim/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | '''学习率调节器类,我们目前实现了\n 2 | - ExponentialLR;\n 3 | - StepLR;\n 4 | - MultiStepLR;\n 5 | - CosineAnnealingLR.\n 6 | ''' 7 | 8 | from typing import List 9 | from .optimizer import Optimizer 10 | import weakref 11 | from functools import wraps 12 | from collections import Counter 13 | from math import cos, pi 14 | 15 | 16 | class _LRScheduler: 17 | def __init__(self, optimizer: Optimizer, last_epoch: int = -1) -> None: 18 | self.optimizer = optimizer 19 | self.last_epoch = last_epoch 20 | 21 | if self.last_epoch == -1: 22 | self.optimizer.initial_lr = self.optimizer.lr 23 | else: 24 | assert hasattr( 25 | self.optimizer, "initial_lr" 26 | ), "last_epoch=1 but no 'initial_lr' attribute in optimizer!" 27 | 28 | def with_counter(method): 29 | if getattr(method, '_with_counter', False): 30 | # `optimizer.step()` has already been replaced, return. 31 | return method 32 | 33 | # 建立一个method的弱引用。弱引用不增加对象的引用计数,只存在弱引用的对象是可被垃圾回收的; 34 | # 弱引用可以解决循环引用的问题。 35 | instance_ref = weakref.ref(method.__self__) 36 | # Get the unbound method for the same purpose. 37 | func = method.__func__ # __func__是method的底层实现,不跟具体的实例绑定 38 | cls = instance_ref().__class__ # method的所属类 39 | del method 40 | 41 | @wraps(func) 42 | def wrapper(*args, **kwargs): 43 | instance = instance_ref() 44 | instance._step_count += 1 45 | wrapped = func.__get__(instance, cls) 46 | return wrapped(*args, **kwargs) 47 | 48 | # Note that the returned function here is no longer a bound method, 49 | # so attributes like `__func__` and `__self__` no longer exist. 50 | wrapper._with_counter = True 51 | return wrapper 52 | 53 | # 通过装饰器来为optimizer.step添加计数功能,并初始化计数器 54 | self.optimizer.step = with_counter(self.optimizer.step) 55 | self.optimizer._step_count = 0 56 | self._step_count = 0 57 | 58 | self.step() 59 | 60 | def step(self): 61 | self._step_count += 1 # lr_scheduler的step计数 62 | 63 | # 支持上下文管理器协议的类 64 | class _enable_get_lr_call: 65 | def __init__(self, o): 66 | self.o = o 67 | 68 | def __enter__(self): 69 | self.o._get_lr_called_within_step = True 70 | return self 71 | 72 | def __exit__(self, type, value, traceback): 73 | self.o._get_lr_called_within_step = False 74 | 75 | with _enable_get_lr_call(self): 76 | self.last_epoch += 1 # 更新epoch 77 | lr = self.get_lr() # 计算新的lr,与具体的lr_scheduler类型有关 78 | 79 | # _last_lr记录上一轮次更新的lr值 80 | self._last_lr = self.optimizer.lr 81 | self.optimizer.lr = lr 82 | 83 | def get_lr(self): 84 | raise NotImplementedError 85 | 86 | def get_last_lr(self): 87 | return self._last_lr 88 | 89 | 90 | class ExponentialLR(_LRScheduler): 91 | def __init__( 92 | self, 93 | optimizer: Optimizer, 94 | gamma: float = 0.1, 95 | last_epoch: int = -1, 96 | ) -> None: 97 | self.gamma = gamma 98 | super().__init__(optimizer, last_epoch) 99 | 100 | def get_lr(self): 101 | return self.optimizer.lr * self.gamma**self.last_epoch 102 | 103 | 104 | class StepLR(_LRScheduler): 105 | def __init__( 106 | self, 107 | optimizer: Optimizer, 108 | step_size: int, 109 | gamma=0.1, 110 | last_epoch: int = -1, 111 | ) -> None: 112 | self.step_size = step_size 113 | self.gamma = gamma 114 | super().__init__(optimizer, last_epoch) 115 | 116 | def get_lr(self): 117 | return self.optimizer.lr * self.gamma**(self.last_epoch // 118 | self.step_size) 119 | 120 | 121 | class MultiStepLR(_LRScheduler): 122 | def __init__( 123 | self, 124 | optimizer: Optimizer, 125 | milestones: List[int], 126 | gamma=0.1, 127 | last_epoch: int = -1, 128 | ) -> None: 129 | self.milestones = Counter(milestones) 130 | self.gamma = gamma 131 | super().__init__(optimizer, last_epoch) 132 | 133 | def get_lr(self): 134 | if self.last_epoch not in self.milestones: 135 | return self.optimizer.lr 136 | return self.optimizer.lr * self.gamma**self.milestones[self.last_epoch] 137 | 138 | 139 | class CosineAnnealingLR(_LRScheduler): 140 | def __init__( 141 | self, 142 | optimizer: Optimizer, 143 | T_max: int, 144 | eta_min: float = 0, 145 | last_epoch: int = -1, 146 | ) -> None: 147 | self.T_max = T_max 148 | self.eta_min = eta_min 149 | super().__init__(optimizer, last_epoch) 150 | 151 | def get_lr(self): 152 | base_lr = self.optimizer.initial_lr 153 | if self.last_epoch == 0: 154 | return base_lr 155 | elif (self.last_epoch - 1 - self.T_max) % (2 * self.T_max) == 0: 156 | return self.get_last_lr() + (base_lr - self.eta_min) * ( 157 | 1 - cos(pi / self.T_max)) / 2 158 | return (1 + cos(pi * self.last_epoch / self.T_max)) / ( 159 | 1 + cos(pi * (self.last_epoch - 1) / self.T_max)) * ( 160 | self.get_last_lr() - self.eta_min) + self.eta_min 161 | -------------------------------------------------------------------------------- /pydynet/optim/optimizer.py: -------------------------------------------------------------------------------- 1 | '''优化器类,我们目前实现了\n 2 | - SGD with momentum and Nestrov;\n 3 | - Adagrad;\n 4 | - Adadelta;\n 5 | - Adam.\n 6 | 7 | Reference 8 | --------- 9 | 论文: https://arxiv.org/abs/1609.04747;\n 10 | 博客: https://xingcy.net/2021/08/20/gd/. 11 | ''' 12 | 13 | from math import sqrt 14 | from ..core import Tensor 15 | 16 | 17 | class Optimizer: 18 | '''优化器基类''' 19 | 20 | def __init__(self, params: list[Tensor]) -> None: 21 | self.params: list[Tensor] = list(params) 22 | 23 | def step(self): 24 | raise NotImplementedError 25 | 26 | def zero_grad(self): 27 | '''针对self.params梯度清零.''' 28 | for param in self.params: 29 | param.zero_grad() 30 | 31 | 32 | class SGD(Optimizer): 33 | '''带动量的梯度下降 34 | 35 | Parameters 36 | ---------- 37 | params : list[Parameter] 38 | 待优化参数; 39 | lr : float 40 | 学习率; 41 | momentum : float 42 | 动量系数; 43 | weight_decay : float, default=0. 44 | 权重衰减系数. 45 | nesterov : bool, defallt=True. 46 | 是否采用Nesterov加速. 47 | ''' 48 | 49 | def __init__( 50 | self, 51 | params: list[Tensor], 52 | lr: float, 53 | momentum: float = .5, 54 | weight_decay: float = 0., 55 | nesterov=True, 56 | ) -> None: 57 | super().__init__(params) 58 | self.lr = lr 59 | self.momentum = momentum 60 | self.weight_decay = weight_decay 61 | self.nesterov = nesterov 62 | self.v = [ 63 | param.xp.zeros(param.shape, dtype=param.dtype) 64 | for param in self.params 65 | ] 66 | 67 | def step(self): 68 | for i in range(len(self.params)): 69 | with self.params[i].device: 70 | grad = self.params[i].grad + self.weight_decay * self.params[i].data 71 | self.v[i] *= self.momentum 72 | self.v[i] += self.lr * grad 73 | self.params[i].data -= self.v[i] 74 | if self.nesterov: 75 | self.params[i].data -= self.lr * grad 76 | 77 | 78 | class Adagrad(Optimizer): 79 | '''Adaptive Gradient Descent 80 | 81 | Parameters 82 | ---------- 83 | params : list[Parameter] 84 | 待优化参数; 85 | lr : float, default=1e-2. 86 | 学习率; 87 | weight_decay : float, default=0. 88 | 权重衰减系数. 89 | eps : float, default=1e-10 90 | epsilon. 91 | ''' 92 | 93 | def __init__( 94 | self, 95 | params: list[Tensor], 96 | lr: float = 1e-2, 97 | weight_decay: float = 0, 98 | eps: float = 1e-10, 99 | ) -> None: 100 | super().__init__(params) 101 | self.lr = lr 102 | self.weight_decay = weight_decay 103 | self.eps = eps 104 | self.G = [ 105 | param.xp.zeros(param.shape, dtype=param.dtype) 106 | for param in self.params 107 | ] 108 | 109 | def step(self): 110 | for i in range(len(self.params)): 111 | with self.params[i].device: 112 | grad = self.params[i].grad + self.weight_decay * self.params[i].data 113 | self.G[i] += grad**2 114 | self.params[i].data -= self.lr * grad / (self.eps + self.G[i])**0.5 115 | 116 | 117 | class Adadelta(Optimizer): 118 | ''' 119 | Adadelta优化器 120 | 121 | params : list[Parameter] 122 | 待优化参数; 123 | lr : float, default=1e-2. 124 | 学习率; 125 | rho :float, default= 126 | weight_decay : float, default=0. 127 | 权重衰减系数. 128 | eps : float, default=1e-10 129 | epsilon. 130 | ''' 131 | 132 | def __init__( 133 | self, 134 | params: list[Tensor], 135 | lr: float = 1.0, 136 | rho: float = 0.9, 137 | weight_decay: float = 0, 138 | eps: float = 1e-6, 139 | ) -> None: 140 | super().__init__(params) 141 | self.lr = lr 142 | self.rho = rho 143 | self.eps = eps 144 | self.eps = eps 145 | self.weight_decay = weight_decay 146 | self.G = [ 147 | param.xp.zeros(param.shape, dtype=param.dtype) 148 | for param in self.params 149 | ] 150 | 151 | def step(self): 152 | for i in range(len(self.params)): 153 | with self.params[i].device: 154 | grad = self.params[i].grad + self.weight_decay * self.params[i].data 155 | 156 | self.G[i] = self.rho * self.G[i] + (1 - self.rho) * grad**2 157 | self.params[i].data -= self.lr * grad / (self.G[i] + self.eps)**0.5 158 | 159 | 160 | class Adam(Optimizer): 161 | 162 | def __init__( 163 | self, 164 | params: list[Tensor], 165 | lr: float = 1e-3, 166 | betas: tuple[float] = (0.9, 0.999), 167 | eps: float = 1e-8, 168 | weight_decay: float = 0, 169 | ) -> None: 170 | super().__init__(params) 171 | self.lr = lr 172 | self.beta1, self.beta2 = betas 173 | self.eps = eps 174 | self.weight_decay = weight_decay 175 | self.m = [ 176 | param.xp.zeros(param.shape, dtype=param.dtype) 177 | for param in self.params 178 | ] 179 | self.v = [ 180 | param.xp.zeros(param.shape, dtype=param.dtype) 181 | for param in self.params 182 | ] 183 | self.t = 1 184 | 185 | def step(self): 186 | for i in range(len(self.params)): 187 | with self.params[i].device: 188 | grad = self.params[i].grad + self.weight_decay * self.params[i].data 189 | self.m[i] *= self.beta1 190 | self.m[i] += (1 - self.beta1) * grad 191 | self.v[i] *= self.beta2 192 | self.v[i] += (1 - self.beta2) * grad**2 193 | a_t = sqrt(1 - self.beta2**self.t) / (1 - self.beta1**self.t) 194 | self.params[i].data -= self.lr * a_t * self.m[i] / ( 195 | self.v[i]**0.5 + self.eps) 196 | self.t += 1 197 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyDyNet:NumPy-based Dynamic Deep Learning Framework 2 | 3 | Chinese README: [cnREADME.md](./cnREADME.md) 4 | 5 | [![Downloads](https://pepy.tech/badge/pydynet)](https://pepy.tech/project/pydynet) 6 | [![Downloads](https://static.pepy.tech/personalized-badge/pydynet?period=month&units=international_system&left_color=grey&right_color=orange&left_text=downloads/month)](https://pepy.tech/project/pydynet) 7 | ![x](https://img.shields.io/pypi/l/pydynet) 8 | ![x](https://img.shields.io/pypi/implementation/numpy) 9 | ![x](https://img.shields.io/github/stars/Kaslanarian/PyDyNet?style=social) 10 | ![x](https://img.shields.io/github/forks/Kaslanarian/PyDyNet?style=social) 11 | 12 | ## Towards Large Language Model 13 | 14 | **In the summer of 2025, I restart the development of PyDyNet after two years.** PyDyNet implemented a pure inference version of Llama3 (6-layer Transformer, vocab-size=32000). The implementation is inspired by the NumPy version and dataset available [here](https://github.com/likejazz/llama3.np). To run it, download the dataset into the `llm/llama` folder and execute: 15 | 16 | ```bash 17 | >>> python -m llm.llama.infer 18 | 19 | There was a boy named Timmy. He loved to play with hi toy and run around outside. One day, Timmy' mom asked him to help her with the laundry. Timmy didn't want to help because he wanted to play. But hi mom said, "Timmy, you need to help me. It' important to help out." 20 | Timmy didn't want to help, but he knew he had to. So, he put on hi shoe and went outside to help hi mom. A they were folding the clothe, Timmy saw a big pile of laundry on the floor. He wanted to help, so he started to pick it up. But then, he accidentally knocked over a pile of clothe and they fell on him. Timmy wa okay, but he felt bad. 21 | Hi mom saw what happened and said, "Timmy, you need to be more careful. You could have hurt yourself." Timmy felt bad and said sorry. Hi mom hugged him and said, "It' okay, accident happen. Let' clean up the laundry together." Timmy learned that it' important to be careful and help out when you need it. 22 | 23 | Token count: 262, elapsed: 0.87s, 300 tokens/s 24 | ``` 25 | 26 | We also implemented a pure inference version of CLIP, inspired by the NumPy version and dataset available [NPCLIP](https://github.com/99991/NPCLIP). To run it, imigrate `data` folder of `MPCLIP` into `llm/clip` folder and execute: 27 | 28 | ```bash 29 | >>> python -m llm.clip.infer 30 | Label probs: [0.000953 0.48176003 0.51728696] 31 | ``` 32 | 33 | for the following image and query ["a fish", "a dog", "a cat"] 34 | 35 | cat_dog 36 | 37 | ## Overview 38 | 39 | PyDyNet is a neural network framework implemented entirely in NumPy (with CuPy support since version 0.0.7, using the same API). Its syntax is inspired by PyTorch, and its structure is as follows: 40 | 41 | ```mermaid 42 | graph LR 43 | N(numpy/cupy.ndarray)--Backend--> A(Tensor) --> ds(Dataset) ---> Data(DataLoader)---> Mission 44 | A --Eager execution--> B(Basic operators:
add, exp, etc) 45 | B -.Autograd-.-> A 46 | 47 | B --> CO(Complex
operators) 48 | --> f(Function:
img2col, etc) 49 | --> M(Basic Module:
Linear, etc) 50 | --> CM(Advanced Module: CNN, RNN, Transformer, etc) 51 | --> Mission(Learning task) 52 | A --> GD(Optimizer:
SGD, Adam, etc) ---> LS(lr_scheduler:
StepLR, etc)---> Mission 53 | ``` 54 | 55 | Dashed lines indicate that users can disable automatic differentiation using `no_grad`. 56 | 57 | ## Install 58 | 59 | Just 60 | 61 | ```bash 62 | pip install pydynet 63 | ``` 64 | 65 | or 66 | 67 | ```bash 68 | git clone https://github.com/Kaslanarian/PyDyNet 69 | cd PyDyNet 70 | python setup.py install 71 | ``` 72 | 73 | ## Example 74 | 75 | Examples can be found in the [examples/pydynet](./examples/pydynet) directory, with equivalent PyTorch implementations in [examples/pytorch](./examples/pytorch). To run an example, use: 76 | 77 | ```bash 78 | python -m examples.pydynet.xxx 79 | ``` 80 | 81 | ### Automatic Differentiation 82 | 83 | The example [autodiff1d.py](examples/pydynet/autodiff1d.py) demonstrates automatic differentiation by performing gradient descent on a one-dimensional convex function: 84 | 85 | ad1 86 | 87 | A multi-variable convex function example is provided in [autodiff2d.py](examples/pydynet/autodiff2d.py): 88 | 89 | ad2 90 | 91 | ### MLP & LeNet 92 | 93 | The example [mlp_cnn.py](examples/pydynet/mnist.py) uses MLP and LeNet to classify MNIST digits. The training and testing accuracies are shown below: 94 | 95 | dnn 96 | 97 | ### Dropout & Batch Normalization 98 | 99 | The example [mlp_dropout_bn.py](examples/pydynet/dropout_bn.py) compares the performance of three networks on the `fetch_olivetti_faces` dataset (64×64 pixel images): 100 | 101 | 1. Three-layer MLP; 102 | 2. Three-layer MLP with Dropout; 103 | 3. Three-layer MLP with Batch Normalization. 104 | 105 | cnn 106 | 107 | ### Recurrent Neural Network (RNN) 108 | 109 | The example [ts_prediction.py](examples/pydynet/ts_prediction.py) demonstrates time series prediction using a GRU: 110 | 111 | RNN 112 | 113 | ### Transformer 114 | 115 | The example [transformer.py](examples/pydynet/transformer.py) shows how to train a text classification model using a Transformer. The training results are as follows: 116 | 117 | transformer 118 | 119 | > Dataset (CoLA) link: 120 | 121 | ## Cuda Acceleration 122 | 123 | PyDyNet supports CUDA acceleration through CuPy. To use it, simply install CuPy and use the same API as NumPy. We compare the performance of PyDyNet with CuPy and NumPy as follows on **Nvidia GeForce RTX 4090**: 124 | 125 | | Network structure | Dataset | CPU time (s) per epoch | GPU time (s) per epoch | 126 | | :-----------------: | :---------------: | :--------------------: | :--------------------: | 127 | | 3-layer MLP | MNIST (80000×574) | 7.256±0.138 | 1.203±.0181 | 128 | | LeNet | MNIST (80000×574) | 239.664±2.108 | 2.841±0.026 | 129 | | 1-layer Transformer (dim=512, head=4) | CoLA (8551×45×64) | 17.503±0.251 | 1.075±0.002 | 130 | -------------------------------------------------------------------------------- /llm/clip/tokenizer.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import html 3 | import os 4 | import re 5 | import typing 6 | from functools import lru_cache 7 | 8 | 9 | @lru_cache() 10 | def default_bpe() -> str: 11 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), 12 | "data/bpe_simple_vocab_16e6.txt.gz") 13 | 14 | 15 | @lru_cache() 16 | def bytes_to_unicode() -> typing.Dict[int, str]: 17 | """ 18 | Returns list of utf-8 byte and a corresponding list of unicode strings. 19 | The reversible bpe codes work on unicode strings. 20 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 21 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 22 | This is a signficant percentage of your normal, say, 32K bpe vocab. 23 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 24 | And avoids mapping to whitespace/control characters the bpe code barfs on. 25 | """ 26 | byte_ints = list(range(ord("!"), 27 | ord("~") + 1)) + list(range(ord("¡"), 28 | ord("¬") + 1)) + list( 29 | range( 30 | ord("®"), 31 | ord("ÿ") + 1)) 32 | char_ints = byte_ints[:] 33 | n = 0 34 | for b in range(2**8): 35 | if b not in byte_ints: 36 | byte_ints.append(b) 37 | char_ints.append(2**8 + n) 38 | n += 1 39 | chars = [chr(n) for n in char_ints] 40 | return dict(zip(byte_ints, chars)) 41 | 42 | 43 | def get_pairs( 44 | word: typing.Tuple[str, ...]) -> typing.Set[typing.Tuple[str, str]]: 45 | """Return set of symbol pairs in a word. 46 | Word is represented as tuple of symbols (symbols being variable-length strings). 47 | """ 48 | pairs = set() 49 | prev_char = word[0] 50 | for char in word[1:]: 51 | pairs.add((prev_char, char)) 52 | prev_char = char 53 | return pairs 54 | 55 | 56 | def basic_clean(text: str) -> str: 57 | import ftfy 58 | 59 | text = ftfy.fix_text(text) 60 | text = html.unescape(html.unescape(text)) 61 | return text.strip() 62 | 63 | 64 | def whitespace_clean(text: str) -> str: 65 | text = re.sub(r"\s+", " ", text) 66 | text = text.strip() 67 | return text 68 | 69 | 70 | def read_text(path: str) -> str: 71 | with open(path, "r", encoding="utf-8") as f: 72 | return f.read() 73 | 74 | 75 | class SimpleTokenizer(object): 76 | 77 | def __init__(self, bpe_path: str = default_bpe()) -> None: 78 | self.byte_encoder = bytes_to_unicode() 79 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} 80 | with gzip.open(bpe_path) as f: 81 | lines = f.read().decode("utf-8").split("\n") 82 | lines = lines[1:49152 - 256 - 2 + 1] 83 | merges = [tuple(line.split()) for line in lines] 84 | vocab = list(bytes_to_unicode().values()) 85 | vocab = vocab + [v + "" for v in vocab] 86 | for merge in merges: 87 | vocab.append("".join(merge)) 88 | vocab.extend(["<|startoftext|>", "<|endoftext|>"]) 89 | self.encoder: typing.Dict[str, 90 | int] = dict(zip(vocab, range(len(vocab)))) 91 | self.decoder = {v: k for k, v in self.encoder.items()} 92 | self.bpe_ranks = dict(zip(merges, range(len(merges)))) 93 | self.cache = { 94 | "<|startoftext|>": "<|startoftext|>", 95 | "<|endoftext|>": "<|endoftext|>" 96 | } 97 | pattern = r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""" 98 | pattern = pattern.replace(r"\p{N}", read_text("llm/clip/data/pN.txt")) 99 | pattern = pattern.replace(r"\p{L}", read_text("llm/clip/data/pL.txt")) 100 | self.pat = re.compile(pattern, re.IGNORECASE) 101 | 102 | def bpe(self, token: str) -> str: 103 | if token in self.cache: 104 | return self.cache[token] 105 | word = tuple(token[:-1]) + (token[-1] + "", ) 106 | pairs = get_pairs(word) 107 | 108 | if not pairs: 109 | return token + "" 110 | 111 | while True: 112 | bigram = min( 113 | pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) 114 | if bigram not in self.bpe_ranks: 115 | break 116 | first, second = bigram 117 | new_word: typing.List[str] = [] 118 | i = 0 119 | while i < len(word): 120 | try: 121 | j = word.index(first, i) 122 | new_word.extend(word[i:j]) 123 | i = j 124 | except: 125 | new_word.extend(word[i:]) 126 | break 127 | 128 | if word[i] == first and i < len(word) - 1 and word[ 129 | i + 1] == second: 130 | new_word.append(first + second) 131 | i += 2 132 | else: 133 | new_word.append(word[i]) 134 | i += 1 135 | word = tuple(new_word) 136 | if len(word) == 1: 137 | break 138 | else: 139 | pairs = get_pairs(word) 140 | joined_word = " ".join(word) 141 | self.cache[token] = joined_word 142 | return joined_word 143 | 144 | def encode(self, 145 | text: str, 146 | basic_cleaning: bool = False) -> typing.List[int]: 147 | bpe_tokens: typing.List[int] = [] 148 | if basic_cleaning: 149 | text = basic_clean(text) 150 | text = whitespace_clean(text).lower() 151 | for token in re.findall(self.pat, text): 152 | token = "".join(self.byte_encoder[b] 153 | for b in token.encode("utf-8")) 154 | bpe_tokens.extend(self.encoder[bpe_token] 155 | for bpe_token in self.bpe(token).split(" ")) 156 | return bpe_tokens 157 | 158 | def decode(self, tokens: typing.Iterable[int]) -> str: 159 | text = "".join([self.decoder[token] for token in tokens]) 160 | text = bytearray([self.byte_decoder[c] for c in text 161 | ]).decode("utf-8", 162 | errors="replace").replace("", " ") 163 | return text 164 | -------------------------------------------------------------------------------- /examples/pytorch/mnist.py: -------------------------------------------------------------------------------- 1 | import gzip, argparse 2 | from os.path import join 3 | from tqdm import tqdm 4 | 5 | import numpy as np 6 | import torch 7 | from torch import nn 8 | import torch.nn.functional as F 9 | from torch.optim import Adam 10 | from pydynet.data import data_loader 11 | 12 | 13 | class MNISTDataset: 14 | 15 | def __init__(self, root) -> None: 16 | self.root = root 17 | self.train_images_path = join(root, 'train-images-idx3-ubyte.gz') 18 | self.train_labels_path = join(root, 'train-labels-idx1-ubyte.gz') 19 | self.test_images_path = join(root, 't10k-images-idx3-ubyte.gz') 20 | self.test_labels_path = join(root, 't10k-labels-idx1-ubyte.gz') 21 | 22 | def load_train(self): 23 | return ( 24 | MNISTDataset.load_mnist_images(self.train_images_path), 25 | MNISTDataset.load_mnist_labels(self.train_labels_path), 26 | ) 27 | 28 | def load_test(self): 29 | return ( 30 | MNISTDataset.load_mnist_images(self.test_images_path), 31 | MNISTDataset.load_mnist_labels(self.test_labels_path), 32 | ) 33 | 34 | @staticmethod 35 | def load_mnist_images(file_path): 36 | with gzip.open(file_path, 'r') as f: 37 | # Skip the magic number and dimensions (4 bytes magic number + 4 bytes each for dimensions) 38 | f.read(16) 39 | # Read the rest of the file 40 | buffer = f.read() 41 | data = np.frombuffer(buffer, dtype=np.uint8).astype(np.float32) 42 | # Normalize the data to be in the range [0, 1] 43 | data = data / 255.0 44 | # Reshape the data to be in the shape (number_of_images, 28, 28) 45 | data = data.reshape(-1, 1, 28, 28) 46 | return torch.tensor(data) 47 | 48 | @staticmethod 49 | def load_mnist_labels(file_path): 50 | with gzip.open(file_path, 'r') as f: 51 | # Skip the magic number and number of items (4 bytes magic number + 4 bytes number of items) 52 | f.read(8) 53 | # Read the rest of the file 54 | buffer = f.read() 55 | labels = np.frombuffer(buffer, dtype=np.uint8) 56 | return torch.tensor(labels, dtype=int) 57 | 58 | 59 | class Flatten(nn.Module): 60 | 61 | def forward(self, x): # for batch only 62 | return x.reshape(x.shape[0], -1) 63 | 64 | 65 | class MLP(nn.Module): 66 | 67 | def __init__(self) -> None: 68 | super().__init__() 69 | self.layer1 = nn.Sequential( 70 | Flatten(), 71 | nn.Linear(28 * 28, 1024), 72 | ) 73 | self.layer2 = nn.Linear(1024, 1024) 74 | self.layer3 = nn.Linear(1024, 10) 75 | 76 | def forward(self, x): 77 | z1 = F.relu(self.layer1(x)) 78 | z2 = F.relu(self.layer2(z1)) 79 | return self.layer3(z2) 80 | 81 | 82 | class ConvNet(nn.Module): 83 | 84 | def __init__(self): 85 | super().__init__() 86 | self.conv1 = nn.Conv2d(1, 20, 3, 1, 1) 87 | self.conv2 = nn.Conv2d(20, 50, 3, 1, 1) 88 | self.fc1 = nn.Linear(7 * 7 * 50, 500) 89 | self.fc2 = nn.Linear(500, 10) 90 | 91 | def forward(self, x): 92 | x = F.relu(self.conv1(x)) 93 | x = F.max_pool2d(x, 2, 2) 94 | x = F.relu(self.conv2(x)) 95 | x = F.max_pool2d(x, 2, 2) 96 | x = x.reshape(-1, 7 * 7 * 50) 97 | x = F.relu(self.fc1(x)) 98 | return self.fc2(x) 99 | 100 | 101 | parser = argparse.ArgumentParser() 102 | parser.add_argument("--network", 103 | help="Network structure", 104 | choices=['mlp', 'conv'], 105 | default='conv') 106 | parser.add_argument('--batch-size', 107 | type=int, 108 | default=256, 109 | help='input batch size for training (default: 256)') 110 | parser.add_argument('--test-batch-size', 111 | type=int, 112 | default=1024, 113 | metavar='N', 114 | help='input batch size for testing (default: 1024)') 115 | parser.add_argument('--epochs', 116 | type=int, 117 | default=20, 118 | help='number of epochs to train (default: 20)') 119 | parser.add_argument('--lr', 120 | type=float, 121 | default=1e-4, 122 | help='learning rate (default: 1e-4)') 123 | parser.add_argument('--no-cuda', 124 | action='store_true', 125 | default=False, 126 | help='disables CUDA training') 127 | parser.add_argument('--seed', 128 | type=int, 129 | default=42, 130 | help='random seed (default: 1)') 131 | args = parser.parse_args() 132 | 133 | torch.manual_seed(42) 134 | torch.cuda.manual_seed(42) 135 | 136 | device = f'cuda:{torch.cuda.device_count() - 1}' if torch.cuda.is_available( 137 | ) and not args.no_cuda else 'cpu' 138 | 139 | net = {'mlp': MLP(), 'conv': ConvNet()}.get(args.network).to(device) 140 | print(net) 141 | 142 | optimizer = Adam(net.parameters(), lr=args.lr) 143 | 144 | dataset = MNISTDataset(r'./examples/data/MNIST/raw') 145 | train_loader = data_loader( 146 | *dataset.load_train(), 147 | shuffle=True, 148 | batch_size=args.batch_size, 149 | ) 150 | test_loader = data_loader( 151 | *dataset.load_test(), 152 | shuffle=False, 153 | batch_size=args.test_batch_size, 154 | ) 155 | 156 | bar = tqdm(range(args.epochs)) 157 | info_list = [] 158 | for epoch in bar: 159 | 160 | net.train() 161 | 162 | for batch_X, batch_y in train_loader: 163 | input_, label = batch_X.to(device), batch_y.to(device) 164 | loss = F.cross_entropy(net(input_), label) 165 | optimizer.zero_grad() 166 | loss.backward() 167 | optimizer.step() 168 | 169 | net.eval() 170 | 171 | train_right, train_size = 0, 0 172 | test_right, test_size = 0, 0 173 | with torch.no_grad(): 174 | for batch_X, batch_y in train_loader: 175 | input_, label = batch_X.to(device), batch_y.to(device) 176 | pred: torch.Tensor = net(input_).argmax(-1) 177 | train_right += pred.eq(label).sum().item() 178 | train_size += batch_X.shape[0] 179 | 180 | for batch_X, batch_y in test_loader: 181 | input_, label = batch_X.to(device), batch_y.to(device) 182 | pred = net(input_).argmax(-1) 183 | test_right += pred.eq(label).sum().item() 184 | test_size += batch_X.shape[0] 185 | 186 | train_acc, test_acc = train_right / train_size, test_right / test_size 187 | bar.set_postfix(TEST_ACC="{:.4f}".format(test_acc), 188 | TRAIN_ACC="{:.4f}".format(train_acc), 189 | LOSS="{:.6f}".format(loss.item())) 190 | -------------------------------------------------------------------------------- /examples/pydynet/mnist.py: -------------------------------------------------------------------------------- 1 | import gzip, argparse 2 | from os.path import join 3 | from tqdm import tqdm 4 | 5 | import numpy as np 6 | import pydynet as pdn 7 | from pydynet import nn 8 | import pydynet.nn.functional as F 9 | from pydynet.optim import Adam 10 | from pydynet.data import data_loader 11 | 12 | 13 | class MNISTDataset: 14 | 15 | def __init__(self, root) -> None: 16 | self.root = root 17 | self.train_images_path = join(root, 'train-images-idx3-ubyte.gz') 18 | self.train_labels_path = join(root, 'train-labels-idx1-ubyte.gz') 19 | self.test_images_path = join(root, 't10k-images-idx3-ubyte.gz') 20 | self.test_labels_path = join(root, 't10k-labels-idx1-ubyte.gz') 21 | 22 | def load_train(self): 23 | return ( 24 | MNISTDataset.load_mnist_images(self.train_images_path), 25 | MNISTDataset.load_mnist_labels(self.train_labels_path), 26 | ) 27 | 28 | def load_test(self): 29 | return ( 30 | MNISTDataset.load_mnist_images(self.test_images_path), 31 | MNISTDataset.load_mnist_labels(self.test_labels_path), 32 | ) 33 | 34 | @staticmethod 35 | def load_mnist_images(file_path): 36 | with gzip.open(file_path, 'r') as f: 37 | # Skip the magic number and dimensions (4 bytes magic number + 4 bytes each for dimensions) 38 | f.read(16) 39 | # Read the rest of the file 40 | buffer = f.read() 41 | data = np.frombuffer(buffer, dtype=np.uint8) 42 | # Normalize the data to be in the range [0, 1] 43 | data = data / 255.0 44 | # Reshape the data to be in the shape (number_of_images, 28, 28) 45 | data = data.reshape(-1, 1, 28, 28) 46 | return pdn.Tensor(data).astype(DTYPE) 47 | 48 | @staticmethod 49 | def load_mnist_labels(file_path): 50 | with gzip.open(file_path, 'r') as f: 51 | # Skip the magic number and number of items (4 bytes magic number + 4 bytes number of items) 52 | f.read(8) 53 | # Read the rest of the file 54 | buffer = f.read() 55 | labels = np.frombuffer(buffer, dtype=np.uint8) 56 | return pdn.Tensor(labels, dtype=int) 57 | 58 | 59 | class Flatten(nn.Module): 60 | 61 | def forward(self, x): # for batch only 62 | return x.reshape(x.shape[0], -1) 63 | 64 | 65 | class MLP(nn.Module): 66 | 67 | def __init__(self) -> None: 68 | super().__init__() 69 | self.layer1 = nn.Sequential( 70 | Flatten(), 71 | nn.Linear(28 * 28, 1024, dtype=DTYPE), 72 | ) 73 | self.layer2 = nn.Linear(1024, 1024, dtype=DTYPE) 74 | self.layer3 = nn.Linear(1024, 10, dtype=DTYPE) 75 | 76 | def forward(self, x): 77 | z1 = F.relu(self.layer1(x)) 78 | z2 = F.relu(self.layer2(z1)) 79 | return self.layer3(z2) 80 | 81 | 82 | class ConvNet(nn.Module): 83 | 84 | def __init__(self): 85 | super().__init__() 86 | self.conv1 = nn.Conv2d(1, 20, 3, 1, 1, dtype=DTYPE) 87 | self.conv2 = nn.Conv2d(20, 50, 3, 1, 1, dtype=DTYPE) 88 | self.fc1 = nn.Linear(7 * 7 * 50, 500, dtype=DTYPE) 89 | self.fc2 = nn.Linear(500, 10, dtype=DTYPE) 90 | 91 | def forward(self, x): 92 | x = F.relu(self.conv1(x)) 93 | x = F.max_pool2d(x, 2, 2) 94 | x = F.relu(self.conv2(x)) 95 | x = F.max_pool2d(x, 2, 2) 96 | x = x.reshape(-1, 7 * 7 * 50) 97 | x = F.relu(self.fc1(x)) 98 | return self.fc2(x) 99 | 100 | 101 | parser = argparse.ArgumentParser() 102 | parser.add_argument("--network", 103 | help="Network structure", 104 | choices=['mlp', 'conv'], 105 | default='conv') 106 | parser.add_argument('--batch-size', 107 | type=int, 108 | default=256, 109 | help='input batch size for training (default: 256)') 110 | parser.add_argument('--test-batch-size', 111 | type=int, 112 | default=1024, 113 | metavar='N', 114 | help='input batch size for testing (default: 1024)') 115 | parser.add_argument('--epochs', 116 | type=int, 117 | default=20, 118 | help='number of epochs to train (default: 20)') 119 | parser.add_argument('--lr', 120 | type=float, 121 | default=1e-4, 122 | help='learning rate (default: 1e-4)') 123 | parser.add_argument('--no-cuda', 124 | action='store_true', 125 | default=False, 126 | help='disables CUDA training') 127 | parser.add_argument('--seed', 128 | type=int, 129 | default=42, 130 | help='random seed (default: 1)') 131 | args = parser.parse_args() 132 | 133 | DTYPE = np.float32 134 | np.random.seed(args.seed) 135 | device = f'cuda:{pdn.cuda.device_count() - 1}' if pdn.cuda.is_available( 136 | ) and not args.no_cuda else 'cpu' 137 | 138 | net = {'mlp': MLP(), 'conv': ConvNet()}.get(args.network).to(device) 139 | print(net) 140 | 141 | optimizer = Adam(net.parameters(), lr=args.lr) 142 | 143 | dataset = MNISTDataset(r'./examples/data/MNIST/raw') 144 | train_loader = data_loader( 145 | *dataset.load_train(), 146 | shuffle=True, 147 | batch_size=args.batch_size, 148 | ) 149 | test_loader = data_loader( 150 | *dataset.load_test(), 151 | shuffle=False, 152 | batch_size=args.test_batch_size, 153 | ) 154 | 155 | bar = tqdm(range(args.epochs)) 156 | info_list = [] 157 | for epoch in bar: 158 | 159 | net.train() 160 | 161 | for batch_X, batch_y in train_loader: 162 | input_, label = batch_X.to(device), batch_y.to(device) 163 | loss = F.cross_entropy_loss(net(input_), label) 164 | optimizer.zero_grad() 165 | loss.backward() 166 | optimizer.step() 167 | 168 | net.eval() 169 | 170 | train_right, train_size = 0, 0 171 | test_right, test_size = 0, 0 172 | with pdn.no_grad(): 173 | for batch_X, batch_y in train_loader: 174 | input_, label = batch_X.to(device), batch_y.to(device) 175 | pred: pdn.Tensor = net(input_).argmax(-1) 176 | train_right += pred.eq(label).sum().item() 177 | train_size += batch_X.shape[0] 178 | 179 | for batch_X, batch_y in test_loader: 180 | input_, label = batch_X.to(device), batch_y.to(device) 181 | pred = net(input_).argmax(-1) 182 | test_right += pred.eq(label).sum().item() 183 | test_size += batch_X.shape[0] 184 | 185 | train_acc, test_acc = train_right / train_size, test_right / test_size 186 | bar.set_postfix(TEST_ACC="{:.4f}".format(test_acc), 187 | TRAIN_ACC="{:.4f}".format(train_acc), 188 | LOSS="{:.6f}".format(loss.item())) 189 | -------------------------------------------------------------------------------- /llm/clip/model.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | import pydynet as pdn 5 | from pydynet import nn 6 | import pydynet.nn.functional as F 7 | 8 | 9 | def build_attention_mask(context_length: int): 10 | mask = np.full((context_length, context_length), 11 | fill_value=-np.inf, 12 | dtype=np.float32) 13 | mask = np.triu(mask, 1) 14 | return pdn.Tensor(mask, dtype=np.float32) 15 | 16 | 17 | def patch_project(x: pdn.Tensor, kernel: pdn.Tensor): 18 | # Decompose images into 32x32 patches and multiply all patches by matrix. 19 | 20 | n, c, h, w = x.shape 21 | d, pc, ph, pw = kernel.shape 22 | p = pc * ph * pw 23 | gh = h // ph 24 | gw = w // pw 25 | 26 | assert c == pc and h % ph == 0 and w % pw == 0 27 | 28 | W = kernel.transpose(1, 2, 3, 0).reshape(p, d) 29 | x = x.reshape(n, c, gh, ph, gw, pw).transpose(0, 2, 4, 1, 3, 30 | 5).reshape(n, gh, gw, p) 31 | x = x @ W 32 | return x.reshape(n, gh * gw, d) 33 | 34 | 35 | class MultiHeadAttention(nn.Module): 36 | 37 | def __init__(self, n_dim: int, n_heads: int): 38 | super().__init__() 39 | self.n_dim = n_dim 40 | self.n_heads = n_heads 41 | self.head_dim = n_dim // n_heads 42 | 43 | self.QKV = nn.Linear(self.n_dim, self.n_dim * 3, dtype=np.float32) 44 | self.O = nn.Linear(self.n_dim, self.n_dim, dtype=np.float32) 45 | 46 | def forward(self, x, mask): 47 | B, L, _ = x.shape 48 | xq, xk, xv = pdn.split(self.QKV(x), 3, -1) 49 | xq = xq.reshape(B, L, self.n_heads, self.head_dim) 50 | xk = xk.reshape(B, L, self.n_heads, self.head_dim) 51 | xv = xv.reshape(B, L, self.n_heads, self.head_dim) 52 | 53 | xq, xkT = xq.transpose(0, 2, 1, 3), xk.transpose(0, 2, 3, 1) 54 | attention = xq @ xkT / math.sqrt(self.head_dim) 55 | 56 | if mask is not None: 57 | attention = attention + mask 58 | 59 | attention = F.softmax(attention, axis=-1) 60 | output = attention @ xv.transpose(0, 2, 1, 3) 61 | 62 | output = output.transpose(0, 2, 1, 3).reshape(B, L, -1) 63 | return self.O(output) 64 | 65 | 66 | class CLIPLayerNorm(nn.LayerNorm): 67 | 68 | def __init__(self, 69 | normalized_shape, 70 | eps=0.000001, 71 | momentum=0.1, 72 | device=None, 73 | dtype=None): 74 | super().__init__(normalized_shape, eps, momentum, device, dtype) 75 | 76 | def forward(self, x): 77 | mean = x.mean(axis=-1, keepdims=True) 78 | var = pdn.square(x - mean).mean(axis=-1, keepdims=True) 79 | x = (x - mean) / pdn.sqrt(var + self.eps) * self.scale + self.shift 80 | return x 81 | 82 | 83 | class MLP(nn.Module): 84 | 85 | def __init__(self, d_in: int, d_proj: int): 86 | super().__init__() 87 | self.d_in = d_in 88 | self.d_proj = d_proj 89 | self.fc1 = nn.Linear(d_in, d_proj, dtype=np.float32) 90 | self.fc2 = nn.Linear(d_proj, d_in, dtype=np.float32) 91 | 92 | def forward(self, x): 93 | x = self.fc1(x) 94 | x = x * pdn.sigmoid(1.702 * x) 95 | return self.fc2(x) 96 | 97 | 98 | class Transformer(nn.Module): 99 | 100 | def __init__(self, n_dim: int, n_head: int, mlp_dim: int): 101 | super().__init__() 102 | self.mha = MultiHeadAttention(n_dim, n_head) 103 | self.mlp = MLP(n_dim, mlp_dim) 104 | self.layer_norm1 = CLIPLayerNorm((n_dim, ), eps=1e-5, dtype=np.float32) 105 | self.layer_norm2 = CLIPLayerNorm((n_dim, ), eps=1e-5, dtype=np.float32) 106 | 107 | def forward(self, x, mask): 108 | x = x + self.mha(self.layer_norm1(x), mask) 109 | x = x + self.mlp(self.layer_norm2(x)) 110 | return x 111 | 112 | 113 | class ImageEncoder(nn.Module): 114 | 115 | def __init__(self, n_dim, n_head, mlp_dim, kernel_size, n_layer, 116 | final_dim): 117 | super().__init__() 118 | self.kernel = nn.Parameter( 119 | pdn.randn(n_dim, 3, kernel_size, kernel_size, dtype=np.float32)) 120 | 121 | self.pre_norm = CLIPLayerNorm((n_dim, ), 1e-5, dtype=np.float32) 122 | self.transformers: list[Transformer] = nn.ModuleList( 123 | [Transformer(n_dim, n_head, mlp_dim) for _ in range(n_layer)]) 124 | 125 | self.post_norm = CLIPLayerNorm((n_dim, ), 1e-5, dtype=np.float32) 126 | self.proj = nn.Linear(n_dim, final_dim, bias=False, dtype=np.float32) 127 | 128 | def forward(self, x, class_emb, position_emb): 129 | x = patch_project(x, self.kernel) 130 | x = pdn.concat([class_emb, x], axis=-2) + position_emb 131 | 132 | x = self.pre_norm(x) 133 | for model in self.transformers: 134 | x = model(x, None) 135 | 136 | x = self.post_norm(x[:, 0]) 137 | return self.proj(x) 138 | 139 | 140 | class TextEncoder(nn.Module): 141 | 142 | def __init__(self, n_dim, n_head, mlp_dim, n_layer, final_dim, vocab_size): 143 | super().__init__() 144 | self.token_embed = nn.Embedding(vocab_size, n_dim, dtype=np.float32) 145 | self.transformers: list[Transformer] = nn.ModuleList( 146 | [Transformer(n_dim, n_head, mlp_dim) for _ in range(n_layer)]) 147 | 148 | self.post_norm = CLIPLayerNorm((n_dim, ), 1e-5, dtype=np.float32) 149 | self.proj = nn.Linear(n_dim, final_dim, bias=False, dtype=np.float32) 150 | 151 | def forward(self, idx, position_emb): 152 | x = self.token_embed(idx) + position_emb 153 | mask = build_attention_mask(x.shape[1]) 154 | 155 | for model in self.transformers: 156 | x = model(x, mask) 157 | 158 | x = self.post_norm(x) 159 | 160 | return self.proj(x[np.arange(x.shape[0]), x.xp.argmax(idx, axis=-1)]) 161 | 162 | 163 | class CLIP(nn.Module): 164 | 165 | def __init__(self): 166 | super().__init__() 167 | self.class_embed = nn.Parameter(pdn.randn(1, 1, 768, dtype=np.float32)) 168 | self.v_pos_emb = nn.Parameter(pdn.randn(50, 768, dtype=np.float32)) 169 | self.t_pos_emb = nn.Parameter(pdn.randn(77, 512, dtype=np.float32)) 170 | self.image_encoder = ImageEncoder(768, 12, 3072, 32, 12, 512) 171 | self.text_encoder = TextEncoder(512, 8, 2048, 12, 512, 49408) 172 | self.scale = 1 173 | 174 | def forward(self, img, idx): 175 | img_feature = self.image_encoder(img, self.class_embed, self.v_pos_emb) 176 | txt_feature = self.text_encoder(idx, self.t_pos_emb) 177 | 178 | norm_img = pdn.sqrt(pdn.square(img_feature).sum(1, keepdims=True)) 179 | norm_txt = pdn.sqrt(pdn.square(txt_feature).sum(1, keepdims=True)) 180 | 181 | img_feature = img_feature / norm_img 182 | txt_feature = txt_feature / norm_txt 183 | logits_per_image = self.scale * img_feature @ txt_feature.T 184 | return logits_per_image 185 | -------------------------------------------------------------------------------- /examples/pytorch/dropout_bn.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from sklearn.datasets import fetch_olivetti_faces 3 | from sklearn.preprocessing import MinMaxScaler 4 | from sklearn.model_selection import train_test_split 5 | from tqdm import tqdm 6 | 7 | import numpy as np 8 | import torch 9 | import torch.nn.functional as F 10 | import torch.nn as nn 11 | from torch.optim import Adam 12 | from pydynet.data import data_loader 13 | 14 | data_X, data_y = fetch_olivetti_faces(return_X_y=True) 15 | print(data_X.shape) 16 | train_X, test_X, train_y, test_y = train_test_split( 17 | data_X, 18 | data_y, 19 | train_size=0.8, 20 | stratify=data_y, 21 | random_state=42, 22 | ) 23 | scaler = MinMaxScaler() 24 | train_X = scaler.fit_transform(train_X) 25 | test_X = scaler.transform(test_X) 26 | 27 | 28 | class DNN(nn.Module): 29 | 30 | def __init__(self) -> None: 31 | super().__init__() 32 | self.fc1 = nn.Linear(4096, 512) 33 | self.fc2 = nn.Linear(512, 128) 34 | self.fc3 = nn.Linear(128, 40) 35 | 36 | def forward(self, x): 37 | x = F.relu(self.fc1(x)) 38 | x = F.relu(self.fc2(x)) 39 | return self.fc3(x) 40 | 41 | 42 | class DNN_dropout(DNN): 43 | 44 | def __init__(self) -> None: 45 | super().__init__() 46 | self.dropout = nn.Dropout(p=0.05) 47 | 48 | def forward(self, x): 49 | x = F.relu(self.dropout(self.fc1(x))) 50 | x = F.relu(self.dropout(self.fc2(x))) 51 | return self.fc3(x) 52 | 53 | 54 | class DNN_BN(DNN): 55 | 56 | def __init__(self) -> None: 57 | super().__init__() 58 | self.bn1 = nn.BatchNorm1d(512) 59 | self.bn2 = nn.BatchNorm1d(128) 60 | 61 | def forward(self, x): 62 | x = F.relu(self.bn1(self.fc1(x))) 63 | x = F.relu(self.bn2(self.fc2(x))) 64 | return self.fc3(x) 65 | 66 | 67 | np.random.seed(42) 68 | use_cuda = True 69 | device = f'cuda:{torch.cuda.device_count() - 1}' if torch.cuda.is_available( 70 | ) else 'cpu' 71 | 72 | net1 = DNN().to(device) 73 | net2 = DNN_dropout().to(device) 74 | net3 = DNN_BN().to(device) 75 | print(net1) 76 | print(net2) 77 | print(net3) 78 | optim1 = Adam(net1.parameters(), lr=5e-5) 79 | optim2 = Adam(net2.parameters(), lr=5e-5) 80 | optim3 = Adam(net3.parameters(), lr=5e-5) 81 | loss = nn.CrossEntropyLoss() 82 | EPOCHES = 50 83 | BATCH_SIZE = 40 84 | 85 | train_loader = data_loader(torch.tensor(train_X), torch.tensor(train_y), 86 | BATCH_SIZE, True) 87 | 88 | train_accs, test_accs = [], [] 89 | test_X_cuda = torch.tensor(test_X, device=device) 90 | test_y_cuda = torch.tensor(test_y, device=device) 91 | 92 | bar = tqdm(range(EPOCHES)) 93 | 94 | for epoch in bar: 95 | # 相同数据训练3个网络 96 | net1.train() 97 | net2.train() 98 | net3.train() 99 | 100 | for batch_X, batch_y in train_loader: 101 | input_, label = batch_X.to(device), batch_y.to(device) 102 | 103 | output1 = net1(input_) 104 | l1 = loss(output1, label) 105 | output2 = net2(input_) 106 | l2 = loss(output2, label) 107 | output3 = net3(input_) 108 | l3 = loss(output3, label) 109 | 110 | optim1.zero_grad() 111 | optim2.zero_grad() 112 | optim3.zero_grad() 113 | (l1 + l2 + l3).backward() 114 | optim1.step() 115 | optim2.step() 116 | optim3.step() 117 | 118 | net1.eval() 119 | net2.eval() 120 | net3.eval() 121 | 122 | # train 123 | train_right = [0, 0, 0] 124 | with torch.no_grad(): 125 | for batch_X, batch_y in train_loader: 126 | input_, label = batch_X.to(device), batch_y.to(device) 127 | pred1 = net1(input_).argmax(-1) 128 | pred2 = net2(input_).argmax(-1) 129 | pred3 = net3(input_).argmax(-1) 130 | 131 | train_right[0] += pred1.eq(label).sum().item() 132 | train_right[1] += pred2.eq(label).sum().item() 133 | train_right[2] += pred3.eq(label).sum().item() 134 | 135 | train_acc = np.array(train_right) / len(train_X) 136 | 137 | pred1, pred2, pred3 = ( 138 | net1(test_X_cuda).argmax(-1), 139 | net2(test_X_cuda).argmax(-1), 140 | net3(test_X_cuda).argmax(-1), 141 | ) 142 | test_acc = np.array([ 143 | pred1.eq(test_y_cuda).float().mean().item(), 144 | pred2.eq(test_y_cuda).float().mean().item(), 145 | pred3.eq(test_y_cuda).float().mean().item(), 146 | ]) 147 | 148 | bar.set_postfix( 149 | TRAIN_ACC="{:.3f}, {:.3f}, {:.3f}".format(*train_acc), 150 | TEST_ACC="{:.3f}, {:.3f}, {:.3f}".format(*test_acc), 151 | ) 152 | train_accs.append(train_acc) 153 | test_accs.append(test_acc) 154 | 155 | train_accs = np.array(train_accs) 156 | test_accs = np.array(test_accs) 157 | 158 | plt.figure(figsize=(9, 3)) 159 | 160 | plt.rcParams['font.sans-serif'] = ['Times New Roman'] 161 | plt.rcParams['mathtext.fontset'] = 'stix' 162 | plt.rcParams['xtick.direction'] = 'in' 163 | plt.rcParams['ytick.direction'] = 'in' 164 | plt.rcParams['axes.linewidth'] = 0.5 165 | 166 | plot_kwargs = {'linewidth': 0.7, 'zorder': 10} 167 | 168 | plt.subplot(1, 2, 1) 169 | plt.grid(zorder=-10) 170 | 171 | plt.xlim(2, 50) 172 | plt.ylim(0, 1.05) 173 | 174 | x = np.arange(0, 50, 2) + 2 175 | plt.plot(x, 176 | train_accs[::2, 0], 177 | label="MLP", 178 | color='blue', 179 | marker='^', 180 | **plot_kwargs) 181 | plt.plot(x, 182 | train_accs[::2, 1], 183 | label="MLP with Dropout", 184 | color='green', 185 | marker='s', 186 | **plot_kwargs) 187 | plt.plot(x, 188 | train_accs[::2, 2], 189 | label="MLP with BN", 190 | color='red', 191 | marker='*', 192 | **plot_kwargs) 193 | 194 | plt.yticks([0, .2, .4, .6, .8, 1], size=13) 195 | plt.xticks([10, 20, 30, 40, 50], size=13) 196 | plt.xlabel("Epochs", size=13) 197 | plt.title("Training Accuracy on Olivetti Faces Dataset") 198 | plt.legend() 199 | plt.tight_layout() 200 | 201 | plt.subplot(1, 2, 2) 202 | plt.grid(zorder=-10) 203 | 204 | plt.xlim(2, 50) 205 | plt.ylim(0, 1.) 206 | 207 | plt.plot(x, 208 | test_accs[::2, 0], 209 | label="MLP", 210 | color='blue', 211 | marker='^', 212 | **plot_kwargs) 213 | plt.plot(x, 214 | test_accs[::2, 1], 215 | label="MLP with Dropout", 216 | color='green', 217 | marker='s', 218 | **plot_kwargs) 219 | plt.plot(x, 220 | test_accs[::2, 2], 221 | label="MLP with BN", 222 | color='red', 223 | marker='*', 224 | **plot_kwargs) 225 | 226 | plt.yticks([0, .2, .4, .6, .8, 1], size=13) 227 | plt.xticks([10, 20, 30, 40, 50], size=13) 228 | plt.xlabel("Epochs", size=13) 229 | plt.title("Test Accuracy on Olivetti Faces Dataset") 230 | plt.legend() 231 | plt.tight_layout() 232 | 233 | plt.savefig("imgs/dropout_bn.png") 234 | -------------------------------------------------------------------------------- /examples/pydynet/dropout_bn.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from sklearn.datasets import fetch_olivetti_faces 3 | from sklearn.preprocessing import MinMaxScaler 4 | from sklearn.model_selection import train_test_split 5 | from tqdm import tqdm 6 | 7 | import numpy as np 8 | import pydynet as pdn 9 | import pydynet.nn.functional as F 10 | import pydynet.nn as nn 11 | from pydynet.optim import Adam 12 | from pydynet.data import data_loader 13 | 14 | data_X, data_y = fetch_olivetti_faces(return_X_y=True) 15 | print(data_X.shape) 16 | train_X, test_X, train_y, test_y = train_test_split( 17 | data_X, 18 | data_y, 19 | train_size=0.8, 20 | stratify=data_y, 21 | random_state=42, 22 | ) 23 | scaler = MinMaxScaler() 24 | train_X = scaler.fit_transform(train_X) 25 | test_X = scaler.transform(test_X) 26 | 27 | 28 | class DNN(nn.Module): 29 | 30 | def __init__(self) -> None: 31 | super().__init__() 32 | self.fc1 = nn.Linear(4096, 512, dtype=np.float32) 33 | self.fc2 = nn.Linear(512, 128, dtype=np.float32) 34 | self.fc3 = nn.Linear(128, 40, dtype=np.float32) 35 | 36 | def forward(self, x): 37 | x = F.relu(self.fc1(x)) 38 | x = F.relu(self.fc2(x)) 39 | return self.fc3(x) 40 | 41 | 42 | class DNN_dropout(DNN): 43 | 44 | def __init__(self) -> None: 45 | super().__init__() 46 | self.dropout = nn.Dropout(p=0.05) 47 | 48 | def forward(self, x): 49 | x = F.relu(self.dropout(self.fc1(x))) 50 | x = F.relu(self.dropout(self.fc2(x))) 51 | return self.fc3(x) 52 | 53 | 54 | class DNN_BN(DNN): 55 | 56 | def __init__(self) -> None: 57 | super().__init__() 58 | self.bn1 = nn.BatchNorm1d(512, dtype=np.float32) 59 | self.bn2 = nn.BatchNorm1d(128, dtype=np.float32) 60 | 61 | def forward(self, x): 62 | x = F.relu(self.bn1(self.fc1(x))) 63 | x = F.relu(self.bn2(self.fc2(x))) 64 | return self.fc3(x) 65 | 66 | 67 | np.random.seed(42) 68 | use_cuda = True 69 | device = f'cuda:{pdn.cuda.device_count() - 1}' if pdn.cuda.is_available( 70 | ) else 'cpu' 71 | 72 | net1 = DNN().to(device) 73 | net2 = DNN_dropout().to(device) 74 | net3 = DNN_BN().to(device) 75 | print(net1) 76 | print(net2) 77 | print(net3) 78 | optim1 = Adam(net1.parameters(), lr=5e-5) 79 | optim2 = Adam(net2.parameters(), lr=5e-5) 80 | optim3 = Adam(net3.parameters(), lr=5e-5) 81 | loss = nn.CrossEntropyLoss() 82 | EPOCHES = 50 83 | BATCH_SIZE = 40 84 | 85 | train_loader = data_loader(pdn.Tensor(train_X), pdn.Tensor(train_y), 86 | BATCH_SIZE, True) 87 | 88 | train_accs, test_accs = [], [] 89 | test_X_cuda = pdn.Tensor(test_X, device=device) 90 | test_y_cuda = pdn.Tensor(test_y, device=device) 91 | 92 | bar = tqdm(range(EPOCHES)) 93 | 94 | for epoch in bar: 95 | # 相同数据训练3个网络 96 | net1.train() 97 | net2.train() 98 | net3.train() 99 | 100 | for batch_X, batch_y in train_loader: 101 | input_, label = batch_X.to(device), batch_y.to(device) 102 | 103 | output1 = net1(input_) 104 | l1 = loss(output1, label) 105 | output2 = net2(input_) 106 | l2 = loss(output2, label) 107 | output3 = net3(input_) 108 | l3 = loss(output3, label) 109 | 110 | optim1.zero_grad() 111 | optim2.zero_grad() 112 | optim3.zero_grad() 113 | (l1 + l2 + l3).backward() 114 | optim1.step() 115 | optim2.step() 116 | optim3.step() 117 | 118 | net1.eval() 119 | net2.eval() 120 | net3.eval() 121 | 122 | # train 123 | train_right = [0, 0, 0] 124 | with pdn.no_grad(): 125 | for batch_X, batch_y in train_loader: 126 | input_, label = batch_X.to(device), batch_y.to(device) 127 | pred1 = net1(input_).argmax(-1) 128 | pred2 = net2(input_).argmax(-1) 129 | pred3 = net3(input_).argmax(-1) 130 | 131 | train_right[0] += pred1.eq(label).sum().item() 132 | train_right[1] += pred2.eq(label).sum().item() 133 | train_right[2] += pred3.eq(label).sum().item() 134 | 135 | train_acc = np.array(train_right) / len(train_X) 136 | 137 | pred1, pred2, pred3 = ( 138 | net1(test_X_cuda).argmax(-1), 139 | net2(test_X_cuda).argmax(-1), 140 | net3(test_X_cuda).argmax(-1), 141 | ) 142 | test_acc = np.array([ 143 | pred1.eq(test_y_cuda.data).mean().item(), 144 | pred2.eq(test_y_cuda.data).mean().item(), 145 | pred3.eq(test_y_cuda.data).mean().item(), 146 | ]) 147 | 148 | bar.set_postfix( 149 | TRAIN_ACC="{:.3f}, {:.3f}, {:.3f}".format(*train_acc), 150 | TEST_ACC="{:.3f}, {:.3f}, {:.3f}".format(*test_acc), 151 | ) 152 | train_accs.append(train_acc) 153 | test_accs.append(test_acc) 154 | 155 | train_accs = np.array(train_accs) 156 | test_accs = np.array(test_accs) 157 | 158 | plt.figure(figsize=(9, 3)) 159 | 160 | plt.rcParams['font.sans-serif'] = ['Times New Roman'] 161 | plt.rcParams['mathtext.fontset'] = 'stix' 162 | plt.rcParams['xtick.direction'] = 'in' 163 | plt.rcParams['ytick.direction'] = 'in' 164 | plt.rcParams['axes.linewidth'] = 0.5 165 | 166 | plot_kwargs = {'linewidth': 0.7, 'zorder': 10} 167 | 168 | plt.subplot(1, 2, 1) 169 | plt.grid(zorder=-10) 170 | 171 | plt.xlim(2, 50) 172 | plt.ylim(0, 1.05) 173 | 174 | x = np.arange(0, 50, 2) + 2 175 | plt.plot(x, 176 | train_accs[::2, 0], 177 | label="MLP", 178 | color='blue', 179 | marker='^', 180 | **plot_kwargs) 181 | plt.plot(x, 182 | train_accs[::2, 1], 183 | label="MLP with Dropout", 184 | color='green', 185 | marker='s', 186 | **plot_kwargs) 187 | plt.plot(x, 188 | train_accs[::2, 2], 189 | label="MLP with BN", 190 | color='red', 191 | marker='*', 192 | **plot_kwargs) 193 | 194 | plt.yticks([0, .2, .4, .6, .8, 1], size=13) 195 | plt.xticks([10, 20, 30, 40, 50], size=13) 196 | plt.xlabel("Epochs", size=13) 197 | plt.title("Training Accuracy on Olivetti Faces Dataset") 198 | plt.legend() 199 | plt.tight_layout() 200 | 201 | plt.subplot(1, 2, 2) 202 | plt.grid(zorder=-10) 203 | 204 | plt.xlim(2, 50) 205 | plt.ylim(0, 1.) 206 | 207 | plt.plot(x, 208 | test_accs[::2, 0], 209 | label="MLP", 210 | color='blue', 211 | marker='^', 212 | **plot_kwargs) 213 | plt.plot(x, 214 | test_accs[::2, 1], 215 | label="MLP with Dropout", 216 | color='green', 217 | marker='s', 218 | **plot_kwargs) 219 | plt.plot(x, 220 | test_accs[::2, 2], 221 | label="MLP with BN", 222 | color='red', 223 | marker='*', 224 | **plot_kwargs) 225 | 226 | plt.yticks([0, .2, .4, .6, .8, 1], size=13) 227 | plt.xticks([10, 20, 30, 40, 50], size=13) 228 | plt.xlabel("Epochs", size=13) 229 | plt.title("Test Accuracy on Olivetti Faces Dataset") 230 | plt.legend() 231 | plt.tight_layout() 232 | 233 | plt.savefig("imgs/dropout_bn.png") 234 | -------------------------------------------------------------------------------- /pydynet/core/function.py: -------------------------------------------------------------------------------- 1 | from .tensor import Tensor, swapaxes 2 | 3 | 4 | def sqrt(x: Tensor): 5 | '''平方根函数''' 6 | return x**0.5 7 | 8 | 9 | def square(x: Tensor): 10 | '''平方函数''' 11 | return x * x 12 | 13 | 14 | def vsplit(x: Tensor, indices_or_sections: int | tuple) -> list[Tensor]: 15 | if not isinstance(x, Tensor): 16 | x = Tensor(x) 17 | 18 | try: 19 | len(indices_or_sections) 20 | except TypeError: 21 | sections = indices_or_sections 22 | N = x.shape[0] 23 | assert N % sections == 0, 'array split does not result in an equal division' 24 | 25 | Ntotal = x.shape[0] 26 | try: 27 | # handle array case. 28 | Nsections = len(indices_or_sections) + 1 29 | div_points = [0] + list(indices_or_sections) + [Ntotal] 30 | except TypeError: 31 | # indices_or_sections is a scalar, not an array. 32 | Nsections = int(indices_or_sections) 33 | if Nsections <= 0: 34 | raise ValueError( 35 | 'number sections must be larger than 0.') from None 36 | Neach_section, extras = divmod(Ntotal, Nsections) 37 | section_sizes = ([0] + extras * [Neach_section + 1] + 38 | (Nsections - extras) * [Neach_section]) 39 | div_points = x.xp.array(section_sizes, dtype=x.xp.intp).cumsum() 40 | 41 | sub_tensors = [] 42 | for i in range(Nsections): 43 | st = div_points[i] 44 | end = div_points[i + 1] 45 | sub_tensors.append(x[st:end]) 46 | 47 | return sub_tensors 48 | 49 | 50 | def hsplit(x: Tensor, indices_or_sections: int | tuple) -> list[Tensor]: 51 | if not isinstance(x, Tensor): 52 | x = Tensor(x) 53 | 54 | try: 55 | len(indices_or_sections) 56 | except TypeError: 57 | sections = indices_or_sections 58 | N = x.shape[1] 59 | assert N % sections == 0, 'array split does not result in an equal division' 60 | 61 | Ntotal = x.shape[1] 62 | try: 63 | # handle array case. 64 | Nsections = len(indices_or_sections) + 1 65 | div_points = [0] + list(indices_or_sections) + [Ntotal] 66 | except TypeError: 67 | # indices_or_sections is a scalar, not an array. 68 | Nsections = int(indices_or_sections) 69 | if Nsections <= 0: 70 | raise ValueError( 71 | 'number sections must be larger than 0.') from None 72 | Neach_section, extras = divmod(Ntotal, Nsections) 73 | section_sizes = ([0] + extras * [Neach_section + 1] + 74 | (Nsections - extras) * [Neach_section]) 75 | div_points = x.xp.array(section_sizes, dtype=x.xp.intp).cumsum() 76 | 77 | sub_tensors = [] 78 | for i in range(Nsections): 79 | st = div_points[i] 80 | end = div_points[i + 1] 81 | sub_tensors.append(x[:, st:end]) 82 | 83 | return sub_tensors 84 | 85 | 86 | def dsplit(x: Tensor, indices_or_sections: int | tuple) -> list[Tensor]: 87 | if not isinstance(x, Tensor): 88 | x = Tensor(x) 89 | 90 | try: 91 | len(indices_or_sections) 92 | except TypeError: 93 | sections = indices_or_sections 94 | N = x.shape[2] 95 | assert N % sections == 0, 'array split does not result in an equal division' 96 | 97 | Ntotal = x.shape[2] 98 | try: 99 | # handle array case. 100 | Nsections = len(indices_or_sections) + 1 101 | div_points = [0] + list(indices_or_sections) + [Ntotal] 102 | except TypeError: 103 | # indices_or_sections is a scalar, not an array. 104 | Nsections = int(indices_or_sections) 105 | if Nsections <= 0: 106 | raise ValueError( 107 | 'number sections must be larger than 0.') from None 108 | Neach_section, extras = divmod(Ntotal, Nsections) 109 | section_sizes = ([0] + extras * [Neach_section + 1] + 110 | (Nsections - extras) * [Neach_section]) 111 | div_points = x.xp.array(section_sizes, dtype=x.xp.intp).cumsum() 112 | 113 | sub_tensors = [] 114 | for i in range(Nsections): 115 | st = div_points[i] 116 | end = div_points[i + 1] 117 | sub_tensors.append(x[:, :, st:end]) 118 | 119 | return sub_tensors 120 | 121 | 122 | def split( 123 | x: Tensor, 124 | indices_or_sections: int | tuple, 125 | axis: int = 0, 126 | ) -> list[Tensor]: 127 | if not isinstance(x, Tensor): 128 | x = Tensor(x) 129 | 130 | if axis == 0 or axis == -x.ndim: 131 | return vsplit(x, indices_or_sections) 132 | elif axis == 1 or axis == -x.ndim + 1: 133 | return hsplit(x, indices_or_sections) 134 | elif axis == 2 or axis == -x.ndim + 2: 135 | return dsplit(x, indices_or_sections) 136 | 137 | try: 138 | len(indices_or_sections) 139 | except TypeError: 140 | sections = indices_or_sections 141 | N = x.shape[axis] 142 | assert N % sections == 0, 'array split does not result in an equal division' 143 | 144 | Ntotal = x.shape[axis] 145 | try: 146 | # handle array case. 147 | Nsections = len(indices_or_sections) + 1 148 | div_points = [0] + list(indices_or_sections) + [Ntotal] 149 | except TypeError: 150 | # indices_or_sections is a scalar, not an array. 151 | Nsections = int(indices_or_sections) 152 | if Nsections <= 0: 153 | raise ValueError( 154 | 'number sections must be larger than 0.') from None 155 | Neach_section, extras = divmod(Ntotal, Nsections) 156 | section_sizes = ([0] + extras * [Neach_section + 1] + 157 | (Nsections - extras) * [Neach_section]) 158 | div_points = x.xp.array(section_sizes, dtype=x.xp.intp).cumsum() 159 | 160 | sub_tensors = [] 161 | stensor = swapaxes(x, 0, axis) 162 | for i in range(Nsections): 163 | st = div_points[i] 164 | end = div_points[i + 1] 165 | sub_tensors.append(swapaxes(stensor[st:end], axis, 0)) 166 | return sub_tensors 167 | 168 | 169 | def unsqueeze(x: Tensor, axis): 170 | '''等价于numpy的expand_dims, 因此我们借用了expand_dims的源码''' 171 | from numpy.core.numeric import normalize_axis_tuple 172 | 173 | if type(axis) not in (tuple, list): 174 | axis = (axis, ) 175 | 176 | out_ndim = len(axis) + x.ndim 177 | axis = normalize_axis_tuple(axis, out_ndim) 178 | 179 | shape_it = iter(x.shape) 180 | shape = [1 if ax in axis else next(shape_it) for ax in range(out_ndim)] 181 | return x.reshape(*shape) 182 | 183 | 184 | def squeeze(x: Tensor, axis=None): 185 | shape = x.shape 186 | if axis is None: 187 | new_shape = tuple(dim for dim in shape if dim != 1) 188 | else: 189 | if isinstance(axis, int): 190 | axis = (axis, ) 191 | axis = tuple(axis) 192 | 193 | for ax in axis: 194 | if ax >= len(shape) or ax < -len(shape): 195 | raise ValueError("Axis out of range") 196 | if shape[ax] != 1: 197 | raise ValueError( 198 | f"Cannot squeeze axis {ax} with size {shape[ax]}") 199 | 200 | # 构造新形状,排除指定轴 201 | new_shape = tuple(dim for i, dim in enumerate(shape) if i not in axis) 202 | 203 | # 返回重塑后的数组 204 | return x.reshape(*new_shape) 205 | -------------------------------------------------------------------------------- /llm/llama/model.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | 4 | import pydynet as pdn 5 | from pydynet.core.tensor import Tensor 6 | import pydynet.nn as nn 7 | import pydynet.nn.functional as F 8 | 9 | 10 | def compute_cos_sin_cache(head_dim: int, 11 | max_seq_len: int, 12 | base: int = 10000, 13 | dtype=None): 14 | inv_freq = 1.0 / (base**(np.arange(0, head_dim, 2)[:(head_dim // 2)] / 15 | head_dim)) 16 | t = np.arange(max_seq_len) 17 | freqs = np.outer(t, inv_freq).astype(dtype) 18 | 19 | return Tensor(np.cos(freqs)), Tensor(np.sin(freqs)) 20 | 21 | 22 | def apply_rotary_emb(xq: Tensor, xk: Tensor, freqs_cos, freqs_sin): 23 | xqri = xq.reshape(*(xq.shape[:-1] + (-1, 2))) 24 | xkri = xk.reshape(*(xk.shape[:-1] + (-1, 2))) 25 | 26 | xq_r, xq_i = xqri[..., 0], xqri[..., 1] 27 | xk_r, xk_i = xkri[..., 0], xkri[..., 1] 28 | 29 | freqs_cos = pdn.unsqueeze(freqs_cos, axis=-2) 30 | freqs_sin = pdn.unsqueeze(freqs_sin, axis=-2) 31 | 32 | # Apply rotation using real numbers. 33 | xq_out_r = pdn.unsqueeze(xq_r * freqs_cos - xq_i * freqs_sin, -1) 34 | xq_out_i = pdn.unsqueeze(xq_r * freqs_sin + xq_i * freqs_cos, -1) 35 | xk_out_r = pdn.unsqueeze(xk_r * freqs_cos - xk_i * freqs_sin, -1) 36 | xk_out_i = pdn.unsqueeze(xk_r * freqs_sin + xk_i * freqs_cos, -1) 37 | 38 | # Flatten last two dimensions. 39 | xq_out = pdn.concat([xq_out_r, xq_out_i], axis=-1) 40 | xk_out = pdn.concat([xk_out_r, xk_out_i], axis=-1) 41 | xq_out = xq_out.reshape(*(xq_out.shape[:-2] + (-1, ))) 42 | xk_out = xk_out.reshape(*(xk_out.shape[:-2] + (-1, ))) 43 | return xq_out, xk_out 44 | 45 | 46 | class FeedForward(nn.Module): 47 | 48 | def __init__(self, dim, up_dim, dtype=None): 49 | super().__init__() 50 | self.dim, self.up_dim = dim, up_dim 51 | self.up = nn.Linear(dim, up_dim, bias=False, dtype=dtype) 52 | self.gate = nn.Linear(dim, up_dim, bias=False, dtype=dtype) 53 | self.down = nn.Linear(up_dim, dim, bias=False, dtype=dtype) 54 | 55 | def forward(self, x): 56 | swish, x_V = F.silu(self.gate(x)), self.up(x) 57 | return self.down(swish * x_V) 58 | 59 | 60 | class Attention(nn.Module): 61 | 62 | def __init__( 63 | self, 64 | dim: int, 65 | n_heads: int, 66 | max_seq_len: int, 67 | max_batch_size: int = None, 68 | dtype=None, 69 | ): 70 | super().__init__() 71 | self.dim = dim 72 | self.n_heads = n_heads 73 | 74 | assert dim % n_heads == 0 75 | self.head_dim = dim // n_heads 76 | 77 | self.Q = nn.Linear(self.dim, self.dim, bias=False, dtype=dtype) 78 | self.K = nn.Linear(self.dim, self.dim, bias=False, dtype=dtype) 79 | self.V = nn.Linear(self.dim, self.dim, bias=False, dtype=dtype) 80 | self.O = nn.Linear(self.dim, self.dim, bias=False, dtype=dtype) 81 | 82 | self.max_seq_len = max_seq_len 83 | self.max_batch_size = max_batch_size if max_batch_size is not None else 1 84 | 85 | self.cache_k = nn.Parameter(pdn.special.zeros( 86 | (self.max_batch_size, max_seq_len, self.n_heads, self.head_dim), 87 | dtype=dtype), 88 | requires_grad=False) 89 | self.cache_v = nn.Parameter(pdn.special.zeros( 90 | (self.max_batch_size, max_seq_len, self.n_heads, self.head_dim), 91 | dtype=dtype), 92 | requires_grad=False) 93 | 94 | def __call__(self, x, start_pos: int, mask, freqs_cos, freqs_sin): 95 | B, L, _ = x.shape 96 | xq, xk, xv = ( 97 | self.Q(x).reshape(B, L, self.n_heads, self.head_dim), 98 | self.K(x).reshape(B, L, self.n_heads, self.head_dim), 99 | self.V(x).reshape(B, L, self.n_heads, self.head_dim), 100 | ) 101 | 102 | xq, xk = apply_rotary_emb(xq, xk, freqs_cos, freqs_sin) 103 | 104 | if not self._train: 105 | self.cache_k[:B, start_pos:start_pos + L] = xk 106 | self.cache_v[:B, start_pos:start_pos + L] = xv 107 | 108 | xk = self.cache_k[:B, :start_pos + L] 109 | xv = self.cache_v[:B, :start_pos + L] 110 | 111 | xq, xkT = xq.transpose(0, 2, 1, 3), xk.transpose(0, 2, 3, 1) 112 | attention = xq @ xkT / math.sqrt(self.head_dim) 113 | 114 | if mask is not None: 115 | attention = attention + mask 116 | attention = F.softmax(attention, axis=-1) 117 | output = attention @ xv.transpose(0, 2, 1, 3) 118 | 119 | output = output.transpose(0, 2, 1, 3).reshape(B, L, -1) 120 | return self.O(output) 121 | 122 | 123 | class TransformerBlock(nn.Module): 124 | 125 | def __init__( 126 | self, 127 | dim: int, 128 | n_heads: int, 129 | ffn_dim: int, 130 | max_seq_len: int, 131 | max_batch_size: int = None, 132 | dtype=None, 133 | ): 134 | super().__init__() 135 | self.attention = Attention(dim, n_heads, max_seq_len, max_batch_size, 136 | dtype) 137 | self.ffn = FeedForward(dim, ffn_dim, dtype) 138 | self.input_norm = nn.RMSNorm(dim, dtype=dtype) 139 | self.post_attn_norm = nn.RMSNorm(dim, dtype=dtype) 140 | 141 | def forward(self, x, start_pos: int, mask, freqs_cos, freqs_sin): 142 | norm_x = self.input_norm(x) 143 | 144 | h1 = self.attention(norm_x, start_pos, mask, freqs_cos, freqs_sin) 145 | z = x + h1 146 | 147 | norm_z = self.post_attn_norm(z) 148 | h2 = self.ffn(norm_z) 149 | return z + h2 150 | 151 | 152 | class Llama(nn.Module): 153 | 154 | def __init__( 155 | self, 156 | vocab_size, 157 | embed_dim, 158 | n_heads, 159 | ffn_dim: int, 160 | max_seq_len: int, 161 | max_batch_size: int = None, 162 | n_layers: int = 6, 163 | dtype=None, 164 | ): 165 | super().__init__() 166 | self.vocab_size = vocab_size 167 | self.embed_dim = embed_dim 168 | self.n_heads = n_heads 169 | self.ffn_dim = ffn_dim 170 | self.max_seq_len = max_seq_len 171 | self.max_batch_size = max_batch_size 172 | self.n_layers = n_layers 173 | 174 | self.tok_embedding = nn.Embedding(vocab_size, embed_dim, dtype=dtype) 175 | freqs_cos, freqs_sin = compute_cos_sin_cache(embed_dim // n_heads, 176 | max_seq_len, 177 | dtype=dtype) 178 | 179 | self.freqs_cos = nn.parameter.Parameter(freqs_cos, False) 180 | self.freqs_sin = nn.parameter.Parameter(freqs_sin, False) 181 | 182 | self.layers = nn.ModuleList([ 183 | TransformerBlock(embed_dim, n_heads, ffn_dim, max_seq_len, 184 | max_batch_size, dtype) 185 | for _ in range(self.n_layers) 186 | ]) 187 | 188 | self.norm = nn.RMSNorm(embed_dim, dtype=dtype) 189 | self.lm_head = nn.Linear(embed_dim, vocab_size, dtype=dtype) 190 | 191 | def forward(self, input_ids, start_pos: int): 192 | L = input_ids.shape[-1] 193 | h = self.tok_embedding(input_ids) 194 | 195 | freqs_cos = self.freqs_cos[start_pos:start_pos + L] 196 | freqs_sin = self.freqs_sin[start_pos:start_pos + L] 197 | 198 | mask = None 199 | if L > 1: 200 | mask = np.triu(np.full((L, L), float("-inf")), k=1) 201 | mask = np.concatenate([np.zeros((L, start_pos)), mask], axis=1) 202 | mask = pdn.Tensor(mask, device=h.device, dtype=h.dtype) 203 | 204 | for layer in self.layers: 205 | h = layer(h, start_pos, mask, freqs_cos, freqs_sin) 206 | 207 | logit = self.lm_head(self.norm(h)[:, [-1], :]) 208 | return logit 209 | 210 | def generate(self, input_ids, max_new_tokens: int): 211 | _, L = input_ids.shape 212 | for i, curr_pos in enumerate(range(L, max_new_tokens)): 213 | if i == 0: # Prefill Phase 214 | inputs = input_ids 215 | pos = 0 216 | else: # Decode Phase 217 | inputs = next_id 218 | pos = curr_pos 219 | logits = self(inputs, pos) 220 | next_id = logits[:, -1, :].argmax(-1, True) 221 | yield next_id 222 | -------------------------------------------------------------------------------- /pydynet/nn/modules/norm.py: -------------------------------------------------------------------------------- 1 | from .module import Module 2 | from ..parameter import Parameter 3 | from .. import init 4 | from ...special import empty 5 | from ... import core 6 | from ...cuda import Device 7 | 8 | 9 | class BatchNorm1d(Module): 10 | ''' 11 | 一维Batch Normalization层 12 | 13 | Parameters 14 | ---------- 15 | num_features : int 16 | 输入特征数. 17 | eps : float, default=1e-5 18 | 防止除数为0的极小项. 19 | momentum : float, default=0.5 20 | 计算累积均值和方差的动量项. 21 | device : Optional[Device], default=None 22 | 层数据所在的设备. 23 | dtype : default=Nonr 24 | 层数据的类型. 25 | ''' 26 | 27 | def __init__( 28 | self, 29 | num_features: int, 30 | eps: float = 1e-6, 31 | momentum: float = 0.1, 32 | device=None, 33 | dtype=None, 34 | ) -> None: 35 | super().__init__() 36 | kwargs = {"device": Device(device), "dtype": dtype} 37 | self.num_features = num_features 38 | self.eps = eps 39 | self.momentum = momentum 40 | self.running_mean = Parameter( 41 | empty(self.num_features, **kwargs), 42 | requires_grad=False, 43 | ) 44 | self.running_var = Parameter( 45 | empty(self.num_features, **kwargs), 46 | requires_grad=False, 47 | ) 48 | self.scale = Parameter(empty(self.num_features, **kwargs)) 49 | self.shift = Parameter(empty(self.num_features, **kwargs)) 50 | self.reset_parameters() 51 | 52 | def reset_parameters(self): 53 | init.zeros_(self.running_mean) 54 | init.ones_(self.running_var) 55 | init.zeros_(self.shift) 56 | init.ones_(self.scale) 57 | 58 | def forward(self, x): 59 | if self._train: 60 | mean = x.mean(0) 61 | center_data = x - mean 62 | var = core.mean(core.square(center_data), 0) 63 | std_data = center_data / core.sqrt(var + self.eps) 64 | 65 | self.running_mean *= (1 - self.momentum) 66 | self.running_mean += self.momentum * mean 67 | self.running_var *= (1 - self.momentum) 68 | self.running_var += self.momentum * var 69 | 70 | return std_data * self.scale + self.shift 71 | else: 72 | return (x - self.running_mean) * self.scale / core.sqrt( 73 | self.running_var + self.eps) + self.shift 74 | 75 | def __repr__(self) -> str: 76 | return "{}(num_features={}, momentum={})".format( 77 | self.__class__.__name__, 78 | self.num_features, 79 | self.momentum, 80 | ) 81 | 82 | 83 | class BatchNorm2d(Module): 84 | ''' 85 | 二维Batch Normalization层 86 | 87 | Parameters 88 | ---------- 89 | num_features : int 90 | 输入特征数(通道数). 91 | eps : float, default=1e-5 92 | 防止除数为0的极小项. 93 | momentum : float, default=0.5 94 | 计算累积均值和方差的动量项. 95 | device : Optional[Device], default=None 96 | 层数据所在的设备. 97 | dtype : default=None 98 | 层数据的类型. 99 | ''' 100 | 101 | def __init__( 102 | self, 103 | num_features: int, 104 | eps: float = 1e-6, 105 | momentum: float = 0.1, 106 | device=None, 107 | dtype=None, 108 | ) -> None: 109 | super().__init__() 110 | kwargs = {"device": Device(device), "dtype": dtype} 111 | self.num_features = num_features 112 | self.eps = eps 113 | self.momentum = momentum 114 | self.running_mean = Parameter( 115 | empty((1, self.num_features, 1, 1), **kwargs), 116 | requires_grad=False, 117 | ) 118 | self.running_var = Parameter( 119 | empty((1, self.num_features, 1, 1), **kwargs), 120 | requires_grad=False, 121 | ) 122 | self.scale = Parameter(empty(1, self.num_features, 1, 1, **kwargs)) 123 | self.shift = Parameter(empty((1, self.num_features, 1, 1), **kwargs)) 124 | self.reset_parameters() 125 | 126 | def reset_parameters(self): 127 | init.zeros_(self.running_mean) 128 | init.ones_(self.running_var) 129 | init.zeros_(self.shift) 130 | init.ones_(self.scale) 131 | 132 | def forward(self, x): 133 | if self._train: 134 | mean = x.mean((0, 2, 3), keepdims=True) 135 | center_data = x - mean 136 | var = core.mean(core.square(center_data), (0, 2, 3), keepdims=True) 137 | std_data = center_data / core.sqrt(var + self.eps) 138 | 139 | self.running_mean *= (1 - self.momentum) 140 | self.running_mean += self.momentum * mean 141 | self.running_var *= (1 - self.momentum) 142 | self.running_var += self.momentum * var 143 | 144 | return std_data * self.scale + self.shift 145 | else: 146 | return (x - self.running_mean) * self.scale / core.sqrt( 147 | self.running_var + self.eps) + self.shift 148 | 149 | def __repr__(self) -> str: 150 | return "{}(num_features={}, momentum={})".format( 151 | self.__class__.__name__, 152 | self.num_features, 153 | self.momentum, 154 | ) 155 | 156 | 157 | class LayerNorm(Module): 158 | ''' 159 | Layer Normalization 160 | 161 | Parameters 162 | ---------- 163 | normalized_shape : Tuple[int] 164 | eps : float, default=1e-5 165 | momentum : float, default=0.5 166 | device : Optional[Device], default=None 167 | dtype : default=None 168 | ''' 169 | 170 | def __init__( 171 | self, 172 | normalized_shape: int, 173 | eps: float = 1e-6, 174 | momentum: float = 0.1, 175 | device=None, 176 | dtype=None, 177 | ) -> None: 178 | super().__init__() 179 | kwargs = {"device": Device(device), "dtype": dtype} 180 | if isinstance(normalized_shape, int): 181 | normalized_shape = (normalized_shape, ) 182 | self.normalized_shape = tuple(normalized_shape) 183 | self.eps = eps 184 | self.momentum = momentum 185 | self.running_mean = Parameter( 186 | empty(normalized_shape, **kwargs), 187 | requires_grad=False, 188 | ) 189 | self.running_var = Parameter( 190 | empty(normalized_shape, **kwargs), 191 | requires_grad=False, 192 | ) 193 | self.scale = Parameter(empty(*normalized_shape, **kwargs)) 194 | self.shift = Parameter(empty(normalized_shape, **kwargs)) 195 | self.reset_parameters() 196 | 197 | def reset_parameters(self): 198 | init.zeros_(self.running_mean) 199 | init.ones_(self.running_var) 200 | init.zeros_(self.shift) 201 | init.ones_(self.scale) 202 | 203 | def forward(self, x): 204 | if self._train: 205 | axis = tuple(range(x.ndim - len(self.normalized_shape))) 206 | mean = x.mean(axis) 207 | center_data = x - mean 208 | var = core.square(center_data).mean(axis) 209 | std_data = center_data / core.sqrt(var + self.eps) 210 | self.running_mean *= (1 - self.momentum) 211 | self.running_mean += self.momentum * mean 212 | self.running_var *= (1 - self.momentum) 213 | self.running_var += self.momentum * var 214 | 215 | return std_data * self.scale + self.shift 216 | else: 217 | return (x - self.running_mean) * self.scale / core.sqrt( 218 | self.running_var + self.eps) + self.shift 219 | 220 | 221 | class RMSNorm(Module): 222 | 223 | def __init__( 224 | self, 225 | normalized_shape: tuple, 226 | eps: float = 1e-6, 227 | device=None, 228 | dtype=None, 229 | ): 230 | super().__init__() 231 | kwargs = {"device": Device(device), "dtype": dtype} 232 | if isinstance(normalized_shape, int): 233 | normalized_shape = (normalized_shape, ) 234 | self.normalized_shape = tuple(normalized_shape) 235 | self.sum_axis = tuple( 236 | [-(i + 1) for i in range(len(self.normalized_shape))]) 237 | self.eps = eps 238 | 239 | self.weight = Parameter(empty(self.normalized_shape, **kwargs)) 240 | self.reset_parameters() 241 | 242 | def reset_parameters(self): 243 | init.ones_(self.weight) 244 | 245 | def forward(self, x): 246 | z = core.square(x).mean(self.sum_axis, keepdims=True) 247 | z = x / core.sqrt(z + self.eps) 248 | return z * self.weight 249 | -------------------------------------------------------------------------------- /examples/pytorch/transformer.py: -------------------------------------------------------------------------------- 1 | from os.path import join 2 | from tqdm import tqdm 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.optim import Adam 8 | from pydynet.data import data_loader 9 | 10 | import numpy as np 11 | 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | from sklearn.model_selection import train_test_split 15 | 16 | np.random.seed(42) 17 | 18 | path = r'examples/data/CoLA/tokenized' 19 | 20 | 21 | def extract(line: str): 22 | lines = line.split('\t') 23 | y = int(lines[1]) 24 | sentence = lines[-1][:-1] 25 | return sentence.split(), y 26 | 27 | 28 | def load_data(): 29 | 30 | with open(join(path, 'in_domain_train.tsv'), 'r', encoding='utf-8') as f: 31 | lines = f.readlines() 32 | 33 | sens, ys = [], [] 34 | max_len = -1 35 | word_dict = set() 36 | for line in tqdm(lines): 37 | x, y = extract(line) 38 | word_dict = word_dict.union(set(x)) 39 | max_len = max(max_len, len(x)) 40 | sens.append(x) 41 | ys.append(y) 42 | word_dict = list(word_dict) 43 | 44 | X = np.zeros((len(lines), max_len), dtype=int) 45 | for i in tqdm(range(len(lines))): 46 | for j, word in enumerate(sens[i]): 47 | X[i, j] = word_dict.index(word) + 1 48 | y = np.array(ys) 49 | 50 | return X, y 51 | 52 | 53 | class SelfAttention(nn.Module): 54 | 55 | def __init__(self, embed_size, heads): 56 | super(SelfAttention, self).__init__() 57 | self.embed_size = embed_size 58 | self.heads = heads 59 | self.head_dim = embed_size // heads 60 | 61 | assert (self.head_dim * heads == embed_size 62 | ), "Embedding size needs to be divisible by heads" 63 | 64 | self.Q = nn.Linear(self.embed_size, self.embed_size, bias=False) 65 | self.K = nn.Linear(self.embed_size, self.embed_size, bias=False) 66 | self.V = nn.Linear(self.embed_size, self.embed_size, bias=False) 67 | self.O = nn.Linear(self.embed_size, self.embed_size, bias=False) 68 | 69 | def forward(self, values, keys, query, mask): 70 | N = query.shape[0] 71 | value_len, key_len, query_len = values.shape[1], keys.shape[ 72 | 1], query.shape[1] 73 | 74 | xq, xk, xv = ( 75 | self.Q(values).reshape(N, value_len, self.heads, self.head_dim), 76 | self.K(values).reshape(N, key_len, self.heads, self.head_dim), 77 | self.V(values).reshape(N, query_len, self.heads, self.head_dim), 78 | ) 79 | 80 | # Split the embedding into self.heads different pieces 81 | xq, xkT = xq.permute(0, 2, 1, 3), xk.permute(0, 2, 3, 1) 82 | attention = xq @ xkT / self.head_dim**.5 83 | 84 | if mask is not None: 85 | mask[mask.eq(1)] = -torch.inf 86 | attention = attention + mask 87 | 88 | attention = F.softmax(attention, dim=-1) 89 | output = attention @ xv.permute(0, 2, 1, 3) 90 | 91 | output = output.permute(0, 2, 1, 3).reshape(N, value_len, -1) 92 | return self.O(output) 93 | 94 | 95 | class TransformerBlock(nn.Module): 96 | 97 | def __init__(self, embed_size, heads, dropout, forward_expansion): 98 | super(TransformerBlock, self).__init__() 99 | self.attention = SelfAttention(embed_size, heads) 100 | self.norm1 = nn.LayerNorm(embed_size) 101 | self.norm2 = nn.LayerNorm(embed_size) 102 | 103 | self.feed_forward = nn.Sequential( 104 | nn.Linear( 105 | embed_size, 106 | forward_expansion * embed_size, 107 | ), 108 | nn.ReLU(), 109 | nn.Linear( 110 | forward_expansion * embed_size, 111 | embed_size, 112 | ), 113 | ) 114 | 115 | def forward(self, value, key, query, mask): 116 | attention = self.attention(value, key, query, mask) 117 | x = (self.norm1(attention + query)) 118 | forward = self.feed_forward(x) 119 | out = (self.norm2(forward + x)) 120 | return out 121 | 122 | 123 | def sinusoidal_positional_encoding(max_len: int, d_model: int): 124 | position = np.arange(max_len)[:, np.newaxis] 125 | div_term = np.exp(np.arange(0, d_model, 2) * (-np.log(10000.0) / d_model)) 126 | pe = np.zeros((max_len, d_model)) 127 | pe[:, 0::2] = np.sin(position * div_term) 128 | pe[:, 1::2] = np.cos(position * div_term) 129 | 130 | return torch.tensor(pe.astype(np.float32)) 131 | 132 | 133 | @torch.no_grad() 134 | def construct_mask(x: torch.Tensor, padding_idx=0): 135 | mask = x.eq(padding_idx) # [batch_size, seq_len] 136 | return torch.unsqueeze(mask, 1).unsqueeze(1) # [batch_size, 1, 1, seq_len] 137 | 138 | 139 | class Transformer(nn.Module): 140 | 141 | def __init__( 142 | self, 143 | embed_size, 144 | num_layers, 145 | heads, 146 | forward_expansion, 147 | dropout, 148 | vocab_size, 149 | max_length, 150 | ): 151 | super(Transformer, self).__init__() 152 | self.embed_size = embed_size 153 | self.word_embedding = nn.Embedding( 154 | vocab_size, 155 | embed_size, 156 | padding_idx=0, 157 | ) 158 | self.position_embedding = nn.Parameter( 159 | sinusoidal_positional_encoding(max_length, embed_size), False) 160 | 161 | self.layers = nn.ModuleList([ 162 | TransformerBlock( 163 | embed_size, 164 | heads, 165 | dropout=dropout, 166 | forward_expansion=forward_expansion, 167 | ) for _ in range(num_layers) 168 | ]) 169 | 170 | self.fc_out = nn.Linear(embed_size, 1) 171 | 172 | def forward(self, x, mask): 173 | a = self.word_embedding(x) 174 | out = a + self.position_embedding 175 | 176 | for layer in self.layers: 177 | out = layer(out, out, out, mask) 178 | 179 | out = out[:, 0, :] 180 | return self.fc_out(out) 181 | 182 | 183 | if __name__ == "__main__": 184 | LR = 5e-4 185 | EPOCHES = 100 186 | TRAIN_BATCH_SIZE = 128 187 | TEST_BATCH_SIZE = 512 188 | use_cuda = True 189 | 190 | device = 'cuda' if torch.cuda.is_available() and use_cuda else 'cpu' 191 | 192 | X, y = load_data() 193 | y[y == 0] = -1 194 | 195 | train_X, test_X, train_y, test_y = train_test_split( 196 | torch.tensor(X), 197 | torch.tensor(y), 198 | train_size=0.8, 199 | stratify=y, 200 | shuffle=True, 201 | ) 202 | 203 | ratio_pos = (train_y.float().mean() + 1) / 2 204 | 205 | train_loader = data_loader( 206 | train_X, 207 | train_y, 208 | shuffle=False, 209 | batch_size=TRAIN_BATCH_SIZE, 210 | ) 211 | test_loader = data_loader( 212 | test_X, 213 | test_y, 214 | shuffle=False, 215 | batch_size=TEST_BATCH_SIZE, 216 | ) 217 | 218 | net = Transformer(512, 1, 4, 3, 0.05, X.max() + 1, 44).to(device) 219 | optimizer = Adam(net.parameters(), lr=LR) 220 | bar = tqdm(range(EPOCHES)) 221 | info_list = [] 222 | for epoch in bar: 223 | 224 | net.train() 225 | 226 | for batch_X, batch_y in train_loader: 227 | input_, label = batch_X.to(device), batch_y.to(device) 228 | output = net(input_, construct_mask(input_)) 229 | weight = torch.ones(label.shape) 230 | weight[label == -1] = 1 / (1 - ratio_pos) 231 | weight[label == 1] = 1 / ratio_pos 232 | loss = (weight.to(device) * 233 | torch.log(1 + torch.exp(-label * torch.squeeze(output))) 234 | ).mean() 235 | optimizer.zero_grad() 236 | loss.backward() 237 | optimizer.step() 238 | 239 | net.eval() 240 | train_right, train_size = 0, 0 241 | test_right, test_size = 0, 0 242 | 243 | with torch.no_grad(): 244 | for batch_X, batch_y in train_loader: 245 | input_, label = batch_X.to(device), batch_y.to(device) 246 | pred = torch.sign( 247 | torch.squeeze(net(input_, construct_mask(input_)))) 248 | train_right += (pred.data == label.data).sum() 249 | train_size += batch_X.shape[0] 250 | 251 | for batch_X, batch_y in test_loader: 252 | input_, label = batch_X.to(device), batch_y.to(device) 253 | pred = torch.sign( 254 | torch.squeeze(net(input_, construct_mask(input_)))) 255 | test_right += (pred.data == label.data).sum() 256 | test_size += batch_X.shape[0] 257 | 258 | train_acc, test_acc = train_right / train_size, test_right / test_size 259 | bar.set_postfix( 260 | Loss="{:.6f}".format(loss.item()), 261 | TEST_ACC="{:.4f}".format(test_acc), 262 | TRAIN_ACC="{:.4f}".format(train_acc), 263 | ) 264 | info_list.append([train_acc.item(), test_acc.item()]) 265 | 266 | info_list = np.array(info_list) 267 | 268 | plt.figure(figsize=(5, 3)) 269 | 270 | plt.rcParams['font.sans-serif'] = ['Times New Roman'] 271 | plt.rcParams['mathtext.fontset'] = 'stix' 272 | plt.rcParams['xtick.direction'] = 'in' 273 | plt.rcParams['ytick.direction'] = 'in' 274 | plt.rcParams['axes.linewidth'] = 0.5 275 | 276 | plt.grid(zorder=-10) 277 | plot_kwargs = {'linewidth': 0.7, 'zorder': 10} 278 | 279 | x = np.arange(0, 100, 4) + 2 280 | plt.plot(x, 281 | info_list[::4, 0], 282 | label="Training accuracy", 283 | color='blue', 284 | marker='^', 285 | **plot_kwargs, 286 | linestyle='-') 287 | plt.plot(x, 288 | info_list[::4, 1], 289 | label="Test accuracy", 290 | color='red', 291 | marker='*', 292 | **plot_kwargs, 293 | linestyle='--') 294 | 295 | plt.xlim(0, 100) 296 | plt.ylim(.4, 1) 297 | 298 | plt.yticks([.4, .6, .8, 1], size=13) 299 | plt.xticks([20, 40, 60, 80, 100], size=13) 300 | plt.xlabel("Epochs", size=13) 301 | plt.legend() 302 | plt.tight_layout() 303 | plt.savefig("imgs/transformer.png") 304 | -------------------------------------------------------------------------------- /examples/pydynet/transformer.py: -------------------------------------------------------------------------------- 1 | from os.path import join 2 | from tqdm import tqdm 3 | 4 | import pydynet as pdn 5 | import pydynet.nn as nn 6 | import pydynet.nn.functional as F 7 | from pydynet.optim import Adam 8 | from pydynet.data import data_loader 9 | 10 | import numpy as np 11 | 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | from sklearn.model_selection import train_test_split 15 | 16 | np.random.seed(42) 17 | 18 | path = r'examples/data/CoLA/tokenized' 19 | 20 | 21 | def extract(line: str): 22 | lines = line.split('\t') 23 | y = int(lines[1]) 24 | sentence = lines[-1][:-1] 25 | return sentence.split(), y 26 | 27 | 28 | def load_data(): 29 | 30 | with open(join(path, 'in_domain_train.tsv'), 'r', encoding='utf-8') as f: 31 | lines = f.readlines() 32 | 33 | sens, ys = [], [] 34 | max_len = -1 35 | word_dict = set() 36 | for line in tqdm(lines): 37 | x, y = extract(line) 38 | word_dict = word_dict.union(set(x)) 39 | max_len = max(max_len, len(x)) 40 | sens.append(x) 41 | ys.append(y) 42 | word_dict = list(word_dict) 43 | 44 | X = np.zeros((len(lines), max_len), dtype=int) 45 | for i in tqdm(range(len(lines))): 46 | for j, word in enumerate(sens[i]): 47 | X[i, j] = word_dict.index(word) + 1 48 | y = np.array(ys) 49 | 50 | return X, y 51 | 52 | 53 | class SelfAttention(nn.Module): 54 | 55 | def __init__(self, embed_size, heads): 56 | super(SelfAttention, self).__init__() 57 | self.embed_size = embed_size 58 | self.heads = heads 59 | self.head_dim = embed_size // heads 60 | 61 | assert (self.head_dim * heads == embed_size 62 | ), "Embedding size needs to be divisible by heads" 63 | 64 | self.Q = nn.Linear(self.embed_size, 65 | self.embed_size, 66 | bias=False, 67 | dtype=np.float32) 68 | self.K = nn.Linear(self.embed_size, 69 | self.embed_size, 70 | bias=False, 71 | dtype=np.float32) 72 | self.V = nn.Linear(self.embed_size, 73 | self.embed_size, 74 | bias=False, 75 | dtype=np.float32) 76 | self.O = nn.Linear(self.embed_size, 77 | self.embed_size, 78 | bias=False, 79 | dtype=np.float32) 80 | 81 | def forward(self, values, keys, query, mask): 82 | N = query.shape[0] 83 | value_len, key_len, query_len = values.shape[1], keys.shape[ 84 | 1], query.shape[1] 85 | 86 | xq, xk, xv = ( 87 | self.Q(values).reshape(N, value_len, self.heads, self.head_dim), 88 | self.K(values).reshape(N, key_len, self.heads, self.head_dim), 89 | self.V(values).reshape(N, query_len, self.heads, self.head_dim), 90 | ) 91 | 92 | # Split the embedding into self.heads different pieces 93 | xq, xkT = xq.transpose(0, 2, 1, 3), xk.transpose(0, 2, 3, 1) 94 | attention = xq @ xkT / self.head_dim**.5 95 | 96 | if mask is not None: 97 | mask[mask.eq(1)] = np.float32('-inf') 98 | attention = attention + mask 99 | 100 | attention = F.softmax(attention, axis=-1) 101 | output = attention @ xv.transpose(0, 2, 1, 3) 102 | 103 | output = output.transpose(0, 2, 1, 3).reshape(N, value_len, -1) 104 | return self.O(output) 105 | 106 | 107 | class TransformerBlock(nn.Module): 108 | 109 | def __init__(self, embed_size, heads, dropout, forward_expansion): 110 | super(TransformerBlock, self).__init__() 111 | self.attention = SelfAttention(embed_size, heads) 112 | self.norm1 = nn.LayerNorm(embed_size, dtype=np.float32) 113 | self.norm2 = nn.LayerNorm(embed_size, dtype=np.float32) 114 | 115 | self.feed_forward = nn.Sequential( 116 | nn.Linear(embed_size, 117 | forward_expansion * embed_size, 118 | dtype=np.float32), 119 | nn.ReLU(), 120 | nn.Linear(forward_expansion * embed_size, 121 | embed_size, 122 | dtype=np.float32), 123 | ) 124 | 125 | def forward(self, value, key, query, mask): 126 | attention = self.attention(value, key, query, mask) 127 | x = (self.norm1(attention + query)) 128 | forward = self.feed_forward(x) 129 | out = (self.norm2(forward + x)) 130 | return out 131 | 132 | 133 | def sinusoidal_positional_encoding(max_len: int, d_model: int): 134 | position = np.arange(max_len)[:, np.newaxis] 135 | div_term = np.exp(np.arange(0, d_model, 2) * (-np.log(10000.0) / d_model)) 136 | pe = np.zeros((max_len, d_model)) 137 | pe[:, 0::2] = np.sin(position * div_term) 138 | pe[:, 1::2] = np.cos(position * div_term) 139 | 140 | return pdn.Tensor(pe.astype(np.float32)) 141 | 142 | 143 | @pdn.no_grad() 144 | def construct_mask(x: pdn.Tensor, padding_idx=0): 145 | mask = x.eq(padding_idx) # [batch_size, seq_len] 146 | return pdn.unsqueeze(mask, (1, 2)).astype( 147 | np.float32) # [batch_size, 1, 1, seq_len] 148 | 149 | 150 | class Transformer(nn.Module): 151 | 152 | def __init__( 153 | self, 154 | embed_size, 155 | num_layers, 156 | heads, 157 | forward_expansion, 158 | dropout, 159 | vocab_size, 160 | max_length, 161 | ): 162 | super(Transformer, self).__init__() 163 | self.embed_size = embed_size 164 | self.word_embedding = nn.Embedding( 165 | vocab_size, 166 | embed_size, 167 | padding_idx=0, 168 | dtype=np.float32, 169 | ) 170 | self.position_embedding = nn.Parameter( 171 | sinusoidal_positional_encoding(max_length, embed_size), False) 172 | 173 | self.layers = nn.ModuleList([ 174 | TransformerBlock( 175 | embed_size, 176 | heads, 177 | dropout=dropout, 178 | forward_expansion=forward_expansion, 179 | ) for _ in range(num_layers) 180 | ]) 181 | 182 | self.fc_out = nn.Linear(embed_size, 1, dtype=np.float32) 183 | 184 | def forward(self, x, mask): 185 | a = self.word_embedding(x) 186 | out = a + self.position_embedding 187 | 188 | for layer in self.layers: 189 | out = layer(out, out, out, mask) 190 | 191 | out = out[:, 0, :] 192 | return self.fc_out(out) 193 | 194 | 195 | if __name__ == "__main__": 196 | LR = 5e-4 197 | EPOCHES = 100 198 | TRAIN_BATCH_SIZE = 128 199 | TEST_BATCH_SIZE = 512 200 | use_cuda = True 201 | 202 | device = 'cuda' if pdn.cuda.is_available() and use_cuda else 'cpu' 203 | 204 | X, y = load_data() 205 | y[y == 0] = -1 206 | 207 | train_X, test_X, train_y, test_y = train_test_split( 208 | pdn.Tensor(X), 209 | pdn.Tensor(y), 210 | train_size=0.8, 211 | stratify=y, 212 | shuffle=True, 213 | ) 214 | 215 | ratio_pos = (train_y.mean() + 1) / 2 216 | 217 | train_loader = data_loader( 218 | train_X, 219 | train_y, 220 | shuffle=False, 221 | batch_size=TRAIN_BATCH_SIZE, 222 | ) 223 | test_loader = data_loader( 224 | test_X, 225 | test_y, 226 | shuffle=False, 227 | batch_size=TEST_BATCH_SIZE, 228 | ) 229 | 230 | net = Transformer(512, 1, 4, 3, 0.05, X.max() + 1, 44).to(device) 231 | optimizer = Adam(net.parameters(), lr=LR) 232 | bar = tqdm(range(EPOCHES)) 233 | info_list = [] 234 | for epoch in bar: 235 | 236 | net.train() 237 | 238 | for batch_X, batch_y in train_loader: 239 | input_, label = batch_X.to(device), batch_y.to(device) 240 | output = net(input_, construct_mask(input_)) 241 | weight = pdn.ones(label.shape, dtype=np.float32) 242 | weight[label == -1] = 1 / (1 - ratio_pos) 243 | weight[label == 1] = 1 / ratio_pos 244 | loss = (weight.to(device) * 245 | pdn.log(1 + pdn.exp(-label * pdn.squeeze(output)))).mean() 246 | optimizer.zero_grad() 247 | loss.backward() 248 | optimizer.step() 249 | 250 | net.eval() 251 | train_right, train_size = 0, 0 252 | test_right, test_size = 0, 0 253 | 254 | with pdn.no_grad(): 255 | for batch_X, batch_y in train_loader: 256 | input_, label = batch_X.to(device), batch_y.to(device) 257 | pred = pdn.sign( 258 | pdn.squeeze(net(input_, construct_mask(input_)))) 259 | train_right += (pred.data == label.data).sum() 260 | train_size += batch_X.shape[0] 261 | 262 | for batch_X, batch_y in test_loader: 263 | input_, label = batch_X.to(device), batch_y.to(device) 264 | pred = pdn.sign( 265 | pdn.squeeze(net(input_, construct_mask(input_)))) 266 | test_right += (pred.data == label.data).sum() 267 | test_size += batch_X.shape[0] 268 | 269 | train_acc, test_acc = train_right / train_size, test_right / test_size 270 | bar.set_postfix( 271 | Loss="{:.6f}".format(loss.item()), 272 | TEST_ACC="{:.4f}".format(test_acc), 273 | TRAIN_ACC="{:.4f}".format(train_acc), 274 | ) 275 | info_list.append([train_acc.item(), test_acc.item()]) 276 | 277 | info_list = np.array(info_list) 278 | 279 | plt.figure(figsize=(5, 3)) 280 | 281 | plt.rcParams['font.sans-serif'] = ['Times New Roman'] 282 | plt.rcParams['mathtext.fontset'] = 'stix' 283 | plt.rcParams['xtick.direction'] = 'in' 284 | plt.rcParams['ytick.direction'] = 'in' 285 | plt.rcParams['axes.linewidth'] = 0.5 286 | 287 | plt.grid(zorder=-10) 288 | plot_kwargs = {'linewidth': 0.7, 'zorder': 10} 289 | 290 | x = np.arange(0, 100, 4) + 2 291 | plt.plot(x, 292 | info_list[::4, 0], 293 | label="Training accuracy", 294 | color='blue', 295 | marker='^', 296 | **plot_kwargs, 297 | linestyle='-') 298 | plt.plot(x, 299 | info_list[::4, 1], 300 | label="Test accuracy", 301 | color='red', 302 | marker='*', 303 | **plot_kwargs, 304 | linestyle='--') 305 | 306 | plt.xlim(0, 100) 307 | plt.ylim(.4, 1) 308 | 309 | plt.yticks([.4, .6, .8, 1], size=13) 310 | plt.xticks([20, 40, 60, 80, 100], size=13) 311 | plt.xlabel("Epochs", size=13) 312 | plt.legend() 313 | plt.tight_layout() 314 | plt.savefig("imgs/transformer.png") 315 | -------------------------------------------------------------------------------- /llm/clip/infer.py: -------------------------------------------------------------------------------- 1 | import os, json, urllib, zipfile 2 | import urllib.request 3 | from PIL import Image 4 | 5 | import numpy as np 6 | import pydynet as pdn 7 | import pydynet.nn.functional as F 8 | 9 | from .tokenizer import SimpleTokenizer 10 | from .model import CLIP 11 | 12 | 13 | def download(url: str, filename: str, chunk_size: int = 10**6) -> None: 14 | # Create directories if they don't exist yet 15 | directories = os.path.dirname(filename) 16 | if directories: 17 | os.makedirs(directories, exist_ok=True) 18 | 19 | # Download the file 20 | with urllib.request.urlopen(url) as response: 21 | total = int(response.info()["Content-Length"]) 22 | 23 | buf = b"" 24 | while True: 25 | data = response.read(chunk_size) 26 | if not data: 27 | break 28 | buf += data 29 | print(f"Downloading {filename} {len(buf) / total * 100:.2f} %") 30 | 31 | # Write the downloaded data to the file 32 | with open(filename, "wb") as f: 33 | f.write(buf) 34 | 35 | 36 | def load_zip(path: str): 37 | files = {} 38 | 39 | with zipfile.ZipFile(path) as z: 40 | for file_info in z.infolist(): 41 | with z.open(file_info) as f: 42 | path = file_info.filename 43 | files[path] = f.read() 44 | 45 | return files 46 | 47 | 48 | class Params: 49 | 50 | def __init__(self, name: str, download_root: str = None) -> None: 51 | assert name == "ViT-B/32", f"Model {name} not supported yet. Only ViT-B-32 currently supported." 52 | 53 | model_urls = { 54 | "RN50": 55 | "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt", 56 | "RN101": 57 | "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt", 58 | "RN50x4": 59 | "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt", 60 | "RN50x16": 61 | "https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt", 62 | "RN50x64": 63 | "https://openaipublic.azureedge.net/clip/models/be1cfb55d75a9666199fb2206c106743da0f6468c9d327f3e0d0a543a9919d9c/RN50x64.pt", 64 | "ViT-B/32": 65 | "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt", 66 | "ViT-B/16": 67 | "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt", 68 | "ViT-L/14": 69 | "https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt", 70 | "ViT-L/14@336px": 71 | "https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt", 72 | } 73 | 74 | model_url = model_urls[name] 75 | 76 | name = name.replace("/", "-") 77 | 78 | if download_root is None: 79 | download_root = os.path.expanduser(f"~/.cache/clip") 80 | download_root = os.environ.get("CLIP_DIR", download_root) 81 | 82 | model_path = os.path.join(download_root, f"{name}.pt") 83 | 84 | if not os.path.isfile(model_path): 85 | print(f"Downloading {model_path} from {model_url}") 86 | download(model_url, model_path) 87 | 88 | self.files = load_zip(model_path) 89 | 90 | with open(f"{download_root}/{name}.json") as f: 91 | self.info = json.load(f) 92 | 93 | def get_int(self, name: str) -> int: 94 | info = self.info[name] 95 | 96 | value: int = info["value"] 97 | 98 | return value 99 | 100 | def __getitem__(self, name: str): 101 | info = self.info[name] 102 | 103 | path = info["path"] 104 | dtype = info["dtype"] 105 | shape = info["shape"] 106 | start = info["start"] 107 | end = info["end"] 108 | 109 | assert dtype in ["float16", "float32"] 110 | 111 | data = self.files[path][start:end] 112 | 113 | arr = np.frombuffer(data, dtype=dtype).reshape(shape) 114 | arr = arr.astype(np.float32) 115 | 116 | return arr 117 | 118 | 119 | def tokenize(texts: list[str], context_length: int = 77): 120 | tokenizer = SimpleTokenizer() 121 | 122 | sot_token = tokenizer.encoder["<|startoftext|>"] 123 | eot_token = tokenizer.encoder["<|endoftext|>"] 124 | 125 | all_tokens = [[sot_token] + tokenizer.encode(text) + [eot_token] 126 | for text in texts] 127 | 128 | result = np.zeros((len(all_tokens), context_length), dtype=np.int64) 129 | 130 | for i, tokens in enumerate(all_tokens): 131 | if len(tokens) > context_length: 132 | raise RuntimeError( 133 | f"Input {texts[i]} is too long for context length {context_length}" 134 | ) 135 | 136 | result[i, :len(tokens)] = tokens 137 | 138 | return result 139 | 140 | 141 | def preprocess(image: Image.Image, image_size: int = 224): 142 | # Scale image such that length of smaller side is 224 143 | width, height = image.size 144 | scale = image_size / min(width, height) 145 | width = int(scale * width) 146 | height = int(scale * height) 147 | # Some Pillow versions have different interface 148 | if hasattr(Image, "Resampling"): 149 | image = image.resize((width, height), Image.Resampling.BICUBIC) 150 | else: 151 | image = image.resize((width, height), Image.BICUBIC) 152 | 153 | # Crop center 154 | x0 = round((width - image_size) / 2) 155 | y0 = round((height - image_size) / 2) 156 | x1 = x0 + image_size 157 | y1 = y0 + image_size 158 | image = image.crop((x0, y0, x1, y1)) 159 | 160 | image = image.convert("RGB") 161 | 162 | # Normalize 163 | x = np.array(image, dtype=np.float32) / 255.0 164 | mean = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32) 165 | std = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32) 166 | x = (x - mean) / std 167 | 168 | x = x.transpose(2, 0, 1) 169 | 170 | return pdn.Tensor(x, copy=None) 171 | 172 | 173 | @pdn.no_grad() 174 | def load_model(model: CLIP, param: Params): 175 | 176 | # with pdn.no_grad(): 177 | model.scale = pdn.exp(param["logit_scale"].astype(np.float32)) 178 | model.class_embed.data[0, 0] = param["visual.class_embedding"] 179 | model.v_pos_emb.data[...] = param["visual.positional_embedding"] 180 | model.t_pos_emb.data[...] = param["positional_embedding"] 181 | 182 | model.image_encoder.kernel.data[...] = param["visual.conv1.weight"] 183 | model.image_encoder.pre_norm.scale[...] = param["visual.ln_pre.weight"] 184 | model.image_encoder.pre_norm.shift[...] = param["visual.ln_pre.bias"] 185 | model.image_encoder.post_norm.scale[...] = param["visual.ln_post.weight"] 186 | model.image_encoder.post_norm.shift[...] = param["visual.ln_post.bias"] 187 | 188 | model.image_encoder.proj.weight[...] = param["visual.proj"] 189 | 190 | model.text_encoder.token_embed.weight[ 191 | ...] = param["token_embedding.weight"] 192 | model.text_encoder.post_norm.scale[...] = param["ln_final.weight"] 193 | model.text_encoder.post_norm.shift[...] = param["ln_final.bias"] 194 | model.text_encoder.proj.weight[...] = param["text_projection"] 195 | 196 | prefix = "transformer.resblocks." 197 | for i in range(12): 198 | ( 199 | model.image_encoder.transformers[i].mha.QKV.weight.data[...], 200 | model.image_encoder.transformers[i].mha.QKV.bias.data[...], 201 | model.image_encoder.transformers[i].mha.O.weight.data[...], 202 | model.image_encoder.transformers[i].mha.O.bias.data[...], 203 | model.image_encoder.transformers[i].layer_norm1.scale.data[...], 204 | model.image_encoder.transformers[i].layer_norm1.shift.data[...], 205 | model.image_encoder.transformers[i].layer_norm2.scale.data[...], 206 | model.image_encoder.transformers[i].layer_norm2.shift.data[...], 207 | model.image_encoder.transformers[i].mlp.fc1.weight.data[...], 208 | model.image_encoder.transformers[i].mlp.fc1.bias.data[...], 209 | model.image_encoder.transformers[i].mlp.fc2.weight.data[...], 210 | model.image_encoder.transformers[i].mlp.fc2.bias.data[...], 211 | model.text_encoder.transformers[i].mha.QKV.weight.data[...], 212 | model.text_encoder.transformers[i].mha.QKV.bias.data[...], 213 | model.text_encoder.transformers[i].mha.O.weight.data[...], 214 | model.text_encoder.transformers[i].mha.O.bias.data[...], 215 | model.text_encoder.transformers[i].layer_norm1.scale.data[...], 216 | model.text_encoder.transformers[i].layer_norm1.shift.data[...], 217 | model.text_encoder.transformers[i].layer_norm2.scale.data[...], 218 | model.text_encoder.transformers[i].layer_norm2.shift.data[...], 219 | model.text_encoder.transformers[i].mlp.fc1.weight.data[...], 220 | model.text_encoder.transformers[i].mlp.fc1.bias.data[...], 221 | model.text_encoder.transformers[i].mlp.fc2.weight.data[...], 222 | model.text_encoder.transformers[i].mlp.fc2.bias.data[...], 223 | ) = ( 224 | param["visual." + prefix + f"{i}.attn.in_proj_weight"].T, 225 | param["visual." + prefix + f"{i}.attn.in_proj_bias"], 226 | param["visual." + prefix + f"{i}.attn.out_proj.weight"].T, 227 | param["visual." + prefix + f"{i}.attn.out_proj.bias"], 228 | param["visual." + prefix + f"{i}.ln_1.weight"], 229 | param["visual." + prefix + f"{i}.ln_1.bias"], 230 | param["visual." + prefix + f"{i}.ln_2.weight"], 231 | param["visual." + prefix + f"{i}.ln_2.bias"], 232 | param["visual." + prefix + f"{i}.mlp.c_fc.weight"].T, 233 | param["visual." + prefix + f"{i}.mlp.c_fc.bias"], 234 | param["visual." + prefix + f"{i}.mlp.c_proj.weight"].T, 235 | param["visual." + prefix + f"{i}.mlp.c_proj.bias"], 236 | param[prefix + f"{i}.attn.in_proj_weight"].T, 237 | param[prefix + f"{i}.attn.in_proj_bias"], 238 | param[prefix + f"{i}.attn.out_proj.weight"].T, 239 | param[prefix + f"{i}.attn.out_proj.bias"], 240 | param[prefix + f"{i}.ln_1.weight"], 241 | param[prefix + f"{i}.ln_1.bias"], 242 | param[prefix + f"{i}.ln_2.weight"], 243 | param[prefix + f"{i}.ln_2.bias"], 244 | param[prefix + f"{i}.mlp.c_fc.weight"].T, 245 | param[prefix + f"{i}.mlp.c_fc.bias"], 246 | param[prefix + f"{i}.mlp.c_proj.weight"].T, 247 | param[prefix + f"{i}.mlp.c_proj.bias"], 248 | ) 249 | return model 250 | 251 | 252 | image = preprocess(Image.open("llm/clip/picture.png"))[np.newaxis, :, :, :] 253 | text = tokenize(["a fish", "a dog", "a cat"]) 254 | clip = load_model(CLIP(), Params("ViT-B/32", download_root='llm/clip/data')) 255 | 256 | with pdn.no_grad(): 257 | logits_per_image = clip(image, text) 258 | probs = F.softmax(logits_per_image, axis=-1) 259 | print("Label probs:", probs.numpy()[0]) 260 | -------------------------------------------------------------------------------- /pydynet/nn/functional.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from ..core import tensor, function 4 | from .. import unsqueeze, no_grad 5 | 6 | 7 | def linear(x: tensor.Tensor, weight: tensor.Tensor, bias: tensor.Tensor): 8 | affine = x @ weight 9 | if bias is not None: 10 | affine = affine + bias 11 | return affine 12 | 13 | 14 | def embedding(x: tensor.Tensor, weight: tensor.Tensor, padding_idx: int): 15 | query = weight[x] 16 | if padding_idx is not None: 17 | with tensor.no_grad(): 18 | mask = unsqueeze(x.ne(padding_idx), -1) 19 | query = query * mask 20 | return query 21 | 22 | 23 | def sigmoid(x: tensor.Tensor): 24 | return tensor.sigmoid(x) 25 | 26 | 27 | def tanh(x: tensor.Tensor): 28 | return tensor.tanh(x) 29 | 30 | 31 | def relu(x: tensor.Tensor): 32 | return tensor.maximum(0., x) 33 | 34 | 35 | def leaky_relu(x: tensor.Tensor, alpha: float): 36 | return tensor.maximum(x, alpha * x) 37 | 38 | 39 | def silu(x: tensor.Tensor): 40 | return x / (1 + tensor.exp(-x)) 41 | 42 | 43 | def softmax(x: tensor.Tensor, axis=None): 44 | '''Softmax函数''' 45 | with no_grad(): 46 | max_ = x.max(axis, keepdims=True) 47 | x_sub_max = x - max_ 48 | exp_ = tensor.exp(x_sub_max) 49 | return exp_ / tensor.sum(exp_, axis=axis, keepdims=True) 50 | 51 | 52 | def log_softmax(x: tensor.Tensor, axis=None, keepdims=False): 53 | '''log-softmax函数''' 54 | with no_grad(): 55 | max_ = x.max(axis, keepdims=True) 56 | x_sub_max = x - max_ 57 | return x_sub_max - tensor.log( 58 | tensor.sum(tensor.exp(x_sub_max), axis=axis, keepdims=keepdims)) 59 | 60 | 61 | class __im2col1d(tensor._UnaryOperator): 62 | 63 | def __init__( 64 | self, 65 | x: tensor.Tensor, 66 | kernel_size: int, 67 | stride: int, 68 | ) -> None: 69 | self.N, self.in_channels, self.n_features = x.shape 70 | self.kernel_size = kernel_size 71 | self.stride = stride 72 | self.n_output = (self.n_features - self.kernel_size) // stride + 1 73 | super().__init__(x) 74 | 75 | def forward_(self, x: tensor.Tensor) -> np.ndarray: 76 | s0, s1, s2 = x.strides 77 | shape = (x.shape[0], self.in_channels, self.kernel_size, self.n_output) 78 | self.__strides = (s0, s1, s2, s2 * self.stride) 79 | 80 | col = self.xp.lib.stride_tricks.as_strided( 81 | x.data, 82 | shape=shape, 83 | strides=self.__strides, 84 | ).copy() 85 | return col 86 | 87 | def grad_fn(self, x: tensor.Tensor, grad: np.ndarray) -> np.ndarray: 88 | grad_x = self.xp.zeros(x.shape, dtype=self.dtype) 89 | view = self.xp.lib.stride_tricks.as_strided( 90 | grad_x, 91 | shape=self.shape, 92 | strides=self.__strides, 93 | ) 94 | self.xp.add.at(view, (..., ), grad) 95 | return grad_x 96 | 97 | 98 | class __pad1d(tensor._UnaryOperator): 99 | 100 | def __init__(self, x: tensor.Tensor, pad_width=0) -> None: 101 | self.pad_width = pad_width 102 | super().__init__(x) 103 | 104 | def forward_(self, x: tensor.Tensor) -> np.ndarray: 105 | return self.xp.pad(x.data, [(0, 0), (0, 0), 106 | (self.pad_width, self.pad_width)], 107 | 'constant') 108 | 109 | def grad_fn(self, x: tensor.Tensor, grad: np.ndarray) -> np.ndarray: 110 | if self.pad_width == 0: 111 | return grad[...] 112 | return grad[..., self.pad_width:-self.pad_width] 113 | 114 | 115 | def conv1d( 116 | x: tensor.Tensor, 117 | kernel: tensor.Tensor, 118 | padding: int = 0, 119 | stride: int = 1, 120 | ): 121 | '''一维卷积函数 122 | 123 | 基于im2col实现的一维卷积. 124 | 125 | Parameters 126 | ---------- 127 | x : Tensor 128 | 输入数据, 形状为(N, in_channels, n_features); 129 | kernel : Tensor 130 | 卷积核, 形状为(out_channels, in_channels, kernel_size); 131 | padding : int, default=0 132 | 对输入特征两边补0数量; 133 | stride : int, default=1 134 | 卷积步长. 135 | ''' 136 | kernel_size = kernel.shape[-1] 137 | pad_x = __pad1d(x, padding) 138 | col = __im2col1d(pad_x, kernel_size, stride) 139 | return (col @ kernel.transpose(1, 2, 0)).sum(1).swapaxes(1, 2) 140 | 141 | 142 | def max_pool1d( 143 | x: tensor.Tensor, 144 | kernel_size: int, 145 | stride: int, 146 | padding: int = 0, 147 | ): 148 | '''一维池化函数 149 | 150 | 基于im2col实现的一维池化.` 151 | 152 | Parameters 153 | ---------- 154 | x : Tensor 155 | 输入数据, 形状为(N, in_channels, n_features); 156 | kernel_size : int 157 | 池化核大小; 158 | stride : int 159 | 卷积步长; 160 | padding : int, default=0 161 | 对输入特征两边补0数量. 162 | ''' 163 | pad_x = __pad1d(x, padding) 164 | col = __im2col1d(pad_x, kernel_size, stride) 165 | return col.max(-1) 166 | 167 | 168 | def avg_pool1d( 169 | x: tensor.Tensor, 170 | kernel_size: int, 171 | stride: int, 172 | padding: int = 0, 173 | ): 174 | '''一维平均池化函数 175 | 176 | 基于im2col实现的一维池化.` 177 | 178 | Parameters 179 | ---------- 180 | x : Tensor 181 | 输入数据, 形状为(N, in_channels, n_features); 182 | kernel_size : int 183 | 池化核大小; 184 | stride : int 185 | 卷积步长; 186 | padding : int, default=0 187 | 对输入特征两边补0数量. 188 | ''' 189 | pad_x = __pad1d(x, padding) 190 | col = __im2col1d(pad_x, kernel_size, stride) 191 | return col.mean(-1) 192 | 193 | 194 | class __im2col2d(tensor._UnaryOperator): 195 | 196 | def __init__( 197 | self, 198 | x: tensor.Tensor, 199 | kernel_size: int, 200 | stride: int, 201 | ) -> None: 202 | _, self.in_channels, self.n_h, self.n_w = x.shape 203 | self.kernel_size = kernel_size 204 | self.stride = stride 205 | self.out_h, self.out_w = ( 206 | self.n_h - self.kernel_size) // self.stride + 1, ( 207 | self.n_w - self.kernel_size) // self.stride + 1 208 | 209 | super().__init__(x) 210 | 211 | def forward_(self, x: tensor.Tensor) -> np.ndarray: 212 | s0, s1, s2, s3 = x.strides 213 | shape = (x.shape[0], self.in_channels, self.kernel_size, 214 | self.kernel_size, self.out_h, self.out_w) 215 | self.__strides = (s0, s1, s2, s3, s2 * self.stride, s3 * self.stride) 216 | 217 | col = self.xp.lib.stride_tricks.as_strided( 218 | x.data, 219 | shape=shape, 220 | strides=self.__strides, 221 | ).copy() 222 | return col 223 | 224 | def grad_fn(self, x: tensor.Tensor, grad: np.ndarray) -> np.ndarray: 225 | grad_x = self.xp.zeros(x.shape, dtype=self.dtype) 226 | view = self.xp.lib.stride_tricks.as_strided( 227 | grad_x, 228 | shape=self.shape, 229 | strides=self.__strides, 230 | ) 231 | self.xp.add.at(view, (..., ), grad) 232 | return grad_x 233 | 234 | 235 | class __pad2d(tensor._UnaryOperator): 236 | 237 | def __init__(self, x: tensor.Tensor, pad_width=0) -> None: 238 | self.pad_width = pad_width 239 | super().__init__(x) 240 | 241 | def forward_(self, x: tensor.Tensor) -> np.ndarray: 242 | return self.xp.pad(x.data, [(0, 0), (0, 0), 243 | (self.pad_width, self.pad_width), 244 | (self.pad_width, self.pad_width)], 245 | 'constant') 246 | 247 | def grad_fn(self, x: tensor.Tensor, grad: np.ndarray) -> np.ndarray: 248 | if self.pad_width == 0: 249 | return grad[...] 250 | return grad[..., self.pad_width:-self.pad_width, 251 | self.pad_width:-self.pad_width] 252 | 253 | 254 | def conv2d(x: tensor.Tensor, 255 | kernel: tensor.Tensor, 256 | padding: int = 0, 257 | stride: int = 1): 258 | '''二维卷积函数 259 | 260 | 基于im2col实现的二维卷积. 为了实现上的方便, 我们不考虑长宽不同的卷积核, 步长和补零。 261 | 262 | Parameters 263 | ---------- 264 | x : Tensor 265 | 输入数据, 形状为(N, in_channels, n_height, n_width); 266 | kernel : Tensor 267 | 卷积核, 形状为(out_channels, in_channels, kernel_height, kernel_width); 268 | padding : int, default=0 269 | 对输入图片周围补0数量; 270 | stride : int, default=1 271 | 卷积步长. 272 | ''' 273 | N, _, _, _ = x.shape 274 | out_channels, _, kernel_size, _ = kernel.shape 275 | pad_x = __pad2d(x, padding) 276 | col = __im2col2d(pad_x, kernel_size, stride) 277 | out_h, out_w = col.shape[-2:] 278 | col = col.transpose(0, 4, 5, 1, 2, 3).reshape(N * out_h * out_w, -1) 279 | col_filter = kernel.reshape(out_channels, -1).T 280 | out = col @ col_filter 281 | return out.reshape(N, out_h, out_w, -1).transpose(0, 3, 1, 2) 282 | 283 | 284 | def max_pool2d(x: tensor.Tensor, kernel_size: int, stride: int, padding=0): 285 | '''二维卷积函数池化 286 | 287 | 基于im2col实现的二维卷积. 为了实现上的方便, 我们不考虑长宽不同的kernel_size, 步长和补零。 288 | 289 | Parameters 290 | ---------- 291 | x : Tensor 292 | 输入数据, 形状为(N, in_channels, n_height, n_width); 293 | kernel_size : int 294 | 池化核尺寸; 295 | stride : int, default=1 296 | 卷积步长; 297 | padding : int, default=0 298 | 对输入图片周围补0数量; 299 | ''' 300 | N, in_channels, _, _ = x.shape 301 | pad_x = __pad2d(x, padding) 302 | col = __im2col2d(pad_x, kernel_size, stride) 303 | out_h, out_w = col.shape[-2:] 304 | col = col.transpose(0, 4, 5, 1, 2, 3).reshape( 305 | -1, 306 | kernel_size * kernel_size, 307 | ) 308 | out = col.max(1) 309 | out = out.reshape(N, out_h, out_w, in_channels).transpose(0, 3, 1, 2) 310 | return out 311 | 312 | 313 | def avg_pool2d(x: tensor.Tensor, kernel_size: int, stride: int, padding=0): 314 | '''二维平均池化 315 | 316 | 基于im2col实现的二维池化. 为了实现上的方便, 我们不考虑长宽不同的kernel_size, 步长和补零。 317 | 318 | Parameters 319 | ---------- 320 | x : Tensor 321 | 输入数据, 形状为(N, in_channels, n_height, n_width); 322 | kernel_size : int 323 | 池化核尺寸; 324 | stride : int, default=1 325 | 卷积步长; 326 | padding : int, default=0 327 | 对输入图片周围补0数量; 328 | ''' 329 | N, in_channels, _, _ = x.shape 330 | pad_x = __pad2d(x, padding) 331 | col = __im2col2d(pad_x, kernel_size, stride) 332 | out_h, out_w = col.shape[-2:] 333 | col = col.transpose(0, 4, 5, 1, 2, 3).reshape( 334 | -1, 335 | kernel_size * kernel_size, 336 | ) 337 | out = col.mean(1) 338 | out = out.reshape(N, out_h, out_w, in_channels).transpose(0, 3, 1, 2) 339 | return out 340 | 341 | 342 | def mse_loss(y_pred, y_true, reduction='mean'): 343 | '''均方误差''' 344 | square_sum = function.square(y_pred - y_true) 345 | if reduction == 'mean': 346 | return tensor.mean(square_sum) 347 | elif reduction == 'sum': 348 | return tensor.sum(square_sum) 349 | else: 350 | raise ValueError("reduction must be mean or sum.") 351 | 352 | 353 | def nll_loss(y_pred, y_true, reduction='mean'): 354 | '''负对数似然''' 355 | nll = -y_pred * y_true 356 | if reduction == 'mean': 357 | return tensor.mean(nll) 358 | elif reduction == 'sum': 359 | return tensor.sum(nll) 360 | else: 361 | raise ValueError("reduction must be mean or sum.") 362 | 363 | 364 | def cross_entropy_loss(y_pred, y_true, reduction='mean'): 365 | '''交叉熵损失''' 366 | update_y_pred = y_pred - y_pred.max().item() 367 | log_sum_exp = tensor.log( 368 | tensor.sum(tensor.exp(update_y_pred), 1, keepdims=True)) 369 | 370 | neg_log_sm = log_sum_exp - update_y_pred 371 | if y_true.ndim == 1: 372 | nll = neg_log_sm[range(len(neg_log_sm)), y_true] 373 | else: 374 | nll = neg_log_sm * y_true 375 | 376 | if reduction == 'mean': 377 | return tensor.mean(nll) 378 | elif reduction == 'sum': 379 | return tensor.sum(nll) 380 | else: 381 | raise ValueError("reduction must be mean or sum.") 382 | -------------------------------------------------------------------------------- /pydynet/nn/modules/rnn.py: -------------------------------------------------------------------------------- 1 | from .module import Module 2 | from .. import init 3 | from .. import functional as F 4 | from ..parameter import Parameter 5 | from ...special import empty, zeros 6 | from ... import core 7 | from ...cuda import Device 8 | 9 | from typing import Literal, Optional, Tuple, List 10 | import math 11 | 12 | 13 | class RNNCell(Module): 14 | 15 | def __init__( 16 | self, 17 | input_size: int, 18 | hidden_size: int, 19 | bias: bool = True, 20 | nonlinearity: Literal['tanh', 'relu'] = 'tanh', 21 | device=None, 22 | dtype=None, 23 | ) -> None: 24 | super().__init__() 25 | self.input_size = input_size 26 | self.hidden_size = hidden_size 27 | self.kwargs = {"device": Device(device), "dtype": dtype} 28 | self.nonlinearity = nonlinearity 29 | self.fn = {'tanh': F.tanh, 'relu': F.relu}[nonlinearity] 30 | 31 | self.Wx = Parameter(empty((input_size, hidden_size), **self.kwargs)) 32 | self.Wh = Parameter(empty((hidden_size, hidden_size), **self.kwargs)) 33 | if bias: 34 | self.bias = Parameter(empty(self.hidden_size, **self.kwargs)) 35 | self.has_bias = bias 36 | self.reset_paramters() 37 | 38 | def forward(self, x, h=None): 39 | if h is None: 40 | h = self.init_hidden(x) 41 | else: 42 | assert (x.ndim == 1 and h.shape == (self.hidden_size, )) or ( 43 | x.ndim == 2 and h.shape 44 | == (x.shape[0], self.hidden_size)), "Wrong hidden state input!" 45 | 46 | lin = x @ self.Wx + h @ self.Wh 47 | if self.has_bias: 48 | lin = lin + self.bias 49 | return self.fn(lin) 50 | 51 | def reset_paramters(self): 52 | bound = math.sqrt(1 / self.hidden_size) 53 | init.uniform_(self.Wx, -bound, bound) 54 | init.uniform_(self.Wh, -bound, bound) 55 | if self.has_bias: 56 | init.uniform_(self.bias, -bound, bound) 57 | 58 | def init_hidden(self, x): 59 | assert x.ndim in {1, 2} 60 | if x.ndim == 1: 61 | return zeros(self.hidden_size, **self.kwargs) 62 | else: 63 | batch_size = x.shape[0] 64 | return zeros((batch_size, self.hidden_size), **self.kwargs) 65 | 66 | def __repr__(self) -> str: 67 | return "{}({}, {}, bias={}, nonlinearity={})".format( 68 | self.__class__.__name__, 69 | self.input_size, 70 | self.hidden_size, 71 | self.has_bias, 72 | self.nonlinearity, 73 | ) 74 | 75 | def move(self, device): 76 | self.kwargs['device'] = device 77 | return super().move(device) 78 | 79 | 80 | class RNN(Module): 81 | 82 | def __init__( 83 | self, 84 | input_size: int, 85 | hidden_size: int, 86 | num_layers: int = 1, 87 | nonlinearity: Literal['tanh', 'relu'] = 'tanh', 88 | bias: bool = True, 89 | batch_first: bool = False, 90 | bidirectional: bool = False, 91 | device=None, 92 | dtype=None, 93 | ) -> None: 94 | super().__init__() 95 | self.input_size = input_size 96 | self.hidden_size = hidden_size 97 | self.num_layers = num_layers 98 | self.nonlinearity = nonlinearity 99 | self.has_bias = bias 100 | self.batch_first = batch_first 101 | self.bidirectional = bidirectional 102 | self.kwargs = {"device": Device(device), "dtype": dtype} 103 | 104 | assert num_layers > 0 105 | size_list = [input_size] + [hidden_size] * (num_layers - 1) 106 | self.RNNCells: List[RNNCell] = [] 107 | for i in range(num_layers): 108 | cell = RNNCell( 109 | size_list[i], 110 | hidden_size, 111 | bias, 112 | nonlinearity, 113 | **self.kwargs, 114 | ) 115 | setattr(self, 'rnn_{}'.format(i), cell) 116 | self.RNNCells.append(cell) 117 | if self.bidirectional: 118 | self.rRNNCells: List[RNNCell] = [] 119 | for i in range(num_layers): 120 | cell = RNNCell( 121 | size_list[i], 122 | hidden_size, 123 | bias, 124 | nonlinearity, 125 | **self.kwargs, 126 | ) 127 | setattr(self, 'rrnn_{}'.format(i), cell) 128 | self.rRNNCells.append(cell) 129 | 130 | def forward(self, x, h=None): 131 | if self.batch_first and x.ndim == 3: 132 | x = x.swapaxes(0, 1) 133 | 134 | if h is None: 135 | h = self.init_hidden(x) 136 | else: 137 | d = 2 if self.bidirectional else 1 138 | assert (x.ndim == 2 139 | and h.shape == (d * self.num_layers, self.hidden_size) 140 | ) or (x.ndim == 3 and h.shape 141 | == (d * self.num_layers, x.shape[1], 142 | self.hidden_size)), "Wrong hidden state input!" 143 | 144 | if self.num_layers == 1 and not self.bidirectional: 145 | h_list = self.cell_forward(self.RNNCells[0], x, h[0]) 146 | output = core.concat(h_list) 147 | hn = h_list[-1] 148 | 149 | elif self.num_layers == 1 and self.bidirectional: 150 | h_list = self.cell_forward(self.RNNCells[0], x, h[0]) 151 | hr_list = self.cell_forward(self.rRNNCells[0], x[::-1], h[1]) 152 | output = core.concat( 153 | [core.concat(h_list), 154 | core.concat(hr_list[::-1])], 155 | axis=-1, 156 | ) 157 | hn = core.concat([h_list[-1], hr_list[-1]]) 158 | 159 | elif self.num_layers > 1 and not self.bidirectional: 160 | hn_list = [] 161 | for i in range(self.num_layers): 162 | h_list = self.cell_forward( 163 | self.RNNCells[i], 164 | x if i == 0 else core.concat(h_list), 165 | h[i], 166 | ) 167 | hn_list.append(h_list[-1]) 168 | output = core.concat(h_list) 169 | hn = core.concat(hn_list) 170 | 171 | else: 172 | hn_list = [] 173 | hrn_list = [] 174 | for i in range(self.num_layers): 175 | h_list = self.cell_forward( 176 | self.RNNCells[i], 177 | x if i == 0 else core.concat(h_list), 178 | h[i], 179 | ) 180 | hr_list = self.cell_forward( 181 | self.rRNNCells[i], 182 | x[::-1] if i == 0 else core.concat(hr_list), 183 | h[i + self.num_layers], 184 | ) 185 | hn_list.append(h_list[-1]) 186 | hrn_list.append(hr_list[-1]) 187 | output = core.concat( 188 | [core.concat(h_list), 189 | core.concat(hr_list[::-1])], axis=-1) 190 | hn = core.concat(hn_list + hrn_list) 191 | 192 | if self.batch_first and x.ndim == 3: 193 | output = output.swapaxes(0, 1) 194 | hn = hn.swapaxes(0, 1) 195 | return output, hn 196 | 197 | def reset_parameters(self): 198 | for i in range(self.num_layers): 199 | self.RNNCells[i].reset_paramters() 200 | if self.bidirectional: 201 | for i in range(self.num_layers): 202 | self.rRNNCells[i].reset_paramters() 203 | 204 | def init_hidden(self, x): 205 | assert x.ndim in {2, 3} 206 | d = 2 if self.bidirectional else 1 207 | if x.ndim == 2: 208 | return zeros( 209 | (d * self.num_layers, self.hidden_size), 210 | **self.kwargs, 211 | ) 212 | else: 213 | batch_size = x.shape[1] 214 | return zeros( 215 | (d * self.num_layers, batch_size, self.hidden_size), 216 | **self.kwargs, 217 | ) 218 | 219 | def cell_forward(self, cell: RNNCell, x, h): 220 | seq_len = x.shape[0] 221 | h_list = [] 222 | for i in range(seq_len): 223 | h = cell(x[i], h) 224 | h_list.append(core.unsqueeze(h, axis=0)) 225 | return h_list 226 | 227 | def __repr__(self) -> str: 228 | return "{}({}, {}, num_layers={}, nonlinearity={}, bias={}, batch_first={}, bidirectional={})".format( 229 | self.__class__.__name__, 230 | self.input_size, 231 | self.hidden_size, 232 | self.num_layers, 233 | self.nonlinearity, 234 | self.has_bias, 235 | self.batch_first, 236 | self.bidirectional, 237 | ) 238 | 239 | def move(self, device): 240 | self.kwargs['device'] = device 241 | return super().move(device) 242 | 243 | 244 | class LSTMCell(Module): 245 | 246 | def __init__( 247 | self, 248 | input_size: int, 249 | hidden_size: int, 250 | bias: bool = True, 251 | device=None, 252 | dtype=None, 253 | ) -> None: 254 | super().__init__() 255 | self.input_size = input_size 256 | self.hidden_size = hidden_size 257 | self.kwargs = {"device": Device(device), "dtype": dtype} 258 | 259 | self.Wx = Parameter(empty((input_size, 4 * hidden_size), 260 | **self.kwargs)) 261 | self.Wh = Parameter( 262 | empty((hidden_size, 4 * hidden_size), **self.kwargs)) 263 | if bias: 264 | self.bias = Parameter(empty(4 * self.hidden_size, **self.kwargs)) 265 | self.has_bias = bias 266 | self.reset_paramters() 267 | 268 | def forward(self, x, hx: Optional[Tuple] = None): 269 | if hx is None: 270 | h = self.init_hidden(x) 271 | c = self.init_hidden(x) 272 | else: 273 | h, c = hx 274 | assert (x.ndim == 1 and h.shape == (self.hidden_size, )) or ( 275 | x.ndim == 2 and h.shape 276 | == (x.shape[0], self.hidden_size)), "Wrong hidden state input!" 277 | assert (x.ndim == 1 and c.shape == (self.hidden_size, )) or ( 278 | x.ndim == 2 and c.shape 279 | == (x.shape[0], self.hidden_size)), "Wrong cell state input!" 280 | lin = x @ self.Wx + h @ self.Wh 281 | if self.has_bias: 282 | lin = lin + self.bias 283 | fio, g = core.hsplit(lin, [3 * self.hidden_size]) 284 | sig_fio, tanh_g = F.sigmoid(fio), F.tanh(g) 285 | f, i, o = core.hsplit(sig_fio, 3) 286 | c = f * c + i * tanh_g 287 | h = o * F.tanh(c) 288 | return h, c 289 | 290 | def init_hidden(self, x): 291 | assert x.ndim in {1, 2} 292 | if x.ndim == 1: 293 | return zeros(self.hidden_size, **self.kwargs) 294 | else: 295 | batch_size = x.shape[0] 296 | return zeros((batch_size, self.hidden_size), **self.kwargs) 297 | 298 | def reset_paramters(self): 299 | bound = math.sqrt(1 / self.hidden_size) 300 | init.uniform_(self.Wx, -bound, bound) 301 | init.uniform_(self.Wh, -bound, bound) 302 | if self.has_bias: 303 | init.uniform_(self.bias, -bound, bound) 304 | 305 | def __repr__(self) -> str: 306 | return "{}({}, {}, bias={})".format( 307 | self.__class__.__name__, 308 | self.input_size, 309 | self.hidden_size, 310 | self.has_bias, 311 | ) 312 | 313 | def move(self, device): 314 | self.kwargs['device'] = device 315 | return super().move(device) 316 | 317 | 318 | class LSTM(Module): 319 | 320 | def __init__( 321 | self, 322 | input_size: int, 323 | hidden_size: int, 324 | num_layers: int = 1, 325 | bias: bool = True, 326 | batch_first: bool = False, 327 | bidirectional: bool = False, 328 | device=None, 329 | dtype=None, 330 | ) -> None: 331 | super().__init__() 332 | self.input_size = input_size 333 | self.hidden_size = hidden_size 334 | self.num_layers = num_layers 335 | self.has_bias = bias 336 | self.batch_first = batch_first 337 | self.bidirectional = bidirectional 338 | self.kwargs = {"device": Device(device), "dtype": dtype} 339 | 340 | assert num_layers > 0 341 | size_list = [input_size] + [hidden_size] * (num_layers - 1) 342 | self.LSTMCells: List[LSTMCell] = [] 343 | for i in range(num_layers): 344 | cell = LSTMCell( 345 | size_list[i], 346 | hidden_size, 347 | bias, 348 | **self.kwargs, 349 | ) 350 | setattr(self, 'lstm_{}'.format(i), cell) 351 | self.LSTMCells.append(cell) 352 | if self.bidirectional: 353 | self.rLSTMCells: List[LSTMCell] = [] 354 | for i in range(num_layers): 355 | cell = LSTMCell( 356 | size_list[i], 357 | hidden_size, 358 | bias, 359 | **self.kwargs, 360 | ) 361 | setattr(self, 'rlstm_{}'.format(i), cell) 362 | self.rLSTMCells.append(cell) 363 | 364 | def forward(self, x, hx: Optional[Tuple] = None): 365 | if self.batch_first and x.ndim == 3: 366 | x = x.swapaxes(0, 1) 367 | 368 | if hx is None: 369 | h = self.init_hidden(x) 370 | c = self.init_hidden(x) 371 | else: 372 | d = 2 if self.bidirectional else 1 373 | h, c = hx 374 | assert (x.ndim == 2 375 | and h.shape == (d * self.num_layers, self.hidden_size) 376 | ) or (x.ndim == 3 and h.shape 377 | == (d * self.num_layers, x.shape[1], 378 | self.hidden_size)), "Wrong hidden state input!" 379 | assert (x.ndim == 2 380 | and c.shape == (d * self.num_layers, self.hidden_size) 381 | ) or (x.ndim == 3 and c.shape 382 | == (d * self.num_layers, x.shape[1], 383 | self.hidden_size)), "Wrong cell state input!" 384 | 385 | if self.num_layers == 1 and not self.bidirectional: 386 | h_list, c_list = self.cell_forward( 387 | self.LSTMCells[0], 388 | x, 389 | (h[0], c[0]), 390 | ) 391 | output = core.concat(h_list) 392 | hn = h_list[-1] 393 | cn = c_list[-1] 394 | elif self.num_layers == 1 and self.bidirectional: 395 | h_list, c_list = self.cell_forward( 396 | self.LSTMCells[0], 397 | x, 398 | (h[0], c[0]), 399 | ) 400 | hr_list, cr_list = self.cell_forward( 401 | self.rLSTMCells[0], 402 | x[::-1], 403 | (h[1], c[1]), 404 | ) 405 | output = core.concat( 406 | [core.concat(h_list), 407 | core.concat(hr_list[::-1])], axis=-1) 408 | hn = core.concat([h_list[-1], hr_list[-1]]) 409 | cn = core.concat([c_list[-1], cr_list[-1]]) 410 | elif self.num_layers > 1 and not self.bidirectional: 411 | hn_list, cn_list = [], [] 412 | for i in range(self.num_layers): 413 | h_list, c_list = self.cell_forward( 414 | self.LSTMCells[i], 415 | x if i == 0 else core.concat(h_list), 416 | (h[i], c[i]), 417 | ) 418 | hn_list.append(h_list[-1]) 419 | cn_list.append(c_list[-1]) 420 | output = core.concat(h_list) 421 | hn = core.concat(hn_list) 422 | cn = core.concat(cn_list) 423 | else: 424 | hn_list, hrn_list = [], [] 425 | cn_list, crn_list = [], [] 426 | for i in range(self.num_layers): 427 | h_list, c_list = self.cell_forward( 428 | self.LSTMCells[i], 429 | x if i == 0 else core.concat(h_list), 430 | (h[i], c[i]), 431 | ) 432 | hr_list, cr_list = self.cell_forward( 433 | self.rLSTMCells[i], 434 | x[::-1] if i == 0 else core.concat(hr_list), 435 | (h[i + self.num_layers], c[i + self.num_layers]), 436 | ) 437 | hn_list.append(h_list[-1]) 438 | hrn_list.append(hr_list[-1]) 439 | cn_list.append(c_list[-1]) 440 | crn_list.append(cr_list[-1]) 441 | output = core.concat( 442 | [core.concat(h_list), 443 | core.concat(hr_list[::-1])], axis=-1) 444 | hn = core.concat(hn_list + hrn_list) 445 | cn = core.concat(cn_list + crn_list) 446 | if self.batch_first and x.ndim == 3: 447 | output = output.swapaxes(0, 1) 448 | hn = hn.swapaxes(0, 1) 449 | cn = cn.swapaxes(0, 1) 450 | 451 | return output, (hn, cn) 452 | 453 | def reset_parameters(self): 454 | for i in range(self.num_layers): 455 | self.LSTMCells[i].reset_paramters() 456 | if self.bidirectional: 457 | for i in range(self.num_layers): 458 | self.rLSTMCells[i].reset_paramters() 459 | 460 | def init_hidden(self, x): 461 | assert x.ndim in {2, 3} 462 | d = 2 if self.bidirectional else 1 463 | if x.ndim == 2: 464 | return zeros( 465 | (d * self.num_layers, self.hidden_size), 466 | **self.kwargs, 467 | ) 468 | else: 469 | batch_size = x.shape[1] 470 | return zeros( 471 | (d * self.num_layers, batch_size, self.hidden_size), 472 | **self.kwargs, 473 | ) 474 | 475 | def cell_forward(self, cell: RNNCell, x, h: Tuple): 476 | seq_len = x.shape[0] 477 | h_list, c_list = [], [] 478 | for i in range(seq_len): 479 | h = cell(x[i], h) # Infact, `h` here is a tuple (h, c) 480 | h_list.append(core.unsqueeze(h[0], axis=0)) 481 | c_list.append(core.unsqueeze(h[1], axis=0)) 482 | return h_list, c_list 483 | 484 | def __repr__(self) -> str: 485 | return "{}({}, {}, num_layers={}, bias={}, batch_first={}, bidirectional={})".format( 486 | self.__class__.__name__, 487 | self.input_size, 488 | self.hidden_size, 489 | self.num_layers, 490 | self.has_bias, 491 | self.batch_first, 492 | self.bidirectional, 493 | ) 494 | 495 | def move(self, device): 496 | self.kwargs['device'] = device 497 | return super().move(device) 498 | 499 | 500 | class GRUCell(Module): 501 | 502 | def __init__( 503 | self, 504 | input_size: int, 505 | hidden_size: int, 506 | bias: bool = True, 507 | device=None, 508 | dtype=None, 509 | ) -> None: 510 | super().__init__() 511 | self.input_size = input_size 512 | self.hidden_size = hidden_size 513 | self.kwargs = {"device": Device(device), "dtype": dtype} 514 | 515 | self.Wx1 = Parameter( 516 | empty((input_size, 2 * hidden_size), **self.kwargs)) 517 | self.Wh1 = Parameter( 518 | empty((hidden_size, 2 * hidden_size), **self.kwargs)) 519 | self.Wx2 = Parameter(empty((input_size, hidden_size), **self.kwargs)) 520 | self.Wh2 = Parameter(empty((hidden_size, hidden_size), **self.kwargs)) 521 | 522 | if bias: 523 | self.bias1 = Parameter(empty(2 * self.hidden_size, **self.kwargs)) 524 | self.bias2 = Parameter(empty(self.hidden_size, **self.kwargs)) 525 | 526 | self.has_bias = bias 527 | self.reset_parameters() 528 | 529 | def forward(self, x, h=None): 530 | if h is None: 531 | h = self.init_hidden(x) 532 | else: 533 | assert (x.ndim == 1 and h.shape == (self.hidden_size, )) or ( 534 | x.ndim == 2 and h.shape 535 | == (x.shape[0], self.hidden_size)), "Wrong hidden state input!" 536 | 537 | lin1 = x @ self.Wx1 + h @ self.Wh1 538 | if self.has_bias: 539 | lin1 = lin1 + self.bias1 540 | z, r = core.split(F.sigmoid(lin1), 2, axis=1) 541 | lin2 = x @ self.Wx2 + (r * h) @ self.Wh2 542 | if self.has_bias: 543 | lin2 = lin2 + self.bias2 544 | return (1 - z) * h + z * F.tanh(lin2) 545 | 546 | def reset_parameters(self): 547 | bound = math.sqrt(1 / self.hidden_size) 548 | init.uniform_(self.Wx1, -bound, bound) 549 | init.uniform_(self.Wx2, -bound, bound) 550 | init.uniform_(self.Wh1, -bound, bound) 551 | init.uniform_(self.Wh2, -bound, bound) 552 | if self.has_bias: 553 | init.uniform_(self.bias1, -bound, bound) 554 | init.uniform_(self.bias2, -bound, bound) 555 | 556 | def init_hidden(self, x): 557 | assert x.ndim in {1, 2} 558 | if x.ndim == 1: 559 | return zeros(self.hidden_size, **self.kwargs) 560 | else: 561 | batch_size = x.shape[0] 562 | return zeros((batch_size, self.hidden_size), **self.kwargs) 563 | 564 | def __repr__(self) -> str: 565 | return "{}({}, {}, bias={})".format( 566 | self.__class__.__name__, 567 | self.input_size, 568 | self.hidden_size, 569 | self.has_bias, 570 | ) 571 | 572 | def move(self, device): 573 | self.kwargs['device'] = device 574 | return super().move(device) 575 | 576 | 577 | class GRU(Module): 578 | 579 | def __init__( 580 | self, 581 | input_size: int, 582 | hidden_size: int, 583 | num_layers: int = 1, 584 | bias: bool = True, 585 | batch_first: bool = False, 586 | bidirectional: bool = False, 587 | device=None, 588 | dtype=None, 589 | ) -> None: 590 | super().__init__() 591 | self.input_size = input_size 592 | self.hidden_size = hidden_size 593 | self.num_layers = num_layers 594 | self.has_bias = bias 595 | self.batch_first = batch_first 596 | self.bidirectional = bidirectional 597 | self.kwargs = {"device": Device(device), "dtype": dtype} 598 | 599 | assert num_layers > 0 600 | size_list = [input_size] + [hidden_size] * (num_layers - 1) 601 | self.GRUCells: List[GRUCell] = [] 602 | for i in range(num_layers): 603 | cell = GRUCell( 604 | size_list[i], 605 | hidden_size, 606 | bias, 607 | **self.kwargs, 608 | ) 609 | setattr(self, 'gru_{}'.format(i), cell) 610 | self.GRUCells.append(cell) 611 | if self.bidirectional: 612 | self.rGRUCells: List[GRUCell] = [] 613 | for i in range(num_layers): 614 | cell = GRUCell( 615 | size_list[i], 616 | hidden_size, 617 | bias, 618 | **self.kwargs, 619 | ) 620 | setattr(self, 'rgru_{}'.format(i), cell) 621 | self.rGRUCells.append(cell) 622 | 623 | def forward(self, x, h=None): 624 | if self.batch_first and x.ndim == 3: 625 | x = x.swapaxes(0, 1) 626 | 627 | if h is None: 628 | h = self.init_hidden(x) 629 | else: 630 | d = 2 if self.bidirectional else 1 631 | assert (x.ndim == 2 632 | and h.shape == (d * self.num_layers, self.hidden_size) 633 | ) or (x.ndim == 3 and h.shape 634 | == (d * self.num_layers, x.shape[1], 635 | self.hidden_size)), "Wrong hidden state input!" 636 | 637 | if self.num_layers == 1 and not self.bidirectional: 638 | h_list = self.cell_forward(self.GRUCells[0], x, h[0]) 639 | output = core.concat(h_list) 640 | hn = h_list[-1] 641 | 642 | elif self.num_layers == 1 and self.bidirectional: 643 | h_list = self.cell_forward(self.GRUCells[0], x, h[0]) 644 | hr_list = self.cell_forward(self.rGRUCells[0], x[::-1], h[1]) 645 | output = core.concat( 646 | [core.concat(h_list), 647 | core.concat(hr_list[::-1])], axis=-1) 648 | hn = core.concat([h_list[-1], hr_list[-1]]) 649 | 650 | elif self.num_layers > 1 and not self.bidirectional: 651 | hn_list = [] 652 | for i in range(self.num_layers): 653 | h_list = self.cell_forward( 654 | self.GRUCells[i], 655 | x if i == 0 else core.concat(h_list), 656 | h[i], 657 | ) 658 | hn_list.append(h_list[-1]) 659 | output = core.concat(h_list) 660 | hn = core.concat(hn_list) 661 | 662 | else: 663 | hn_list = [] 664 | hrn_list = [] 665 | for i in range(self.num_layers): 666 | h_list = self.cell_forward( 667 | self.GRUCells[i], 668 | x if i == 0 else core.concat(h_list), 669 | h[i], 670 | ) 671 | hr_list = self.cell_forward( 672 | self.rGRUCells[i], 673 | x[::-1] if i == 0 else core.concat(hr_list), 674 | h[i + self.num_layers], 675 | ) 676 | hn_list.append(h_list[-1]) 677 | hrn_list.append(hr_list[-1]) 678 | output = core.concat( 679 | [core.concat(h_list), 680 | core.concat(hr_list[::-1])], axis=-1) 681 | hn = core.concat(hn_list + hrn_list) 682 | 683 | if self.batch_first and x.ndim == 3: 684 | output = output.swapaxes(0, 1) 685 | hn = hn.swapaxes(0, 1) 686 | return output, hn 687 | 688 | def init_hidden(self, x): 689 | assert x.ndim in {2, 3} 690 | d = 2 if self.bidirectional else 1 691 | if x.ndim == 2: 692 | return zeros( 693 | (d * self.num_layers, self.hidden_size), 694 | **self.kwargs, 695 | ) 696 | else: 697 | return zeros( 698 | (d * self.num_layers, x.shape[1], self.hidden_size), 699 | **self.kwargs, 700 | ) 701 | 702 | def cell_forward(self, cell: GRUCell, x, h): 703 | seq_len = x.shape[0] 704 | h_list = [] 705 | for i in range(seq_len): 706 | h = cell(x[i], h) 707 | h_list.append(core.unsqueeze(h, axis=0)) 708 | return h_list 709 | 710 | def __repr__(self) -> str: 711 | return "{}({}, {}, num_layers={}, bias={}, batch_first={}, bidirectional={})".format( 712 | self.__class__.__name__, 713 | self.input_size, 714 | self.hidden_size, 715 | self.num_layers, 716 | self.has_bias, 717 | self.batch_first, 718 | self.bidirectional, 719 | ) 720 | 721 | def move(self, device): 722 | self.kwargs['device'] = device 723 | return super().move(device) 724 | --------------------------------------------------------------------------------