├── .gitignore
├── imgs
├── ad1d.png
├── ad2d.png
├── rnn.png
├── mlp_cnn.png
├── dropout_bn.png
└── transformer.png
├── pytest.ini
├── .gitattributes
├── llm
├── clip
│ ├── picture.png
│ ├── tokenizer.py
│ ├── model.py
│ └── infer.py
└── llama
│ ├── tokenizer.py
│ ├── infer.py
│ └── model.py
├── pydynet
├── nn
│ ├── __init__.py
│ ├── parameter.py
│ ├── modules
│ │ ├── dropout.py
│ │ ├── loss.py
│ │ ├── __init__.py
│ │ ├── activation.py
│ │ ├── pool.py
│ │ ├── linear.py
│ │ ├── conv.py
│ │ ├── module.py
│ │ ├── norm.py
│ │ └── rnn.py
│ ├── init.py
│ └── functional.py
├── optim
│ ├── __init__.py
│ ├── lr_scheduler.py
│ └── optimizer.py
├── core
│ ├── __init__.py
│ └── function.py
├── __init__.py
├── autograd.py
├── special.py
├── cuda.py
└── data.py
├── requirements.txt
├── tests
├── test_backward.py
└── test_tensor_basic.py
├── setup.py
├── LICENSE
├── .github
└── workflows
│ └── python-publish.yml
├── examples
├── pydynet
│ ├── autograd1d.py
│ ├── autograd2d.py
│ ├── ts_prediction.py
│ ├── mnist.py
│ ├── dropout_bn.py
│ └── transformer.py
└── pytorch
│ ├── ts_prediction.py
│ ├── mnist.py
│ ├── dropout_bn.py
│ └── transformer.py
├── cnREADME.md
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | __pycache__/
3 | data
--------------------------------------------------------------------------------
/imgs/ad1d.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WeltXing/PyDyNet/HEAD/imgs/ad1d.png
--------------------------------------------------------------------------------
/imgs/ad2d.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WeltXing/PyDyNet/HEAD/imgs/ad2d.png
--------------------------------------------------------------------------------
/imgs/rnn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WeltXing/PyDyNet/HEAD/imgs/rnn.png
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | filterwarnings =
3 | ignore::UserWarning:pydynet
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/imgs/mlp_cnn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WeltXing/PyDyNet/HEAD/imgs/mlp_cnn.png
--------------------------------------------------------------------------------
/imgs/dropout_bn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WeltXing/PyDyNet/HEAD/imgs/dropout_bn.png
--------------------------------------------------------------------------------
/imgs/transformer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WeltXing/PyDyNet/HEAD/imgs/transformer.png
--------------------------------------------------------------------------------
/llm/clip/picture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WeltXing/PyDyNet/HEAD/llm/clip/picture.png
--------------------------------------------------------------------------------
/pydynet/nn/__init__.py:
--------------------------------------------------------------------------------
1 | from .modules import *
2 | from .parameter import Parameter
3 | from . import init
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=2.0.0
2 | cupy-cuda12x # For Cuda 12.x; refer to https://docs.cupy.dev/en/stable/install.html
3 |
--------------------------------------------------------------------------------
/tests/test_backward.py:
--------------------------------------------------------------------------------
1 | import sys, pytest, random
2 | import numpy as np
3 |
# Add the package directory to the import path; the relative entry is
# resolved against the current working directory.
sys.path.append('../pydynet')

# Fix both NumPy's and the stdlib's RNG seeds to 0.
np.random.seed(0)
random.seed(0)

# Floating-point dtypes listed for use by the tests in this file.
type_list = [np.float16, np.float32, np.float64]
10 |
--------------------------------------------------------------------------------
/pydynet/optim/__init__.py:
--------------------------------------------------------------------------------
1 | from .optimizer import (
2 | Optimizer,
3 | SGD,
4 | Adagrad,
5 | Adadelta,
6 | Adam,
7 | )
8 | from .lr_scheduler import (
9 | _LRScheduler,
10 | ExponentialLR,
11 | StepLR,
12 | MultiStepLR,
13 | CosineAnnealingLR,
14 | )
15 |
--------------------------------------------------------------------------------
/pydynet/core/__init__.py:
--------------------------------------------------------------------------------
1 | from .tensor import (Tensor, add, sub, mul, div, pow, matmul, abs, sum, mean,
2 | min, max, min, argmax, argmin, maximum, minimum, exp, log,
3 | sign, reshape, transpose, swapaxes, concat, sigmoid, tanh)
4 | from .function import sqrt, square, vsplit, hsplit, dsplit, split, unsqueeze, squeeze
5 |
--------------------------------------------------------------------------------
/pydynet/nn/parameter.py:
--------------------------------------------------------------------------------
1 | from ..core import Tensor
2 |
3 |
class Parameter(Tensor):
    """Tensor built from another tensor's data, dtype and device.

    The source data is passed through with ``copy=False``;
    ``requires_grad`` defaults to ``True``.
    """

    def __init__(self, data: Tensor, requires_grad: bool = True) -> None:
        tensor_kwargs = dict(
            data=data.data,
            dtype=data.dtype,
            device=data.device,
            copy=False,
            requires_grad=requires_grad,
        )
        super().__init__(**tensor_kwargs)

    def __repr__(self) -> str:
        text = "Parameter : \n{}".format(self.data)
        # The device suffix is only shown for non-CPU devices.
        if self.device.device != "cpu":
            text += ",\ndevice={}".format(self.device)
        return text
18 |
--------------------------------------------------------------------------------
/pydynet/nn/modules/dropout.py:
--------------------------------------------------------------------------------
1 | from .module import Module
2 | from ...core import Tensor
3 | from ...special import rand
4 |
5 |
class Dropout(Module):
    """Dropout layer with drop probability ``p``.

    When ``self._train`` is truthy, each element is kept when a uniform
    [0, 1) sample is below ``1 - p`` and the result is divided by ``1 - p``;
    otherwise the input is returned unchanged.
    """

    def __init__(self, p: float = 0.5) -> None:
        super().__init__()
        assert 0 <= p < 1
        self.p = p

    def forward(self, x) -> Tensor:
        if not self._train:
            return x
        keep_prob = 1 - self.p
        keep_mask = rand(*x.shape, device=x.device) < keep_prob
        return x * keep_mask.astype(x.dtype) / keep_prob

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(p={self.p})"
21 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 |
# Read the long description up front so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open('README.md', encoding='utf-8') as readme_file:
    long_description = readme_file.read()

setuptools.setup(
    name='pydynet',
    version='1.0',
    description=
    'PyDyNet: Neuron Network (MLP, CNN, RNN, Transformer, ...) implementation using Numpy with Autodiff',
    author="Cun-Yuan Xing",
    author_email="xingcy@lamda.nju.edu.cn",
    maintainer="Cun-Yuan Xing",
    # Fixed typo: the domain previously read "lamad" and did not match
    # author_email.
    maintainer_email="xingcy@lamda.nju.edu.cn",
    # Package names are dotted module paths, not filesystem paths.
    packages=[
        'pydynet',
        'pydynet.optim',
        'pydynet.nn',
        'pydynet.nn.modules',
        'pydynet.core',
    ],
    license='MIT License',
    install_requires=['numpy>=2.0.0'],
    long_description=long_description,
    long_description_content_type="text/markdown",
    url='https://github.com/WeltXing/PyDyNet',
)
22 |
--------------------------------------------------------------------------------
/pydynet/__init__.py:
--------------------------------------------------------------------------------
1 | from .core import (Tensor, add, sub, mul, div, pow, matmul, abs, sum, mean,
2 | min, max, min, argmax, argmin, maximum, minimum, exp, log,
3 | sign, reshape, transpose, swapaxes, concat, sigmoid, tanh,
4 | sqrt, square, vsplit, hsplit, dsplit, split, unsqueeze,
5 | squeeze)
6 | from .special import zeros, ones, rand, randn, empty, uniform
7 | from .cuda import Device
8 | from .autograd import enable_grad, no_grad
9 |
10 | __all__ = [
11 | "Tensor", "add", "sub", "mul", "div", "pow", "matmul", "abs", "sum",
12 | "mean", "min", "max", "argmax", "argmin", "maximum", "minimum", "exp",
13 | "log", "sign", "reshape", "transpose", "swapaxes", "concat", 'sigmoid',
14 | 'tanh', "sqrt", "square", "vsplit", "hsplit", "dsplit", "split",
15 | "unsqueeze", "squeeze", "zeros", "ones", "rand", "randn", "empty",
16 | "uniform", "Device", "enable_grad", "no_grad"
17 | ]
18 |
--------------------------------------------------------------------------------
/pydynet/nn/modules/loss.py:
--------------------------------------------------------------------------------
1 | from .module import Module
2 | from .. import functional as F
3 | from ...core import Tensor
4 |
5 |
class Loss(Module):
    """Base class for loss functions.

    Stores ``reduction`` (must be ``'mean'`` or ``'sum'``); subclasses
    implement ``forward``.
    """

    def __init__(self, reduction='mean') -> None:
        super().__init__()
        self.reduction = reduction
        allowed = {'mean', 'sum'}
        assert self.reduction in allowed

    def forward(self, y_pred: Tensor, y_true: Tensor) -> Tensor:
        raise NotImplementedError
16 |
17 |
class MSELoss(Loss):
    """Loss module that delegates to ``F.mse_loss`` with the stored reduction."""

    def forward(self, y_pred: Tensor, y_true: Tensor) -> Tensor:
        loss = F.mse_loss(y_pred, y_true, reduction=self.reduction)
        return loss
22 |
23 |
class NLLLoss(Loss):
    """Loss module that delegates to ``F.nll_loss`` with the stored reduction."""

    def forward(self, y_pred: Tensor, y_true: Tensor) -> Tensor:
        loss = F.nll_loss(y_pred, y_true, reduction=self.reduction)
        return loss
28 |
29 |
class CrossEntropyLoss(Loss):
    """Loss module that delegates to ``F.cross_entropy_loss`` with the stored reduction."""

    def forward(self, y_pred: Tensor, y_true: Tensor) -> Tensor:
        loss = F.cross_entropy_loss(y_pred, y_true, reduction=self.reduction)
        return loss
34 |
--------------------------------------------------------------------------------
/pydynet/nn/modules/__init__.py:
--------------------------------------------------------------------------------
1 | from .activation import Sigmoid, Tanh, ReLU, LeakyReLU, Softmax
2 | from .norm import BatchNorm1d, BatchNorm2d, LayerNorm, RMSNorm
3 | from .conv import Conv1d, Conv2d
4 | from .pool import MaxPool1d, MaxPool2d, AvgPool1d, AvgPool2d
5 | from .dropout import Dropout
6 | from .linear import Linear, Embedding
7 | from .loss import MSELoss, NLLLoss, CrossEntropyLoss
8 | from .module import Module, Sequential, ModuleList
9 | from .rnn import RNN, LSTM, GRU, RNNCell, LSTMCell, GRUCell
10 |
11 | __all__ = [
12 | "Sigmoid",
13 | "Tanh",
14 | "ReLU",
15 | "LeakyReLU",
16 | "Softmax",
17 | "BatchNorm1d",
18 | "BatchNorm2d",
19 | "LayerNorm",
20 | "RMSNorm",
21 | "Conv1d",
22 | "Conv2d",
23 | "MaxPool1d",
24 | "MaxPool2d",
25 | "AvgPool1d",
26 | "AvgPool2d",
27 | "Dropout",
28 | "Linear",
29 | "Embedding",
30 | "MSELoss",
31 | "NLLLoss",
32 | "CrossEntropyLoss",
33 | "Module",
34 | "Sequential",
35 | "ModuleList",
36 | "RNN",
37 | "LSTM",
38 | "GRU",
39 | "RNNCell",
40 | "LSTMCell",
41 | "GRUCell",
42 | ]
43 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Welt Xing
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/pydynet/autograd.py:
--------------------------------------------------------------------------------
1 | import functools
2 |
# Module-level switch read by is_grad_enable(); starts out enabled.
grad_enable = True


def is_grad_enable():
    """Return the current value of the module-level ``grad_enable`` flag."""
    return grad_enable


def set_grad_enabled(mode: bool):
    """Set the module-level ``grad_enable`` flag to ``mode``."""
    global grad_enable
    grad_enable = mode
13 |
14 |
class no_grad:
    """Context manager and decorator that switches the grad flag off.

    On entry the current flag is saved and the flag is set to ``False``;
    on exit the saved value is restored. Used as ``@no_grad()``, the
    wrapped function runs inside such a context on every call.
    """

    def __enter__(self) -> None:
        self.prev = is_grad_enable()
        set_grad_enabled(False)

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        set_grad_enabled(self.prev)

    def __call__(self, func):

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # A fresh instance per call, so the saved flag is not shared
            # with the decorating instance.
            with __class__():
                return func(*args, **kwargs)

        return wrapper
32 |
33 |
class enable_grad:
    """Context manager and decorator that switches the grad flag on.

    On entry the current flag is saved and the flag is set to ``True``;
    on exit the saved value is restored. Used as ``@enable_grad()``, the
    wrapped function runs inside such a context on every call.
    """

    def __enter__(self) -> None:
        self.prev = is_grad_enable()
        set_grad_enabled(True)

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        set_grad_enabled(self.prev)

    def __call__(self, func):

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # A fresh instance per call, so the saved flag is not shared
            # with the decorating instance.
            with __class__():
                return func(*args, **kwargs)

        return wrapper
51 |
--------------------------------------------------------------------------------
/pydynet/nn/modules/activation.py:
--------------------------------------------------------------------------------
1 | from .module import Module
2 | from .. import functional as F
3 | from ...core import Tensor
4 |
5 |
class Sigmoid(Module):
    """Activation layer: Sigmoid (delegates to ``F.sigmoid``)."""

    def forward(self, x) -> Tensor:
        return F.sigmoid(x)

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}()"
14 |
15 |
class Tanh(Module):
    """Activation layer: Tanh (delegates to ``F.tanh``)."""

    def forward(self, x) -> Tensor:
        return F.tanh(x)

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}()"
24 |
25 |
class ReLU(Module):
    """Activation layer: ReLU (delegates to ``F.relu``)."""

    def forward(self, x) -> Tensor:
        return F.relu(x)

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}()"
34 |
35 |
class LeakyReLU(Module):
    """Activation layer: LeakyReLU (delegates to ``F.leaky_relu``).

    Parameter
    ---------
    alpha : float
        Slope applied to negative inputs; stored as a ``float``.
    """

    def __init__(self, alpha: float = 0.1) -> None:
        super().__init__()
        self.alpha = float(alpha)

    def forward(self, x) -> Tensor:
        return F.leaky_relu(x, self.alpha)

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(alpha={self.alpha})"
55 |
56 |
class Softmax(Module):
    """Activation layer: softmax (delegates to ``F.softmax``).

    Parameter
    ---------
    axis : Optional[Tuple[int]], default=None
        Axis along which the softmax is computed.
    """

    def __init__(self, axis=None) -> None:
        super().__init__()
        self.axis = axis

    def forward(self, x) -> Tensor:
        return F.softmax(x, self.axis)

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(axis={self.axis})"
76 |
--------------------------------------------------------------------------------
/llm/llama/tokenizer.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | import json
3 |
4 |
class Tokenizer:
    """Score-driven pair-merging tokenizer backed by a JSON vocabulary file.

    The JSON file must contain a ``"tokens"`` list and a parallel
    ``"scores"`` list. BOS/EOS ids are fixed to 1 and 2.
    """

    def __init__(self, model_path: str):
        with open(model_path, "r", encoding="utf-8") as f:
            model = json.load(f)
        self.vocab = model["tokens"]
        self.scores = model["scores"]
        self.bos_id = 1
        self.eos_id = 2
        # Token -> index map so lookups are O(1) instead of a list scan.
        # setdefault keeps the FIRST index of a duplicated token, matching
        # what list.index() returned.
        self._token_to_id = {}
        for index, token in enumerate(self.vocab):
            self._token_to_id.setdefault(token, index)

    def str_lookup(self, token: str) -> int:
        """Return the vocabulary index of ``token``, or -1 if it is absent."""
        return self._token_to_id.get(token, -1)

    def encode(
        self,
        text: str,
        add_bos: bool = True,
        add_eos: bool = False,
    ) -> List[int]:
        """Encode ``text`` into token ids.

        Each character is looked up individually (unknown characters are
        dropped). Then the adjacent pair whose concatenation is in the
        vocabulary with the highest score is merged, repeatedly, until no
        pair can be merged. BOS/EOS ids are added on request.
        """
        tokens = []
        for char in text:
            char_id = self.str_lookup(char)
            if char_id >= 0:
                tokens.append(char_id)

        while True:
            best_score = -1e10
            best_id = -1
            best_idx = -1

            for i in range(len(tokens) - 1):
                # Check if we can merge the pair (tokens[i], tokens[i+1])
                merged = self.vocab[tokens[i]] + self.vocab[tokens[i + 1]]
                merged_id = self.str_lookup(merged)
                # Strict '>' keeps the left-most pair on score ties.
                if merged_id != -1 and self.scores[merged_id] > best_score:
                    best_score = self.scores[merged_id]
                    best_id = merged_id
                    best_idx = i

            if best_idx == -1:
                break

            # Merge the pair (best_idx, best_idx+1) into the token best_id
            tokens[best_idx] = best_id
            del tokens[best_idx + 1]

        if add_bos:
            tokens.insert(0, self.bos_id)
        if add_eos:
            tokens.append(self.eos_id)
        return tokens

    def decode(self, ids: List[int]) -> str:
        """Concatenate the vocabulary strings for ``ids``."""
        text = "".join(self.vocab[i] for i in ids)
        # NOTE(review): both strip("") calls are no-ops as written; the
        # intended arguments (possibly BOS/EOS marker strings) appear to be
        # missing - confirm against the upstream file before changing.
        text = text.strip("").strip("")
        return text
67 |
--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package to PyPI when a release is created
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
3 |
4 | # This workflow uses actions that are not certified by GitHub.
5 | # They are provided by a third-party and are governed by
6 | # separate terms of service, privacy policy, and support
7 | # documentation.
8 |
9 | name: PyDyNet
10 |
11 | on:
12 | release:
13 | types: [published]
14 |
15 | permissions:
16 | contents: read
17 |
18 | jobs:
19 | release-build:
20 | runs-on: ubuntu-latest
21 |
22 | steps:
23 | - uses: actions/checkout@v4
24 |
25 | - uses: actions/setup-python@v5
26 | with:
27 | python-version: "3.x"
28 |
29 | - name: Build release distributions
30 | run: |
31 | # NOTE: put your own distribution build steps here.
32 | python -m pip install build
33 | python -m build
34 |
35 | - name: Upload distributions
36 | uses: actions/upload-artifact@v4
37 | with:
38 | name: release-dists
39 | path: dist/
40 |
41 | pypi-publish:
42 | runs-on: ubuntu-latest
43 | needs:
44 | - release-build
45 | permissions:
46 | # IMPORTANT: this permission is mandatory for trusted publishing
47 | id-token: write
48 |
49 | # Dedicated environments with protections for publishing are strongly recommended.
50 | # For more information, see: https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment#deployment-protection-rules
51 | environment:
52 | name: pypi
53 | # OPTIONAL: uncomment and update to include your PyPI project URL in the deployment status:
54 | # url: https://pypi.org/p/YOURPROJECT
55 | #
56 | # ALTERNATIVE: if your GitHub Release name is the PyPI project version string
57 | # ALTERNATIVE: exactly, uncomment the following line instead:
58 | # url: https://pypi.org/project/YOURPROJECT/${{ github.event.release.name }}
59 |
60 | steps:
61 | - name: Retrieve release distributions
62 | uses: actions/download-artifact@v4
63 | with:
64 | name: release-dists
65 | path: dist/
66 |
67 | - name: Publish release distributions to PyPI
68 | uses: pypa/gh-action-pypi-publish@release/v1
69 | with:
70 | packages-dir: dist/
71 |
--------------------------------------------------------------------------------
/examples/pydynet/autograd1d.py:
--------------------------------------------------------------------------------
1 | import pydynet as pdn
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 |
# Use the highest-indexed CUDA device when CUDA is available, else the CPU.
if pdn.cuda.is_available():
    device = f'cuda:{pdn.cuda.device_count() - 1}'
else:
    device = 'cpu'
7 |
8 |
def auto_grad(x: float, lr: float, n_iter: int):
    """Run ``n_iter`` gradient-descent steps on log((x - 7)^2 + 6) via autograd.

    Returns the list of visited x values, starting with the initial ``x``.
    """
    trajectory = [x]
    x: pdn.Tensor = pdn.Tensor(float(x), requires_grad=True, device=device)

    for _ in range(n_iter):
        x.zero_grad()
        objective = pdn.log((x - 7)**2 + 6)
        objective.backward()

        # NOTE(review): the dump lost indentation; the read-back is kept
        # inside the device context, which is the conservative choice.
        with x.device:
            x.data -= lr * x.grad
            trajectory.append(x.item())

    return trajectory
23 |
24 |
def manual_grad(x: float, lr: float, n_iter: int):
    """Run ``n_iter`` gradient-descent steps on log((x - 7)^2 + 6).

    The derivative 2(x - 7) / ((x - 7)^2 + 6) is written out by hand.
    Returns the list of visited x values, starting with the initial ``x``.
    """
    trajectory = [x]
    step = 0
    while step < n_iter:
        slope = 2 * (x - 7) / ((x - 7)**2 + 6)
        x -= lr * slope
        trajectory.append(x)
        step += 1

    return trajectory
34 |
35 |
# Objective curve sampled on [0, 10].
x_ = np.linspace(0, 10, 101)
f = np.log((x_ - 7)**2 + 6)

# Descent trajectories: autograd (x1, y1) and hand-derived gradient (x2, y2).
x1 = np.array(auto_grad(1., 1.5, 20))
x2 = np.array(manual_grad(1., 1.5, 20))
y1 = np.log((x1 - 7)**2 + 6)
y2 = np.log((x2 - 7)**2 + 6)

plt.figure(figsize=(9, 3))

plt.rcParams['font.sans-serif'] = ['Times New Roman']
plt.rcParams['mathtext.fontset'] = 'stix'
plt.rcParams['xtick.direction'] = 'in'
plt.rcParams['ytick.direction'] = 'in'
plt.rcParams['axes.linewidth'] = 0.5


def _plot_descent(index, xs, ys, color, marker, title):
    """Draw the objective curve plus one descent trajectory in subplot ``index``."""
    plt.subplot(1, 2, index)
    plt.grid()

    plt.xlim(0, 10)
    plt.ylim(1.5, 4)
    # The label now matches the plotted function (constant 6, not 10).
    plt.plot(x_, f, label=r"$f(x)=\log((x-7)^2+6)$", color='blue', lw=.7)
    plt.scatter(xs,
                ys,
                color=color,
                marker=marker,
                s=50,
                zorder=10,
                label='Gradient descent with lr=1.5')

    plt.yticks([1.5, 2, 2.5, 3, 3.5, 4], size=13)
    plt.xticks([2, 4, 6, 8, 10], size=13)
    plt.title(title)
    plt.legend()


_plot_descent(1, x1, y1, 'red', '^', "Gradient descent by AutoGrad")
# Fixed: the right-hand panel previously re-plotted the autograd results
# (x1, y1), leaving the manual trajectory computed but never shown.
_plot_descent(2, x2, y2, 'green', '*',
              "Gradient descent by Manual calculation")

plt.savefig("imgs/ad1d.png")
91 |
--------------------------------------------------------------------------------
/examples/pydynet/autograd2d.py:
--------------------------------------------------------------------------------
1 | import pydynet as pdn
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 |
np.random.seed(42)

# Use the highest-indexed CUDA device when CUDA is available, else the CPU.
if pdn.cuda.is_available():
    device = f'cuda:{pdn.cuda.device_count() - 1}'
else:
    device = 'cpu'

# Random starting point and the coefficients of the objective
# x @ A @ x / 2 + b @ x used by both descent routines.
x = np.random.randn(2)
A = pdn.Tensor([[3, 1.], [1, 2.]]).to(device)
b = pdn.Tensor([-1., 1]).to(device)
16 |
17 |
def auto_grad(x, lr: float, n_iter: int):
    """Minimise x @ A @ x / 2 + b @ x by gradient descent using autograd.

    Parameters
    ----------
    x :
        Starting point, wrapped in a ``pdn.Tensor`` on ``device``.
    lr : float
        Step size.
    n_iter : int
        Number of iterations (passed to ``range``, so it must be an int;
        the previous ``float`` annotation was wrong).

    Returns
    -------
    Tuple of (first coordinates, second coordinates, objective values),
    one entry per iteration, recorded before each update.
    """
    Xs, ys = [], []
    x = pdn.Tensor(x, requires_grad=True, device=device)

    for _ in range(n_iter):
        obj = x @ A @ x / 2 + b @ x
        obj.backward()

        # NOTE(review): assumes x.numpy() returns a snapshot that the
        # in-place update below does not alias - confirm.
        Xs.append(x.numpy())
        ys.append(obj.item())
        # NOTE(review): the dump lost indentation; zero_grad is kept inside
        # the device context, which is the conservative choice.
        with x.device:
            x.data -= lr * x.grad
            x.zero_grad()

    Xs, ys = np.array(Xs), np.array(ys)
    return Xs[:, 0], Xs[:, 1], ys
34 |
35 |
def manual_grad(x, lr: float, n_iter: int):
    """Minimise x @ A @ x / 2 + b @ x with the gradient A x + b written by hand.

    Parameters
    ----------
    x :
        Starting point (NumPy array). It is copied first, so the caller's
        array is no longer modified by the in-place updates.
    lr : float
        Step size.
    n_iter : int
        Number of iterations (passed to ``range``, so it must be an int;
        the previous ``float`` annotation was wrong).

    Returns
    -------
    Tuple of (first coordinates, second coordinates, objective values),
    one entry per iteration, recorded before each update.
    """
    # Work on a private copy: `x -= ...` below would otherwise mutate the
    # array owned by the caller.
    x = x.copy()
    Xs, ys = [], []

    for _ in range(n_iter):
        obj = x @ A @ x / 2 + b @ x

        Xs.append(x.copy())
        ys.append(obj.item())

        grad = A.numpy() @ x + b.numpy()
        x -= lr * grad

    Xs, ys = np.array(Xs), np.array(ys)
    return Xs[:, 0], Xs[:, 1], ys
50 |
51 |
plt.rcParams['font.sans-serif'] = ['Times New Roman']
plt.rcParams['mathtext.fontset'] = 'stix'

fig = plt.figure(figsize=(8, 4))

# One 3-D panel per descent routine: (subplot index, routine, colour, title).
panels = (
    (1, auto_grad, 'red', 'Gradient descent by AutoGrad'),
    (2, manual_grad, 'blue', 'Gradient descent by Manual calculation'),
)

for position, descend, colour, heading in panels:
    ax1 = fig.add_subplot(1, 2, position, projection='3d')
    ax1.plot3D(
        *descend(x, .1, 30),
        color=colour,
        lw=0.7,
        label=r'$f(x)=\frac{1}{2}x^\top Ax+b^\top x$',
        marker='^',
        markersize=6,
    )

    ax1.tick_params(direction='in')
    ax1.set_xlim(.45, .60)
    ax1.set_ylim(-.8, 0)
    ax1.set_zlim(-.8, -.3)
    ax1.set_xticks([.45, .5, .55, .6])
    ax1.set_yticks([-.8, -.6, -.4, -.2, 0])

    plt.title(heading)
    plt.legend(prop={'size': 11})

plt.savefig("imgs/ad2d.png")
97 |
--------------------------------------------------------------------------------
/pydynet/nn/modules/pool.py:
--------------------------------------------------------------------------------
1 | from .module import Module
2 | from .. import functional as F
3 |
4 |
class MaxPool1d(Module):
    """Module wrapper that forwards to ``F.max_pool1d`` with stored settings."""

    def __init__(self, kernel_size: int, stride: int, padding: int) -> None:
        super().__init__()
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

    def forward(self, x):
        return F.max_pool1d(x, self.kernel_size, self.stride, self.padding)

    def __repr__(self) -> str:
        return (f"{self.__class__.__name__}(kernel_size={self.kernel_size}, "
                f"stride={self.stride}, padding={self.padding})")
23 |
24 |
class AvgPool1d(Module):
    """Module wrapper that forwards to ``F.avg_pool1d`` with stored settings."""

    def __init__(self, kernel_size: int, stride: int, padding: int) -> None:
        super().__init__()
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

    def forward(self, x):
        return F.avg_pool1d(x, self.kernel_size, self.stride, self.padding)

    def __repr__(self) -> str:
        return (f"{self.__class__.__name__}(kernel_size={self.kernel_size}, "
                f"stride={self.stride}, padding={self.padding})")
43 |
44 |
class MaxPool2d(Module):
    """Module wrapper that forwards to ``F.max_pool2d`` with stored settings."""

    def __init__(self, kernel_size: int, stride: int, padding: int) -> None:
        super().__init__()
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

    def forward(self, x):
        return F.max_pool2d(x, self.kernel_size, self.stride, self.padding)

    def __repr__(self) -> str:
        return (f"{self.__class__.__name__}(kernel_size={self.kernel_size}, "
                f"stride={self.stride}, padding={self.padding})")
63 |
64 |
class AvgPool2d(Module):
    """Module wrapper that forwards to ``F.avg_pool2d`` with stored settings."""

    def __init__(self, kernel_size: int, stride: int, padding: int) -> None:
        super().__init__()
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

    def forward(self, x):
        return F.avg_pool2d(x, self.kernel_size, self.stride, self.padding)

    def __repr__(self) -> str:
        return (f"{self.__class__.__name__}(kernel_size={self.kernel_size}, "
                f"stride={self.stride}, padding={self.padding})")
83 |
--------------------------------------------------------------------------------
/pydynet/special.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from .core import Tensor
3 |
4 |
5 | # 一些包装的特殊矩阵
def zeros(shape, dtype=None, device=None, requires_grad=False):
    '''Tensor filled with zeros.

    Parameters
    ----------
    shape :
        Shape of the tensor.
    dtype, device :
        Forwarded unchanged to ``Tensor``.
    requires_grad : bool, default=False
        Whether the tensor requires gradients.
    '''
    values = np.zeros(shape)
    return Tensor(values,
                  dtype=dtype,
                  device=device,
                  requires_grad=requires_grad)
20 |
21 |
def ones(shape, dtype=None, device=None, requires_grad=False):
    '''Tensor filled with ones.

    Parameters
    ----------
    shape :
        Shape of the tensor.
    dtype, device :
        Forwarded unchanged to ``Tensor``.
    requires_grad : bool, default=False
        Whether the tensor requires gradients.
    '''
    values = np.ones(shape)
    return Tensor(values,
                  dtype=dtype,
                  device=device,
                  requires_grad=requires_grad)
36 |
37 |
def randn(*shape, dtype=None, device=None, requires_grad=False):
    '''Tensor of standard-normal samples (drawn with ``np.random.randn``).

    Parameters
    ----------
    *shape :
        Dimensions of the tensor, given as separate positional arguments.
    dtype, device :
        Forwarded unchanged to ``Tensor``.
    requires_grad : bool, default=False
        Whether the tensor requires gradients.
    '''
    samples = np.random.randn(*shape)
    return Tensor(samples,
                  dtype=dtype,
                  device=device,
                  requires_grad=requires_grad)
52 |
53 |
def rand(*shape, dtype=None, device=None, requires_grad=False):
    '''Tensor of uniform samples on [0, 1) (drawn with ``np.random.rand``).

    Parameters
    ----------
    *shape :
        Dimensions of the tensor, given as separate positional arguments.
    dtype, device :
        Forwarded unchanged to ``Tensor``.
    requires_grad : bool, default=False
        Whether the tensor requires gradients.
    '''
    samples = np.random.rand(*shape)
    return Tensor(samples,
                  dtype=dtype,
                  device=device,
                  requires_grad=requires_grad)
68 |
69 |
def uniform(low: float,
            high: float,
            shape=None,
            dtype=None,
            device=None,
            requires_grad=False):
    '''Tensor of uniform samples (drawn with ``np.random.uniform``).

    Parameters
    ----------
    low : float
        Lower bound of the uniform distribution.
    high : float
        Upper bound of the uniform distribution.
    shape :
        Shape of the tensor, passed as ``size`` to ``np.random.uniform``.
    dtype, device :
        Forwarded unchanged to ``Tensor``.
    requires_grad : bool, default=False
        Whether the tensor requires gradients.
    '''
    samples = np.random.uniform(low, high, size=shape)
    return Tensor(samples,
                  dtype=dtype,
                  device=device,
                  requires_grad=requires_grad)
93 |
94 |
def empty(shape, dtype=None, device=None, requires_grad=False):
    '''Tensor wrapping an uninitialised ``np.empty`` buffer of ``shape``.

    ``dtype`` is passed both to ``np.empty`` and to ``Tensor``; ``device``
    and ``requires_grad`` are forwarded to ``Tensor``.
    '''
    buffer = np.empty(shape, dtype=dtype)
    return Tensor(buffer,
                  dtype=dtype,
                  device=device,
                  requires_grad=requires_grad)
100 |
--------------------------------------------------------------------------------
/pydynet/nn/modules/linear.py:
--------------------------------------------------------------------------------
1 | from .module import Module
2 | from ..parameter import Parameter
3 | from .. import init, functional as F
4 | from ...core import Tensor
5 | from ...special import empty
6 | from ...cuda import Device
7 | from ...autograd import no_grad
8 |
9 | import math
10 |
11 |
class Linear(Module):
    """Linear layer computing ``F.linear(x, weight, bias)``.

    ``weight`` has shape ``(in_features, out_features)``; ``bias`` has
    ``out_features`` entries, or is ``None`` when ``bias=False``. Both are
    created on ``Device(device)`` with the given ``dtype`` and initialised
    by ``reset_parameters``.
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        kwargs = {"device": Device(device), "dtype": dtype}
        self.weight = Parameter(
            empty((self.in_features, self.out_features), **kwargs))
        self.bias = Parameter(empty(self.out_features, **
                                    kwargs)) if bias else None
        # Still called through the historical name so that subclasses
        # overriding either spelling keep working.
        self.reset_paramters()

    def reset_parameters(self) -> None:
        """Re-initialise the weight (Kaiming uniform) and the bias (uniform)."""
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = init._calculate_fan(self.weight)
            bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
            init.uniform_(self.bias, -bound, bound)

    def reset_paramters(self):
        """Backward-compatible alias (original misspelling) of ``reset_parameters``."""
        self.reset_parameters()

    def forward(self, x: Tensor):
        return F.linear(x, self.weight, self.bias)

    def __repr__(self) -> str:
        return "Linear(in_features={}, out_features={}, bias={})".format(
            self.in_features, self.out_features, self.bias is not None)
45 |
46 |
class Embedding(Module):
    """Embedding table evaluated with ``F.embedding(x, weight, padding_idx)``.

    ``weight`` has shape ``(num_embeddings, embedding_dim)`` and is created
    on ``Device(device)`` with the given ``dtype``. It is initialised from a
    standard normal distribution; when ``padding_idx`` is given, that row is
    set to zero.
    """

    def __init__(
        self,
        num_embeddings: int,
        embedding_dim: int,
        padding_idx=None,
        device=None,
        dtype=None,
    ) -> None:
        super().__init__()
        self.num_embedding = num_embeddings
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx

        kwargs = {"device": Device(device), "dtype": dtype}
        self.weight = Parameter(
            empty((self.num_embedding, self.embedding_dim), **kwargs))
        # empty() leaves the buffer uninitialised; without this call the
        # layer started out with arbitrary memory contents.
        self.reset_parameters()

    def forward(self, x: Tensor):
        return F.embedding(x, self.weight, self.padding_idx)

    def reset_parameters(self) -> None:
        """Re-draw the weights from N(0, 1) and zero the padding row."""
        init.normal_(self.weight)
        self._fill_padding_idx_with_zero()

    def _fill_padding_idx_with_zero(self) -> None:
        """Zero the ``padding_idx`` row of the weight, if one is configured."""
        if self.padding_idx is not None:
            with no_grad():
                # Write into the weight's own buffer. The previous code
                # assigned to `.data` of the tensor produced by indexing
                # the weight rather than to the weight's buffer.
                self.weight.data[self.padding_idx] = 0
80 |
--------------------------------------------------------------------------------
/pydynet/nn/init.py:
--------------------------------------------------------------------------------
1 | from ..core import Tensor
2 | from ..autograd import no_grad
3 | import math
4 |
5 |
def calculate_gain(nonlinearity: str, param: float = None) -> float:
    """Return the scaling gain associated with ``nonlinearity``.

    Args:
        nonlinearity: one of ``"linear"``, ``"conv1d"``, ``"conv2d"``,
            ``"sigmoid"``, ``"tanh"``, ``"relu"`` or ``"leaky_relu"``.
        param: negative slope, only read for ``"leaky_relu"``; ``None``
            selects ``0.01``.

    Raises:
        KeyError: if ``nonlinearity`` is not one of the names above.
    """
    if nonlinearity == "leaky_relu":
        negative_slope = 0.01 if param is None else param
        return math.sqrt(2. / (1 + negative_slope**2))
    # ``param`` is deliberately not touched for the fixed-gain entries, so
    # an unrelated value passed along by a caller cannot break the lookup.
    return {
        "linear": 1,
        "conv1d": 1,
        "conv2d": 1,
        "sigmoid": 1,
        "tanh": 5 / 3,
        "relu": math.sqrt(2.),
    }[nonlinearity]
17 |
18 |
19 | def _calculate_fan(tensor: Tensor):
20 | assert tensor.ndim >= 2
21 | fan_in, fan_out = tensor.shape[:2]
22 | if tensor.ndim > 2:
23 | receptive_field_size = math.prod(tensor.shape[2:])
24 | fan_in *= receptive_field_size
25 | fan_out *= receptive_field_size
26 | return fan_in, fan_out
27 |
28 |
@no_grad()
def uniform_(tensor: Tensor, a=0., b=1.) -> Tensor:
    """Overwrite ``tensor.data`` with ``xp.random.uniform(a, b)`` samples.

    The existing buffer is written in place; ``tensor`` is returned.
    """
    samples = tensor.xp.random.uniform(a, b, tensor.shape)
    tensor.data[...] = samples
    return tensor
33 |
34 |
@no_grad()
def normal_(tensor: Tensor, mean=0., std=1.) -> Tensor:
    """Overwrite ``tensor.data`` with ``xp.random.normal(mean, std)`` samples.

    The existing buffer is written in place; ``tensor`` is returned.
    """
    samples = tensor.xp.random.normal(mean, std, size=tensor.shape)
    tensor.data[...] = samples
    return tensor
39 |
40 |
@no_grad()
def constant_(tensor: Tensor, val: float) -> Tensor:
    """Set every element of ``tensor.data`` to ``val`` in place.

    Returns ``tensor`` itself.
    """
    buffer = tensor.data
    buffer[...] = val
    return tensor
45 |
46 |
def ones_(tensor: Tensor) -> Tensor:
    """Fill ``tensor`` with ones in place via ``constant_``."""
    filled = constant_(tensor, 1.)
    return filled
49 |
50 |
def zeros_(tensor: Tensor) -> Tensor:
    """Fill ``tensor`` with zeros in place via ``constant_``."""
    filled = constant_(tensor, 0.)
    return filled
53 |
54 |
def xavier_uniform_(tensor: Tensor, gain: float = 1.) -> Tensor:
    """Fill ``tensor`` uniformly in ``[-bound, bound]`` in place.

    ``bound = gain * sqrt(6 / (fan_in + fan_out))`` with the fans taken
    from ``_calculate_fan``.
    """
    fan_total = sum(_calculate_fan(tensor))
    limit = gain * math.sqrt(6. / fan_total)
    return uniform_(tensor, -limit, limit)
59 |
60 |
def xavier_normal_(tensor: Tensor, gain: float = 1.) -> Tensor:
    """Fill ``tensor`` with zero-mean normal samples in place.

    ``std = gain * sqrt(2 / (fan_in + fan_out))`` with the fans taken from
    ``_calculate_fan``.
    """
    fan_total = sum(_calculate_fan(tensor))
    sigma = gain * math.sqrt(2 / fan_total)
    return normal_(tensor, std=sigma)
65 |
66 |
def kaiming_uniform_(tensor: Tensor,
                     a: float = 0.,
                     mode='fan_in',
                     nonlinearity='leaky_relu') -> Tensor:
    """Fill ``tensor`` uniformly in ``[-bound, bound]`` in place.

    ``bound = gain * sqrt(3 / fan)`` where ``fan`` is selected by ``mode``
    from ``_calculate_fan`` and ``gain = calculate_gain(nonlinearity, a)``.

    Args:
        tensor: tensor with at least two axes.
        a: negative slope; only influences the gain for ``'leaky_relu'``.
        mode: ``'fan_in'`` or ``'fan_out'``.
        nonlinearity: name understood by ``calculate_gain``. The default is
            ``'leaky_relu'`` so that ``a`` actually takes effect; with the
            default ``a=0`` the gain equals the ``'relu'`` gain, so calls
            that pass neither argument are unaffected.

    Raises:
        KeyError: for an unknown ``mode`` or ``nonlinearity``.
    """
    fan_in, fan_out = _calculate_fan(tensor)
    fan = {
        "fan_in": fan_in,
        "fan_out": fan_out,
    }[mode]
    gain = calculate_gain(nonlinearity, a)
    bound = gain * math.sqrt(3. / fan)
    return uniform_(tensor, -bound, bound)
79 |
80 |
def kaiming_normal_(tensor: Tensor,
                    a: float = 0.,
                    mode='fan_in',
                    nonlinearity='leaky_relu') -> Tensor:
    """Fill ``tensor`` with zero-mean normal samples in place.

    ``std = gain / sqrt(fan)`` where ``fan`` is selected by ``mode`` from
    ``_calculate_fan`` and ``gain = calculate_gain(nonlinearity, a)``.

    Args:
        tensor: tensor with at least two axes.
        a: negative slope; only influences the gain for ``'leaky_relu'``.
        mode: ``'fan_in'`` or ``'fan_out'``.
        nonlinearity: name understood by ``calculate_gain``. The default is
            ``'leaky_relu'`` so that ``a`` actually takes effect; with the
            default ``a=0`` the gain equals the ``'relu'`` gain, so calls
            that pass neither argument are unaffected.

    Raises:
        KeyError: for an unknown ``mode`` or ``nonlinearity``.
    """
    fan_in, fan_out = _calculate_fan(tensor)
    fan = {
        "fan_in": fan_in,
        "fan_out": fan_out,
    }[mode]
    gain = calculate_gain(nonlinearity, a)
    std = gain / math.sqrt(fan)
    return normal_(tensor, std=std)
93 |
--------------------------------------------------------------------------------
/pydynet/cuda.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import warnings
3 |
# Optional GPU backend: fall back to CPU-only operation when cupy cannot be
# imported. ImportError (the parent of ModuleNotFoundError) is caught so
# that an installed cupy that fails while loading is handled the same way
# as a missing one instead of breaking the import of this package.
try:
    import cupy as cp
    cuda_available: bool = True
except ImportError:
    warnings.warn(
        "Cupy is not installed. You can install it with:\n"
        "    pip install cupy-cuda12x  # or appropriate version for your CUDA",
        category=UserWarning)
    cuda_available: bool = False
    # Placeholder so that the name ``cp`` always exists.
    cp = object()
14 |
15 |
def is_available() -> bool:
    """Report whether the cupy import at module load succeeded."""
    available = cuda_available
    return available
18 |
19 |
def device_count() -> int:
    """Return cupy's device count, or 0 when cupy is unavailable."""
    if not is_available():
        return 0
    return cp.cuda.runtime.getDeviceCount()
25 |
26 |
def current_device() -> int:
    """Return the id reported by ``cp.cuda.runtime.getDevice()``.

    Raises:
        RuntimeError: if cupy is unavailable. Without this check the call
            would fail with an opaque ``AttributeError`` on the ``cp``
            placeholder object.
    """
    if not is_available():
        raise RuntimeError("Cuda device is not supported on this system.")
    return cp.cuda.runtime.getDevice()
29 |
30 |
def set_device(device: int) -> None:
    """Forward ``device`` to ``cp.cuda.runtime.setDevice``.

    Raises:
        RuntimeError: if cupy is unavailable. Without this check the call
            would fail with an opaque ``AttributeError`` on the ``cp``
            placeholder object.
    """
    if not is_available():
        raise RuntimeError("Cuda device is not supported on this system.")
    return cp.cuda.runtime.setDevice(device)
33 |
34 |
class Device:
    """Description of where a tensor lives: the CPU or one CUDA device.

    Accepted ``device`` values:
        * ``None`` or ``"cpu"``: the CPU;
        * ``"cuda"`` (index 0) or ``"cuda:<index>"``;
        * an ``int``: the CUDA device with that index;
        * another ``Device``: copied.

    For CUDA devices ``self.device`` holds a ``cp.cuda.Device`` object and
    ``self.device_id`` its index; for the CPU ``self.device`` is the string
    ``"cpu"`` and ``device_id`` is not set.

    Raises:
        ValueError: for an unknown device string or a non-numeric CUDA id.
        TypeError: for an argument of an unsupported type.
        RuntimeError: if a CUDA device is requested but cupy is unavailable.
    """

    def __init__(self, device=None) -> None:
        # One entry per active ``with`` block on this object: whether that
        # entry actually switched the current CUDA device.
        self._switched = []

        if isinstance(device, str):
            if device == "cpu":
                self.device = "cpu"
            elif device[:4] == "cuda":
                self.device = "cuda"
                if len(device) == 4:
                    device += ':0'

                cuda_id = device.split(':')[-1]
                if not cuda_id.isdigit():
                    raise ValueError(f'Wrong cuda id \"{cuda_id}\"!')

                self.device_id = int(cuda_id)
            else:
                raise ValueError(f"Unknown device \"{device}\"!")

        elif isinstance(device, int):
            self.device = "cuda"
            self.device_id = device

        elif device is None:
            self.device = "cpu"

        elif isinstance(device, Device):
            self.device = device.device
            if self.device != "cpu":
                self.device_id = device.device_id

        else:
            # Previously this fell through and failed later with an
            # AttributeError on the unset ``self.device``.
            raise TypeError(
                "Unsupported device specification of type {}.".format(
                    type(device).__name__))

        if self.device == "cuda":
            if not is_available():
                raise RuntimeError(
                    "Cuda device is not supported on this system.")
            self.device = cp.cuda.Device(self.device_id)
        assert self.device == "cpu" or is_available()

    def __repr__(self) -> str:
        if self.device == "cpu":
            return "Device(type='cpu')"
        else:
            return "Device(type='cuda', index={})".format(self.device_id)

    def __eq__(self, device) -> bool:
        """Compare with another ``Device`` or anything ``Device`` accepts."""
        if not isinstance(device, Device):
            device = Device(device)
        if self.device == "cpu":
            return device.device == "cpu"
        else:
            if device.device == "cpu":
                return False
            return self.device == device.device

    @property
    def xp(self):
        """Array module for this device: numpy on CPU, cupy otherwise."""
        return np if self.device == "cpu" else cp

    def __enter__(self):
        """Make this CUDA device current if it is not already."""
        switch = self.device != "cpu" and self.device_id != current_device()
        result = self.device.__enter__() if switch else None
        # Record the decision only after a successful switch so that
        # __exit__ mirrors exactly what happened here.
        self._switched.append(switch)
        return result

    def __exit__(self, type, value, trace):
        """Undo the switch made by the matching ``__enter__``, if any."""
        # Re-testing ``device_id != current_device()`` here would always be
        # False after a switch, so the previous device would never be
        # restored; rely on the recorded decision instead.
        if self._switched and self._switched.pop():
            return self.device.__exit__(type, value, trace)
100 |
--------------------------------------------------------------------------------
/pydynet/data.py:
--------------------------------------------------------------------------------
1 | from numpy.random import permutation
2 |
3 |
class Dataset:
    """Base class for datasets; subclasses supply indexing and length."""

    def __init__(self) -> None:
        return None

    def __getitem__(self, index):
        """Return the sample(s) at ``index``; must be overridden."""
        raise NotImplementedError

    def __len__(self):
        """Return the number of samples; must be overridden."""
        raise NotImplementedError
14 |
15 |
class Sampler:
    """Base class for index samplers; subclasses supply ``__iter__``."""

    def __init__(self, dataset: Dataset) -> None:
        return None

    def __iter__(self):
        """Yield sample indices; must be overridden."""
        raise NotImplementedError
23 |
24 |
class SequentialSampler(Sampler):
    """Yield the indices ``0 .. len(dataset) - 1`` in order."""

    def __init__(self, dataset: Dataset) -> None:
        self.dataset = dataset

    def __iter__(self):
        size = len(self.dataset)
        return iter(range(size))

    def __len__(self) -> int:
        size = len(self.dataset)
        return size
35 |
36 |
class RandomSampler(Sampler):
    """Yield every index of the dataset once, in a random order."""

    def __init__(self, dataset: Dataset) -> None:
        self.dataset = dataset

    def __iter__(self):
        # A fresh permutation is drawn each time iteration starts.
        shuffled = permutation(len(self.dataset)).tolist()
        for position in shuffled:
            yield position

    def __len__(self):
        size = len(self.dataset)
        return size
47 |
48 |
class BatchSampler(Sampler):
    """Group the indices of another sampler into lists of ``batch_size``.

    A trailing, shorter batch is yielded only when ``drop_last`` is false.
    """

    def __init__(self, sampler: Sampler, batch_size: int,
                 drop_last: bool) -> None:
        self.sampler = sampler
        self.batch_size = batch_size
        self.drop_last = drop_last

    def __iter__(self):
        pending = []
        for index in self.sampler:
            pending.append(index)
            if len(pending) != self.batch_size:
                continue
            yield pending
            pending = []
        if not self.drop_last and len(pending) > 0:
            yield pending

    def __len__(self):
        total = len(self.sampler)
        if not self.drop_last:
            # Ceiling division: the partial last batch counts.
            return (total + self.batch_size - 1) // self.batch_size
        return total // self.batch_size
71 |
72 |
73 | class _DataLoaderIter:
74 |
75 | def __init__(self, loader) -> None:
76 | self.loader = loader
77 | self.sample_iter = iter(self.loader.batch_sampler)
78 |
79 | def __next__(self):
80 | index = next(self.sample_iter)
81 | return self.loader.dataset[index]
82 |
83 |
class DataLoader:
    """Iterate over a dataset in batches of indices.

    Args:
        dataset: object indexed with a list of indices per batch.
        batch_size: number of indices per batch.
        shuffle: use ``RandomSampler`` instead of ``SequentialSampler``.
        drop_last: drop a trailing batch shorter than ``batch_size``.
    """

    def __init__(self,
                 dataset: Dataset,
                 batch_size: int = 1,
                 shuffle: bool = False,
                 drop_last: bool = False) -> None:
        self.dataset = dataset
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.drop_last = drop_last

        if shuffle:
            self.sampler = RandomSampler(dataset)
        else:
            self.sampler = SequentialSampler(dataset)

        self.batch_sampler = BatchSampler(self.sampler, batch_size, drop_last)

    def __iter__(self):
        return _DataLoaderIter(self)

    def __len__(self) -> int:
        """Return the number of batches produced by one pass."""
        return len(self.batch_sampler)
107 |
108 |
def data_loader(X, y, batch_size: int, shuffle: bool = False) -> DataLoader:
    """Wrap paired arrays ``X`` and ``y`` in a ``DataLoader``.

    Each batch is the tuple ``(X[indices], y[indices])``; the dataset
    length is ``len(X)``.
    """

    class TrainSet(Dataset):
        """Dataset pairing ``X`` with ``y`` by index."""

        def __init__(self, X, y) -> None:
            self.data = X
            self.target = y

        def __getitem__(self, index):
            return self.data[index], self.target[index]

        def __len__(self):
            return len(self.data)

    return DataLoader(TrainSet(X, y), batch_size, shuffle)
124 |
--------------------------------------------------------------------------------
/llm/llama/infer.py:
--------------------------------------------------------------------------------
1 | import sys, time, argparse
2 | from .tokenizer import Tokenizer
3 | from .model import Llama
4 |
5 | import pydynet as pdn
6 | import numpy as np
7 |
8 |
@pdn.no_grad()
def load_model(llama: Llama, model_path: str) -> Llama:
    """Copy checkpoint weights from ``model_path`` into ``llama`` in place.

    The archive is read with ``np.load`` and indexed by parameter name.
    Projection matrices (and ``lm_head``) are transposed before being
    copied; norm and embedding weights are copied as stored.

    Returns:
        The same ``llama`` object.
    """
    # The context manager closes the archive once every array is copied.
    with np.load(model_path) as weight:
        llama.tok_embedding.weight.data[...] = weight[
            'model.embed_tokens.weight']
        llama.lm_head.weight.data[...] = weight['lm_head.weight'].T

        for i in range(llama.n_layers):
            layer = llama.layers[i]
            prefix = f'model.layers.{i}'

            attention = layer.attention
            attention.Q.weight.data[...] = weight[
                f'{prefix}.self_attn.q_proj.weight'].T
            attention.K.weight.data[...] = weight[
                f'{prefix}.self_attn.k_proj.weight'].T
            attention.V.weight.data[...] = weight[
                f'{prefix}.self_attn.v_proj.weight'].T
            attention.O.weight.data[...] = weight[
                f'{prefix}.self_attn.o_proj.weight'].T

            ffn = layer.ffn
            ffn.up.weight.data[...] = weight[f'{prefix}.mlp.up_proj.weight'].T
            ffn.gate.weight.data[...] = weight[
                f'{prefix}.mlp.gate_proj.weight'].T
            # Written through ``.data`` like every other parameter (the
            # original assigned to ``weight[...]`` directly here).
            ffn.down.weight.data[...] = weight[
                f'{prefix}.mlp.down_proj.weight'].T

            layer.input_norm.weight.data[...] = weight[
                f'{prefix}.input_layernorm.weight']
            layer.post_attn_norm.weight.data[...] = weight[
                f'{prefix}.post_attention_layernorm.weight']

        llama.norm.weight.data[...] = weight['model.norm.weight']

    return llama
42 |
43 |
# Command-line entry point: load the checkpoint, then stream generated text
# for the given prompt and report throughput.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Prompt input, e.g. There was a boy")
    parser.add_argument("--prompt", type=str, default='There was a boy')
    parser.add_argument("--cuda", action='store_true')
    args = parser.parse_args()

    dim: int = 288  # D
    n_layers: int = 6
    n_heads: int = 6
    vocab_size: int = 32000  # VS
    max_seq_len: int = 1024  # M
    max_new_tokens: int = 1024
    max_batch_size: int = 1
    datatype = np.float32

    tokenizer = Tokenizer("llm/llama/data/tokenizer.model.np")
    model = load_model(
        Llama(vocab_size,
              dim,
              n_heads,
              768,
              max_seq_len,
              max_batch_size,
              n_layers,
              dtype=datatype), "llm/llama/data/stories15M.model.npz")

    # Move to the GPU only when requested and available. Plain 'cuda'
    # selects device 0, so this also works on single-GPU machines (the
    # index was previously hard-coded to 2).
    if args.cuda and pdn.cuda.is_available():
        model: Llama = model.to('cuda')

    model.eval()
    with pdn.no_grad():
        print(f"\n{args.prompt}", end="")
        input_ids = np.array([tokenizer.encode(args.prompt)])

        # L starts at the prompt length and grows by one per generated step.
        _, L = input_ids.shape
        start = time.time()
        for token_ids in model.generate(input_ids, max_new_tokens):
            L += 1
            output_id = token_ids[0].numpy().tolist()

            if output_id[-1] in [tokenizer.eos_id, tokenizer.bos_id]:
                break
            print(tokenizer.decode(output_id), end="")
            sys.stdout.flush()
        elapsed = time.time() - start
        print(
            f"\n\nToken count: {L}, elapsed: {elapsed:.2f}s, {round(L / elapsed)} tokens/s"
        )
94 |
--------------------------------------------------------------------------------
/examples/pytorch/ts_prediction.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn.model_selection import train_test_split
4 | from tqdm import tqdm
5 |
6 | import torch
7 | import pydynet.nn as nn
8 | from pydynet.optim import Adam
9 |
10 |
# NOTE: this is the PyTorch counterpart of examples/pydynet/ts_prediction.py.
# Model, loss and optimizer are taken from ``torch`` explicitly; the
# previous version built them from ``pydynet`` and fed them torch tensors.


def windowize(y, input_len, horizon=1, stride=1, step=1):
    """Cut a 1-D series into (input window, target window) pairs.

    Args:
        y: the series.
        input_len: number of samples per input window.
        horizon: number of samples per target window.
        stride: distance between the starts of consecutive windows.
        step: distance between consecutive samples inside a window.

    Returns:
        ``(X, Y)`` as float32 torch tensors with shapes
        ``(n_windows, input_len, 1)`` and ``(n_windows, horizon)``.
    """
    y = np.asarray(y)
    max_i = len(y) - (input_len + horizon) * step + step
    idx_inputs = []
    idx_targets = []
    for i in range(0, max_i, stride):
        inp_idx = i + np.arange(0, input_len * step, step)
        tgt_idx = i + input_len * step + np.arange(0, horizon * step, step)
        idx_inputs.append(inp_idx)
        idx_targets.append(tgt_idx)
    X = y[np.array(idx_inputs)]
    Y = y[np.array(idx_targets)]
    # float32 matches the default parameter dtype of torch.nn modules; the
    # numpy series itself is float64.
    return (
        torch.tensor(X[..., np.newaxis], dtype=torch.float32),
        torch.tensor(Y, dtype=torch.float32),
    )


TIME_STEP = 40  # number of time steps fed to the RNN
INPUT_SIZE = 1  # input feature size of the RNN
H_SIZE = 32  # number of hidden units of the RNN
EPOCHS = 50  # total number of training epochs
h_state = None  # initial hidden state


def f(t):
    """Target signal: sin(pi t) + cos(2 pi t) / 2."""
    return np.sin(np.pi * t) + 0.5 * np.cos(2 * np.pi * t)


steps = np.arange(0, 100, .05)
X, Y = windowize(f(steps), input_len=TIME_STEP, horizon=1, stride=1, step=1)

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)


class RNN(torch.nn.Module):
    """Single-layer GRU followed by a linear read-out of the last state."""

    def __init__(self):
        super(RNN, self).__init__()
        self.rnn = torch.nn.GRU(
            input_size=INPUT_SIZE,
            hidden_size=H_SIZE,
            num_layers=1,
            batch_first=True,
        )
        self.out = torch.nn.Linear(H_SIZE, 1)

    def forward(self, x, h_state):
        _, h_state = self.rnn(x, h_state)
        # torch returns h_n as (num_layers, batch, hidden) regardless of
        # batch_first, so the last layer is selected on the first axis.
        out = self.out(h_state[self.rnn.num_layers - 1])
        return out


rnn = RNN()
optimizer = torch.optim.Adam(rnn.parameters(), lr=0.01)
criterion = torch.nn.MSELoss()

loss_list = []

plt.rcParams['font.sans-serif'] = ['Times New Roman']
plt.rcParams['mathtext.fontset'] = 'stix'
plt.rcParams['xtick.direction'] = 'in'
plt.rcParams['ytick.direction'] = 'in'
plt.rcParams['axes.linewidth'] = 0.5

bar = tqdm(range(EPOCHS))
visual_steps = np.arange(0, 10, .05)
visual_X, visual_Y = windowize(f(visual_steps),
                               TIME_STEP,
                               horizon=1,
                               stride=1,
                               step=1)

for step in bar:

    rnn.train()
    prediction = rnn(X_train, h_state)
    train_loss = criterion(prediction, Y_train)

    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    plt.figure(figsize=(5, 3))
    plt.grid()

    rnn.eval()
    # Evaluate without gradient tracking so the outputs can be converted
    # with ``.numpy()`` directly.
    with torch.no_grad():
        test_loss = criterion(rnn(X_test, h_state), Y_test)
        visual_prediction = rnn(visual_X, h_state).numpy()

    plt.plot(visual_steps[TIME_STEP:],
             visual_Y.numpy(),
             'r-',
             lw=0.7,
             label=r'$f(x)=\sin(\pi x)+\cos(2\pi x)/2$')
    plt.plot(
        visual_steps[TIME_STEP:],
        visual_prediction,
        'b-.',
        lw=0.7,
        label='Prediction',
    )

    plt.xticks([4, 6, 8, 10])
    plt.yticks([-1.6, -.8, 0, .8])

    plt.legend(loc=1)
    plt.ylim(-1.6, 0.8)
    plt.xlim(visual_steps[TIME_STEP], 10)
    plt.title('Prediction with GRU')
    plt.tight_layout()
    # The figure is overwritten every epoch with the latest prediction.
    plt.savefig("imgs/rnn.png")
    plt.close()

    bar.set_postfix(
        train_loss="{:.5f}".format(train_loss.item()),
        test_loss="{:.5f}".format(test_loss.item()),
    )
136 |
--------------------------------------------------------------------------------
/pydynet/nn/modules/conv.py:
--------------------------------------------------------------------------------
1 | from .module import Module
2 | from ..parameter import Parameter
3 | from .. import init
4 | from .. import functional as F
5 | from ...special import empty
6 | from ...cuda import Device
7 |
8 | import math
9 |
10 |
class Conv1d(Module):
    """1-D convolution evaluating ``F.conv1d`` plus an optional bias.

    The weight has shape ``(out_channels, in_channels, kernel_size)`` and
    the bias ``(1, out_channels, 1)``, which is added to the convolution
    output by broadcasting.

    Args:
        in_channels: second weight axis.
        out_channels: first weight axis.
        kernel_size: length of the kernel.
        stride: forwarded to ``F.conv1d``.
        padding: forwarded to ``F.conv1d``.
        bias: when ``False`` no bias parameter is created.
        device: anything accepted by ``Device``.
        dtype: dtype forwarded to ``empty``.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        padding: int = 0,
        bias: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        super().__init__()
        kwargs = {"device": Device(device), "dtype": dtype}
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.padding = padding
        self.stride = stride
        self.weight = Parameter(
            empty((self.out_channels, self.in_channels, self.kernel_size),
                  **kwargs))
        self.bias = Parameter(empty(
            (1, self.out_channels, 1), **kwargs)) if bias else None
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise the weight and, if present, the bias in place."""
        # init._calculate_fan reports axis 0 as "fan_in" (the Linear
        # layout). The conv weight is (out_channels, in_channels, kernel),
        # so the inputs per output unit, in_channels * kernel, are what the
        # helper calls "fan_out"; select that one explicitly.
        init.kaiming_uniform_(self.weight, a=math.sqrt(5), mode='fan_out')
        if self.bias is not None:
            _, fan_in = init._calculate_fan(self.weight)
            if fan_in != 0:
                bound = 1 / math.sqrt(fan_in)
                init.uniform_(self.bias, -bound, bound)

    def forward(self, x):
        conv1d = F.conv1d(x, self.weight, self.padding, self.stride)
        if self.bias is not None:
            return conv1d + self.bias
        return conv1d

    def __repr__(self) -> str:
        return "{}(in_channels={}, out_channels={}, kernel_size={}, padding={}, stride={}, bias={})".format(
            self.__class__.__name__,
            self.in_channels,
            self.out_channels,
            self.kernel_size,
            self.padding,
            self.stride,
            self.bias is not None,
        )
62 |
63 |
class Conv2d(Module):
    """2-D convolution evaluating ``F.conv2d`` plus an optional bias.

    The weight has shape ``(out_channels, in_channels, kernel_size,
    kernel_size)`` and the bias ``(1, out_channels, 1, 1)``, which is added
    to the convolution output by broadcasting.

    Args:
        in_channels: second weight axis.
        out_channels: first weight axis.
        kernel_size: side length of the square kernel.
        stride: forwarded to ``F.conv2d``.
        padding: forwarded to ``F.conv2d``.
        bias: when ``False`` no bias parameter is created.
        device: anything accepted by ``Device``.
        dtype: dtype forwarded to ``empty``.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        padding: int = 0,
        bias: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        super().__init__()
        kwargs = {"device": Device(device), "dtype": dtype}
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.padding = padding
        self.stride = stride
        self.weight = Parameter(
            empty((self.out_channels, self.in_channels, self.kernel_size,
                   self.kernel_size), **kwargs))
        self.bias = Parameter(empty(
            (1, self.out_channels, 1, 1), **kwargs)) if bias else None
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise the weight and, if present, the bias in place."""
        # init._calculate_fan reports axis 0 as "fan_in" (the Linear
        # layout). The conv weight is (out_channels, in_channels, k, k), so
        # the inputs per output unit, in_channels * k * k, are what the
        # helper calls "fan_out"; select that one explicitly.
        init.kaiming_uniform_(self.weight, a=math.sqrt(5), mode='fan_out')
        if self.bias is not None:
            _, fan_in = init._calculate_fan(self.weight)
            if fan_in != 0:
                bound = 1 / math.sqrt(fan_in)
                init.uniform_(self.bias, -bound, bound)

    def forward(self, x):
        conv2d = F.conv2d(x, self.weight, self.padding, self.stride)
        if self.bias is not None:
            return conv2d + self.bias
        return conv2d

    def __repr__(self) -> str:
        return "{}(in_channels={}, out_channels={}, kernel_size={}, padding={}, stride={}, bias={})".format(
            self.__class__.__name__,
            self.in_channels,
            self.out_channels,
            self.kernel_size,
            self.padding,
            self.stride,
            self.bias is not None,
        )
115 |
--------------------------------------------------------------------------------
/examples/pydynet/ts_prediction.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn.model_selection import train_test_split
4 | from tqdm import tqdm
5 |
6 | import pydynet as pdn
7 | from pydynet import Tensor
8 | import pydynet.nn as nn
9 | from pydynet.optim import Adam
10 |
11 |
def windowize(y, input_len, horizon=1, stride=1, step=1):
    """Cut a 1-D series into (input window, target window) pairs.

    Windows start every ``stride`` samples; inside a window consecutive
    samples are ``step`` apart. Each input window holds ``input_len``
    samples and is followed by a target window of ``horizon`` samples.

    Returns:
        ``(X, Y)`` as float32 ``Tensor`` objects; ``X`` carries one extra
        trailing axis of size 1.
    """
    series = np.asarray(y)
    n_starts = len(series) - (input_len + horizon) * step + step
    windows = [
        (
            start + np.arange(0, input_len * step, step),
            start + input_len * step + np.arange(0, horizon * step, step),
        )
        for start in range(0, n_starts, stride)
    ]
    input_rows = np.array([inp for inp, _ in windows])
    target_rows = np.array([tgt for _, tgt in windows])
    inputs = series[input_rows]
    targets = series[target_rows]
    return (
        Tensor(inputs[..., np.newaxis], dtype=np.float32),
        Tensor(targets, dtype=np.float32),
    )
29 |
30 |
TIME_STEP = 40  # number of time steps fed to the RNN
INPUT_SIZE = 1  # input feature size of the RNN
H_SIZE = 32  # number of hidden units of the RNN
EPOCHS = 50  # total number of training epochs
h_state = None  # initial hidden state
36 |
37 |
def f(t):
    """Target signal: sin(pi t) + cos(2 pi t) / 2."""
    fundamental = np.sin(np.pi * t)
    harmonic = 0.5 * np.cos(2 * np.pi * t)
    return fundamental + harmonic
40 |
41 |
# Sample the signal on [0, 100) every 0.05 and cut it into windows of
# TIME_STEP inputs followed by one target value.
steps = np.arange(0, 100, .05)
X, Y = windowize(f(steps), input_len=TIME_STEP, horizon=1, stride=1, step=1)

# Hold out 20% of the windows for testing; the seed is fixed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)
51 |
52 |
class RNN(nn.Module):
    """Single-layer GRU followed by a linear read-out of the hidden state."""

    def __init__(self):
        super().__init__()
        gru_config = dict(
            input_size=INPUT_SIZE,
            hidden_size=H_SIZE,
            num_layers=1,
            batch_first=True,
            dtype=np.float32,
        )
        self.rnn = nn.GRU(**gru_config)
        self.out = nn.Linear(H_SIZE, 1, dtype=np.float32)

    def forward(self, x, h_state):
        _, hidden = self.rnn(x, h_state)
        # Read out the state of the last GRU layer.
        last_layer = self.rnn.num_layers - 1
        return self.out(hidden[:, last_layer, :])
70 |
71 |
# Model, optimiser and loss.
rnn = RNN()
optimizer = Adam(rnn.parameters(), lr=0.01)
criterion = nn.MSELoss()

loss_list = []

# Matplotlib appearance.
plt.rcParams['font.sans-serif'] = ['Times New Roman']
plt.rcParams['mathtext.fontset'] = 'stix'
plt.rcParams['xtick.direction'] = 'in'
plt.rcParams['ytick.direction'] = 'in'
plt.rcParams['axes.linewidth'] = 0.5

bar = tqdm(range(EPOCHS))

# A shorter stretch of the signal, used only for the figure.
visual_steps = np.arange(0, 10, .05)
visual_X, visual_Y = windowize(f(visual_steps),
                               TIME_STEP,
                               horizon=1,
                               stride=1,
                               step=1)
plot_times = visual_steps[TIME_STEP:]

for step in bar:

    # One full-batch optimisation step.
    rnn.train()
    train_loss = criterion(rnn(X_train, h_state), Y_train)

    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    plt.figure(figsize=(5, 3))
    plt.grid()

    # Evaluation and prediction for the figure.
    # NOTE(review): the source indentation was flattened; the plotting
    # calls are assumed to sit inside the no_grad block - confirm.
    rnn.eval()
    with pdn.no_grad():
        test_loss = criterion(rnn(X_test, h_state), Y_test)

        plt.plot(plot_times,
                 visual_Y.numpy(),
                 'r-',
                 lw=0.7,
                 label=r'$f(x)=\sin(\pi x)+\cos(2\pi x)/2$')
        plt.plot(
            plot_times,
            rnn(visual_X, h_state).numpy(),
            'b-.',
            lw=0.7,
            label='Prediction',
        )

        plt.xticks([4, 6, 8, 10])
        plt.yticks([-1.6, -.8, 0, .8])

        plt.legend(loc=1)
        plt.ylim(-1.6, 0.8)
        plt.xlim(visual_steps[TIME_STEP], 10)
        plt.title('Prediction with GRU')
        plt.tight_layout()
        # The figure is overwritten every epoch.
        plt.savefig("imgs/rnn.png")
        plt.close()

    bar.set_postfix(
        train_loss="{:.5f}".format(train_loss.item()),
        test_loss="{:.5f}".format(test_loss.item()),
    )
137 |
--------------------------------------------------------------------------------
/pydynet/nn/modules/module.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 |
3 | from ..parameter import Parameter
4 | from ...core import Tensor
5 | from ...autograd import set_grad_enabled
6 | from ...cuda import Device, current_device
7 |
8 |
class Module:
    """Base class of all layers and models.

    Assigning a ``Parameter`` to an attribute registers it in
    ``self._parameters`` under the attribute name. Assigning another
    ``Module`` copies the parameters it holds at that moment, under
    ``"<attribute>.<name>"`` keys.
    """

    def __init__(self) -> None:
        self._train = True
        self.device = Device("cpu")
        self._parameters = OrderedDict()

    def __call__(self, *x, **kwargs) -> Tensor:
        """Forward positional and keyword arguments to ``forward``."""
        return self.forward(*x, **kwargs)

    def __setattr__(self, name: str, value) -> None:
        self.__dict__[name] = value
        if isinstance(value, Parameter):
            self._parameters[name] = value
        if isinstance(value, Module):
            # Only parameters the child already owns are registered here.
            for key in value._parameters:
                self._parameters[name + "." + key] = value._parameters[key]

    def __repr__(self) -> str:
        module_list = [
            module for module in self.__dict__.items()
            if isinstance(module[1], Module)
        ]
        return "{}(\n{}\n)".format(
            self.__class__.__name__,
            "\n".join([
                "{:>10} : {}".format(module_name, module)
                for module_name, module in module_list
            ]),
        )

    def parameters(self):
        """Yield the registered parameters that require gradients."""
        for param in self._parameters.values():
            if param.requires_grad:
                yield param

    def train(self, mode: bool = True):
        """Switch this module and its sub-modules to training/eval mode.

        Also calls ``set_grad_enabled(mode)``. Returns ``self`` so that
        calls can be chained.
        """
        set_grad_enabled(mode)
        self.set_module_state(mode)
        return self

    def set_module_state(self, mode: bool):
        """Set ``_train`` on this module and every module attribute."""
        self._train = mode
        for module in self.__dict__.values():
            if isinstance(module, Module):
                module.set_module_state(mode)

    def forward(self, x: Tensor) -> Tensor:
        raise NotImplementedError

    def eval(self):
        """Shorthand for ``train(False)``; returns ``self``."""
        return self.train(False)

    def to(self, device):
        """Move the module to ``device`` if it is not already there."""
        if not isinstance(device, Device):
            device = Device(device)
        if self.device != device:
            self.move(device)
        return self

    def move(self, device):
        """Record ``device`` and call ``move``/``to`` on every attribute
        that is a ``Module``/``Parameter``."""
        self.device = device
        for module in self.__dict__.values():
            if isinstance(module, Module):
                module.move(device)
            if isinstance(module, Parameter):
                module.to(device)

    def cuda(self):
        """Move to the CUDA device reported by ``current_device()``."""
        return self.to(current_device())

    def cpu(self):
        """Move to the CPU."""
        return self.to('cpu')
81 |
82 |
class Sequential(Module):
    """Chain of modules applied one after another.

    Accepts either the modules as positional arguments (registered under
    ``"0"``, ``"1"``, ...) or a single ``OrderedDict`` mapping names to
    modules.
    """

    def __init__(self, *args) -> None:
        super().__init__()
        self.module_list = []
        named = len(args) == 1 and isinstance(args[0], OrderedDict)
        if named:
            entries = args[0].items()
        else:
            entries = ((str(position), module)
                       for position, module in enumerate(args))
        for label, module in entries:
            self.__setattr__(label, module)
            self.module_list.append(module)

    def forward(self, x: Tensor) -> Tensor:
        out = x
        for stage in self.module_list:
            out = stage(out)
        return out

    def __len__(self):
        count = len(self.module_list)
        return count
104 |
105 |
class ModuleList(Module):
    """List-like container that registers its modules as attributes.

    Every module is stored in ``self.module_list`` and also set as the
    attribute named after its position (``"0"``, ``"1"``, ...).

    Args:
        module_list: optional iterable of modules. Its items are copied
            into a new list, so the caller's object is never mutated by
            ``append``.
    """

    def __init__(self, module_list: list = None) -> None:
        super().__init__()
        self.module_list = []
        for module in (module_list if module_list is not None else ()):
            self.append(module)

    def __getitem__(self, index):
        return self.module_list[index]

    def __iter__(self):
        return iter(self.module_list)

    def __len__(self):
        return len(self.module_list)

    def append(self, module):
        """Add ``module`` at the end and register it under its index."""
        self.module_list.append(module)
        self.__setattr__(str(len(self.module_list) - 1), module)

    def index(self, module):
        """Return the position of ``module`` (``list.index`` semantics)."""
        return self.module_list.index(module)
127 |
--------------------------------------------------------------------------------
/tests/test_tensor_basic.py:
--------------------------------------------------------------------------------
1 | import sys, pytest, random
2 | import numpy as np
3 | from itertools import product
4 |
5 | sys.path.append('../pydynet')
6 |
7 | import pydynet as pdn
8 |
# Fix both random sources so the generated test cases are reproducible.
np.random.seed(0)
random.seed(0)

# Floating-point dtypes the random operands are cast to.
type_list = [np.float16, np.float32, np.float64]
13 |
14 |
def matmul_shape_pair(max_dim=4, max_size=5):
    """Return two random shapes that are valid operands of ``matmul``.

    Up to ``max_dim`` broadcast-compatible batch axes are generated, then
    ``(m, n)`` and ``(n, p)`` are appended. Some leading batch axes of the
    first shape may be dropped; its last two axes are always kept.
    """
    batch_ndim = random.randint(0, max_dim)

    left, right = [], []
    for _ in range(batch_ndim):
        if random.random() < 0.5:
            # With probability 0.5 one side is 1 so the axis can broadcast.
            dim_l, dim_r = random.choice([(1, random.randint(1, max_size)),
                                          (random.randint(1, max_size), 1)])
        else:
            # Otherwise both sides share the same extent.
            dim_l = dim_r = random.randint(1, max_size)
        left.append(dim_l)
        right.append(dim_r)

    m = random.randint(1, max_size)
    n = random.randint(1, max_size)
    p = random.randint(1, max_size)

    left_shape = (*left, m, n)
    right_shape = (*right, n, p)

    # Drop a random number of leading batch axes from the first operand.
    first_kept = random.randint(0, len(left_shape) - 2)
    return left_shape[first_kept:], right_shape
43 |
44 |
def broadcastable_shape_pair(max_dim=4, max_size=5):
    """Return two random shapes that broadcast against each other.

    Up to ``max_dim`` axes are generated; a random number of leading axes
    of the first shape may then be dropped.
    """
    ndim = random.randint(0, max_dim)  # random number of axes
    left, right = [], []
    for _ in range(ndim):
        if random.random() < 0.5:
            # With probability 0.5 one side is 1 so the axis can broadcast.
            dim_l, dim_r = random.choice([(1, random.randint(1, max_size)),
                                          (random.randint(1, max_size), 1)])
        else:
            # Otherwise both sides share the same extent.
            dim_l = dim_r = random.randint(1, max_size)
        left.append(dim_l)
        right.append(dim_r)
    left_shape, right_shape = tuple(left), tuple(right)

    # Randomly drop leading axes of the first shape.
    first_kept = random.randint(0, len(left_shape))
    return left_shape[first_kept:], right_shape
65 |
66 |
def array_pair_generator(pair_gen_func,
                         max_dim=4,
                         max_size=5,
                         n_iter=4,
                         seed=None):
    """Yield pairs of random arrays shaped by ``pair_gen_func``.

    Each array holds standard-normal samples cast to a dtype drawn from
    ``type_list``. ``n_iter`` pairs are produced; ``None`` means endless.
    """
    rng = np.random.default_rng(seed)
    produced = 0
    while True:
        if n_iter is not None and not produced < n_iter:
            return
        shape_a, shape_b = pair_gen_func(max_dim, max_size)
        first = rng.standard_normal(size=shape_a).astype(
            rng.choice(type_list))
        second = rng.standard_normal(size=shape_b).astype(
            rng.choice(type_list))
        yield first, second
        produced += 1
80 |
81 |
# Eight broadcastable operand pairs, each combined with every
# (pydynet function, numpy reference) pair below.
test_list = array_pair_generator(broadcastable_shape_pair, 4, 5, 8, seed=42)
func_list = [(pdn.add, np.add), (pdn.sub, np.subtract), (pdn.mul, np.multiply),
             (pdn.div, np.divide), (pdn.pow, np.power),
             (pdn.maximum, np.maximum), (pdn.minimum, np.minimum)]
# Flatten to (operand1, operand2, pdn_func, np_func) tuples for parametrize.
test_list = [(*array, *funcs)
             for (array, funcs) in product(test_list, func_list)]
88 |
89 |
@pytest.mark.parametrize("operand1, operand2, pdn_func, np_func", test_list)
@pytest.mark.filterwarnings("ignore:invalid value")
@pytest.mark.filterwarnings("ignore:divide by zero")
def test_binary_operator(operand1: np.ndarray, operand2: np.ndarray,
                         pdn_func: callable, np_func: callable):
    """A pydynet binary operator must agree with its numpy reference in
    shape, dtype and values (NaNs compare equal)."""
    lhs = pdn.Tensor(operand1)
    rhs = pdn.Tensor(operand2)
    actual: pdn.Tensor = pdn_func(lhs, rhs)
    expected: np.ndarray = np_func(operand1, operand2)
    assert actual.shape == expected.shape
    assert actual.dtype == expected.dtype
    assert np.allclose(actual.data, expected, equal_nan=True)
101 |
102 |
# Eight operand pairs with matmul-compatible shapes.
test_list = array_pair_generator(matmul_shape_pair, 4, 5, 8, seed=42)
104 |
105 |
@pytest.mark.parametrize("operand1, operand2", test_list)
def test_matmul(operand1: np.ndarray, operand2: np.ndarray):
    """``pdn.matmul`` must agree with ``np.matmul`` in shape, dtype and
    values (NaNs compare equal)."""
    lhs = pdn.Tensor(operand1)
    rhs = pdn.Tensor(operand2)
    actual: pdn.Tensor = pdn.matmul(lhs, rhs)
    expected: np.ndarray = np.matmul(operand1, operand2)
    assert actual.shape == expected.shape
    assert actual.dtype == expected.dtype
    assert np.allclose(actual.data, expected, equal_nan=True)
114 |
115 |
--------------------------------------------------------------------------------
/cnREADME.md:
--------------------------------------------------------------------------------
1 | # PyDyNet:NumPy-based Dynamic Deep Learning Framework
2 |
3 | **PyDyNet已被多个技术公众号和社区分享**:[居然用Numpy实现了一个深度学习框架](https://segmentfault.com/a/1190000042108301).
4 |
5 | [](https://pepy.tech/project/pydynet)
6 | [](https://pepy.tech/project/pydynet)
7 | 
8 | 
9 | 
10 | 
11 |
12 | ## Towards Large Language Model
13 |
**2025.8.12**: 实现了纯推理的llama3 (6-layer Transformer, vocab-size=32000). 参考了[这里](https://github.com/likejazz/llama3.np)的NumPy实现和数据集. 将数据集下载到`llm/llama/data`文件夹后, 在仓库根目录运行:
15 |
16 | ```bash
>>> python -m llm.llama.infer
18 | There was a boy named Timmy. He loved to play with hi toy and run around outside. One day, Timmy' mom asked him to help her with the laundry. Timmy didn't want to help because he wanted to play. But hi mom said, "Timmy, you need to help me. It' important to help out."
19 | Timmy didn't want to help, but he knew he had to. So, he put on hi shoe and went outside to help hi mom. A they were folding the clothe, Timmy saw a big pile of laundry on the floor. He wanted to help, so he started to pick it up. But then, he accidentally knocked over a pile of clothe and they fell on him. Timmy wa okay, but he felt bad.
20 | Hi mom saw what happened and said, "Timmy, you need to be more careful. You could have hurt yourself." Timmy felt bad and said sorry. Hi mom hugged him and said, "It' okay, accident happen. Let' clean up the laundry together." Timmy learned that it' important to be careful and help out when you need it.
21 |
22 | Token count: 262, elapsed: 0.87s, 300 tokens/s
23 | ```
24 |
25 | ## Overview
26 |
27 | PyDyNet也是纯NumPy(0.0.7版本后加入CuPy,其用法和NumPy一致)实现的神经网络,语法受PyTorch的启发,大致结构如下:
28 |
29 | ```mermaid
30 | graph LR
31 | N(numpy/cupy.ndarray)--Backend--> A(Tensor) --> ds(Dataset) ---> Data(DataLoader)---> Mission
32 | A --Eager execution--> B(Basic operators:<br> add, exp, etc)
33 | B -.Autograd-.-> A
34 |
35 | B --> CO(Complex<br>operators)
36 | --> f(Function:<br> img2col, etc)
37 | --> M(Basic Module:<br> Linear, etc)
38 | --> CM(Advanced Module: CNN, RNN, Transformer, etc)
39 | --> Mission(Learning task)
40 | A --> GD(Optimizer:<br> SGD, Adam, etc) ---> LS(lr_scheduler:<br> StepLR, etc)---> Mission
41 | ```
42 |
43 | 虚线表示用户可以通过`no_grad`来关闭自动微分功能.
44 |
45 | ## Install
46 |
47 | ```bash
48 | git clone https://github.com/Kaslanarian/PyDyNet
49 | cd PyDyNet
50 | python setup.py install
51 | ```
52 |
53 | ## Example
54 |
55 | [examples/pydynet](./examples/pydynet)中是一些例子,[examples/pytorch](./examples/pytorch)给出等价的pytorch实现. 运行`python -m examples.pydynet.xxx`即可:
56 |
57 | ### AutoDiff
58 |
59 | [autograd1d.py](examples/pydynet/autograd1d.py)利用自动微分,对一个一维凸函数进行梯度下降:
60 |
61 |
62 |
63 | 以及一个多元凸函数的例子: [autograd2d.py](examples/pydynet/autograd2d.py)
64 |
65 |
66 |
67 | ### MLP & LeNet
68 |
69 | [mnist.py](examples/pydynet/mnist.py)使用MLP和LeNet对MNIST进行分类. 训练准确率和测试准确率:
70 |
71 |
72 |
73 | ### Dropout & BN
74 |
75 | [dropout_bn.py](examples/pydynet/dropout_bn.py)使用三种网络对`fetch_olivetti_faces`人脸(64×64)数据集进行分类并进行性能对比:
76 |
77 | 1. 三层MLP;
78 | 2. 三层MLP + Dropout;
79 | 3. 三层MLP + BatchNormalization.
80 |
81 | 学习效果对比:
82 |
83 |
84 |
85 | ### RNN
86 |
87 | [ts_prediction.py](examples/pydynet/ts_prediction.py)中是一个用GRU做时序预测的例子:
88 |
89 |
90 |
91 | ### Transformer
92 |
93 | [transformer.py](examples/pydynet/transformer.py)中是一个用Transformer训练文本分类模型的例子. 训练结果:
94 |
95 |
96 |
97 | > 数据集 (CoLA) 链接:
98 |
99 | ## cuda加速
100 |
101 | 在训练batch size为256, 测试batch size为1024情况下,模型在CPU和GPU上的训练速度比较:
102 |
103 | | Network structure | Dataset | CPU time (s) per epoch | GPU time (s) per epoch |
104 | | :-----------------: | :---------------: | :--------------------: | :--------------------: |
105 | | 3-layer MLP | MNIST (80000×574) | 7.256±0.138 | 1.203±.0181 |
106 | | LeNet | MNIST (80000×574) | 239.664±2.108 | 2.841±0.026 |
107 | | 1-layer Transformer (dim=512, head=4) | CoLA (8551×45×64) | 17.503±0.251 | 1.075±0.002 |
108 |
109 | 设备: Nvidia GeForce RTX 4090.
110 |
--------------------------------------------------------------------------------
/pydynet/optim/lr_scheduler.py:
--------------------------------------------------------------------------------
1 | '''Learning-rate schedulers. Currently implemented:\n
2 | - ExponentialLR;\n
3 | - StepLR;\n
4 | - MultiStepLR;\n
5 | - CosineAnnealingLR.\n
6 | '''
7 |
8 | from typing import List
9 | from .optimizer import Optimizer
10 | import weakref
11 | from functools import wraps
12 | from collections import Counter
13 | from math import cos, pi
14 |
15 |
16 | class _LRScheduler:
17 | def __init__(self, optimizer: Optimizer, last_epoch: int = -1) -> None:
18 | self.optimizer = optimizer
19 | self.last_epoch = last_epoch
20 |
21 | if self.last_epoch == -1:
22 | self.optimizer.initial_lr = self.optimizer.lr
23 | else:
24 | assert hasattr(
25 | self.optimizer, "initial_lr"
26 | ), "last_epoch=1 but no 'initial_lr' attribute in optimizer!"
27 |
28 | def with_counter(method):
29 | if getattr(method, '_with_counter', False):
30 | # `optimizer.step()` has already been replaced, return.
31 | return method
32 |
33 | # 建立一个method的弱引用。弱引用不增加对象的引用计数,只存在弱引用的对象是可被垃圾回收的;
34 | # 弱引用可以解决循环引用的问题。
35 | instance_ref = weakref.ref(method.__self__)
36 | # Get the unbound method for the same purpose.
37 | func = method.__func__ # __func__是method的底层实现,不跟具体的实例绑定
38 | cls = instance_ref().__class__ # method的所属类
39 | del method
40 |
41 | @wraps(func)
42 | def wrapper(*args, **kwargs):
43 | instance = instance_ref()
44 | instance._step_count += 1
45 | wrapped = func.__get__(instance, cls)
46 | return wrapped(*args, **kwargs)
47 |
48 | # Note that the returned function here is no longer a bound method,
49 | # so attributes like `__func__` and `__self__` no longer exist.
50 | wrapper._with_counter = True
51 | return wrapper
52 |
53 | # 通过装饰器来为optimizer.step添加计数功能,并初始化计数器
54 | self.optimizer.step = with_counter(self.optimizer.step)
55 | self.optimizer._step_count = 0
56 | self._step_count = 0
57 |
58 | self.step()
59 |
60 | def step(self):
61 | self._step_count += 1 # lr_scheduler的step计数
62 |
63 | # 支持上下文管理器协议的类
64 | class _enable_get_lr_call:
65 | def __init__(self, o):
66 | self.o = o
67 |
68 | def __enter__(self):
69 | self.o._get_lr_called_within_step = True
70 | return self
71 |
72 | def __exit__(self, type, value, traceback):
73 | self.o._get_lr_called_within_step = False
74 |
75 | with _enable_get_lr_call(self):
76 | self.last_epoch += 1 # 更新epoch
77 | lr = self.get_lr() # 计算新的lr,与具体的lr_scheduler类型有关
78 |
79 | # _last_lr记录上一轮次更新的lr值
80 | self._last_lr = self.optimizer.lr
81 | self.optimizer.lr = lr
82 |
83 | def get_lr(self):
84 | raise NotImplementedError
85 |
86 | def get_last_lr(self):
87 | return self._last_lr
88 |
89 |
class ExponentialLR(_LRScheduler):
    """Decay the learning rate by a factor of ``gamma`` every epoch.

    The rate at epoch ``t`` is ``initial_lr * gamma ** t``.

    Parameters
    ----------
    optimizer : Optimizer
        Optimizer whose ``lr`` is scheduled.
    gamma : float, default=0.1
        Multiplicative decay factor per epoch.
    last_epoch : int, default=-1
        Index of the last finished epoch; ``-1`` starts from scratch.
    """

    def __init__(
        self,
        optimizer: Optimizer,
        gamma: float = 0.1,
        last_epoch: int = -1,
    ) -> None:
        self.gamma = gamma
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        # Closed form from the initial rate. ``optimizer.lr`` is overwritten
        # on every step, so scaling it by gamma**last_epoch would compound
        # the decay from all previous epochs.
        return self.optimizer.initial_lr * self.gamma**self.last_epoch
102 |
103 |
class StepLR(_LRScheduler):
    """Decay the learning rate by ``gamma`` once every ``step_size`` epochs.

    The rate at epoch ``t`` is ``initial_lr * gamma ** (t // step_size)``.

    Parameters
    ----------
    optimizer : Optimizer
        Optimizer whose ``lr`` is scheduled.
    step_size : int
        Number of epochs between two decays.
    gamma : float, default=0.1
        Multiplicative decay factor.
    last_epoch : int, default=-1
        Index of the last finished epoch; ``-1`` starts from scratch.
    """

    def __init__(
        self,
        optimizer: Optimizer,
        step_size: int,
        gamma=0.1,
        last_epoch: int = -1,
    ) -> None:
        self.step_size = step_size
        self.gamma = gamma
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        # Closed form from the initial rate. ``optimizer.lr`` already holds
        # the decayed value of the previous epoch, so multiplying it by the
        # full power again would decay far faster than intended.
        completed_periods = self.last_epoch // self.step_size
        return self.optimizer.initial_lr * self.gamma**completed_periods
119 |
120 |
class MultiStepLR(_LRScheduler):
    """Decay the learning rate by ``gamma`` at each listed milestone epoch.

    Parameters
    ----------
    optimizer : Optimizer
        Optimizer whose ``lr`` is scheduled.
    milestones : List[int]
        Epoch indices at which to decay. An epoch listed ``k`` times
        applies ``gamma ** k`` at that epoch.
    gamma : float, default=0.1
        Multiplicative decay factor.
    last_epoch : int, default=-1
        Index of the last finished epoch; ``-1`` starts from scratch.
    """

    def __init__(
        self,
        optimizer: Optimizer,
        milestones: List[int],
        gamma=0.1,
        last_epoch: int = -1,
    ) -> None:
        self.milestones = Counter(milestones)
        self.gamma = gamma
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        current = self.optimizer.lr
        repeats = self.milestones.get(self.last_epoch)
        if repeats is None:
            # Not a milestone: keep the rate unchanged.
            return current
        return current * self.gamma**repeats
137 |
138 |
class CosineAnnealingLR(_LRScheduler):
    """Cosine annealing of the learning rate between ``initial_lr`` and ``eta_min``.

    The rate at epoch ``t`` is::

        eta_min + (initial_lr - eta_min) * (1 + cos(pi * t / T_max)) / 2

    Parameters
    ----------
    optimizer : Optimizer
        Optimizer whose ``lr`` is scheduled.
    T_max : int
        Number of epochs for half a cosine period.
    eta_min : float, default=0
        Minimum learning rate.
    last_epoch : int, default=-1
        Index of the last finished epoch; ``-1`` starts from scratch.
    """

    def __init__(
        self,
        optimizer: Optimizer,
        T_max: int,
        eta_min: float = 0,
        last_epoch: int = -1,
    ) -> None:
        self.T_max = T_max
        self.eta_min = eta_min
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        base_lr = self.optimizer.initial_lr
        if self.last_epoch == 0:
            # Return the starting rate exactly, free of rounding error.
            return base_lr
        # Closed form. A recursive update would need the rate of the
        # previous epoch, but get_last_lr() holds the value from before the
        # previous update, which is one step stale.
        cosine = cos(pi * self.last_epoch / self.T_max)
        return self.eta_min + (base_lr - self.eta_min) * (1 + cosine) / 2
161 |
--------------------------------------------------------------------------------
/pydynet/optim/optimizer.py:
--------------------------------------------------------------------------------
1 | '''Optimizers. Currently implemented:\n
2 | - SGD with momentum and Nesterov;\n
3 | - Adagrad;\n
4 | - Adadelta;\n
5 | - Adam.\n
6 |
7 | Reference
8 | ---------
9 | Paper: https://arxiv.org/abs/1609.04747;\n
10 | Blog: https://xingcy.net/2021/08/20/gd/.
11 | '''
12 |
13 | from math import sqrt
14 | from ..core import Tensor
15 |
16 |
class Optimizer:
    '''Base class for all optimizers.

    Stores the parameters to optimize; subclasses implement ``step``.
    '''

    def __init__(self, params: list[Tensor]) -> None:
        # Materialize the iterable so it can be traversed on every step.
        self.params: list[Tensor] = [*params]

    def step(self):
        '''Apply one update to the parameters (implemented by subclasses).'''
        raise NotImplementedError

    def zero_grad(self):
        '''Reset the gradient of every parameter in ``self.params``.'''
        for tensor in self.params:
            tensor.zero_grad()
30 |
31 |
class SGD(Optimizer):
    '''Gradient descent with momentum.

    Parameters
    ----------
    params : list[Parameter]
        Parameters to optimize;
    lr : float
        Learning rate;
    momentum : float, default=0.5
        Momentum coefficient;
    weight_decay : float, default=0.
        Weight-decay coefficient (added to the gradient as
        ``weight_decay * param``).
    nesterov : bool, default=True.
        Whether to apply the additional look-ahead gradient step.
    '''

    def __init__(
        self,
        params: list[Tensor],
        lr: float,
        momentum: float = .5,
        weight_decay: float = 0.,
        nesterov=True,
    ) -> None:
        super().__init__(params)
        self.lr = lr
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.nesterov = nesterov
        # One velocity buffer per parameter, created with the parameter's
        # own array module (``param.xp``).
        self.v = [p.xp.zeros(p.shape, dtype=p.dtype) for p in self.params]

    def step(self):
        for idx, param in enumerate(self.params):
            with param.device:
                grad = param.grad + self.weight_decay * param.data
                # v <- momentum * v + lr * grad, updated in place.
                self.v[idx] *= self.momentum
                self.v[idx] += self.lr * grad
                param.data -= self.v[idx]
                if self.nesterov:
                    # Extra gradient step on top of the velocity update.
                    param.data -= self.lr * grad
76 |
77 |
class Adagrad(Optimizer):
    '''Adaptive Gradient Descent (Adagrad).

    Parameters
    ----------
    params : list[Parameter]
        Parameters to optimize;
    lr : float, default=1e-2.
        Learning rate;
    weight_decay : float, default=0.
        Weight-decay coefficient.
    eps : float, default=1e-10
        Small constant added under the square root for numerical stability.
    '''

    def __init__(
        self,
        params: list[Tensor],
        lr: float = 1e-2,
        weight_decay: float = 0,
        eps: float = 1e-10,
    ) -> None:
        super().__init__(params)
        self.lr = lr
        self.weight_decay = weight_decay
        self.eps = eps
        # Running sum of squared gradients, one buffer per parameter.
        self.G = [p.xp.zeros(p.shape, dtype=p.dtype) for p in self.params]

    def step(self):
        for idx, param in enumerate(self.params):
            with param.device:
                grad = param.grad + self.weight_decay * param.data
                self.G[idx] += grad**2
                scale = (self.eps + self.G[idx])**0.5
                param.data -= self.lr * grad / scale
115 |
116 |
class Adadelta(Optimizer):
    '''Adadelta optimizer.

    Parameters
    ----------
    params : list[Parameter]
        Parameters to optimize;
    lr : float, default=1.0
        Learning rate;
    rho : float, default=0.9
        Decay rate of the running average of squared gradients;
    weight_decay : float, default=0.
        Weight-decay coefficient.
    eps : float, default=1e-6
        Small constant added under the square root for numerical stability.

    Notes
    -----
    NOTE(review): the update only tracks a running average of squared
    gradients; no running average of squared updates is kept. Confirm
    whether that is the intended variant.
    '''

    def __init__(
        self,
        params: list[Tensor],
        lr: float = 1.0,
        rho: float = 0.9,
        weight_decay: float = 0,
        eps: float = 1e-6,
    ) -> None:
        super().__init__(params)
        self.lr = lr
        self.rho = rho
        self.eps = eps
        self.weight_decay = weight_decay
        # Running average of squared gradients, one buffer per parameter.
        self.G = [
            param.xp.zeros(param.shape, dtype=param.dtype)
            for param in self.params
        ]

    def step(self):
        for i, param in enumerate(self.params):
            with param.device:
                grad = param.grad + self.weight_decay * param.data

                self.G[i] = self.rho * self.G[i] + (1 - self.rho) * grad**2
                param.data -= self.lr * grad / (self.G[i] + self.eps)**0.5
158 |
159 |
class Adam(Optimizer):
    '''Adam optimizer.

    Parameters
    ----------
    params : list[Parameter]
        Parameters to optimize;
    lr : float, default=1e-3
        Learning rate;
    betas : tuple[float], default=(0.9, 0.999)
        Decay rates of the first- and second-moment running averages;
    eps : float, default=1e-8
        Small constant added to the denominator;
    weight_decay : float, default=0.
        Weight-decay coefficient.
    '''

    def __init__(
        self,
        params: list[Tensor],
        lr: float = 1e-3,
        betas: tuple[float] = (0.9, 0.999),
        eps: float = 1e-8,
        weight_decay: float = 0,
    ) -> None:
        super().__init__(params)
        self.lr = lr
        self.beta1, self.beta2 = betas
        self.eps = eps
        self.weight_decay = weight_decay
        # First- and second-moment buffers, one pair per parameter.
        self.m = [p.xp.zeros(p.shape, dtype=p.dtype) for p in self.params]
        self.v = [p.xp.zeros(p.shape, dtype=p.dtype) for p in self.params]
        self.t = 1  # 1-based step counter used for bias correction

    def step(self):
        # The bias-correction factor depends only on the step counter, so
        # it is the same for every parameter in this step.
        a_t = sqrt(1 - self.beta2**self.t) / (1 - self.beta1**self.t)
        step_size = self.lr * a_t
        for idx, param in enumerate(self.params):
            with param.device:
                grad = param.grad + self.weight_decay * param.data
                self.m[idx] *= self.beta1
                self.m[idx] += (1 - self.beta1) * grad
                self.v[idx] *= self.beta2
                self.v[idx] += (1 - self.beta2) * grad**2
                param.data -= step_size * self.m[idx] / (
                    self.v[idx]**0.5 + self.eps)
        self.t += 1
197 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PyDyNet:NumPy-based Dynamic Deep Learning Framework
2 |
3 | Chinese README: [cnREADME.md](./cnREADME.md)
4 |
5 | [](https://pepy.tech/project/pydynet)
6 | [](https://pepy.tech/project/pydynet)
7 | 
8 | 
9 | 
10 | 
11 |
12 | ## Towards Large Language Model
13 |
14 | **In the summer of 2025, I restarted the development of PyDyNet after a two-year break.** PyDyNet now includes a pure-inference implementation of Llama3 (6-layer Transformer, vocab-size=32000). The implementation is inspired by the NumPy version and dataset available [here](https://github.com/likejazz/llama3.np). To run it, download the dataset into the `llm/llama` folder and execute:
15 |
16 | ```bash
17 | >>> python -m llm.llama.infer
18 |
19 | There was a boy named Timmy. He loved to play with hi toy and run around outside. One day, Timmy' mom asked him to help her with the laundry. Timmy didn't want to help because he wanted to play. But hi mom said, "Timmy, you need to help me. It' important to help out."
20 | Timmy didn't want to help, but he knew he had to. So, he put on hi shoe and went outside to help hi mom. A they were folding the clothe, Timmy saw a big pile of laundry on the floor. He wanted to help, so he started to pick it up. But then, he accidentally knocked over a pile of clothe and they fell on him. Timmy wa okay, but he felt bad.
21 | Hi mom saw what happened and said, "Timmy, you need to be more careful. You could have hurt yourself." Timmy felt bad and said sorry. Hi mom hugged him and said, "It' okay, accident happen. Let' clean up the laundry together." Timmy learned that it' important to be careful and help out when you need it.
22 |
23 | Token count: 262, elapsed: 0.87s, 300 tokens/s
24 | ```
25 |
26 | We also implemented a pure-inference version of CLIP, inspired by the NumPy implementation and data available at [NPCLIP](https://github.com/99991/NPCLIP). To run it, copy the `data` folder of `NPCLIP` into the `llm/clip` folder and execute:
27 |
28 | ```bash
29 | >>> python -m llm.clip.infer
30 | Label probs: [0.000953 0.48176003 0.51728696]
31 | ```
32 |
33 | for the following image and the queries ["a fish", "a dog", "a cat"]:
34 |
35 |
36 |
37 | ## Overview
38 |
39 | PyDyNet is a neural network framework implemented entirely in NumPy (with CuPy support since version 0.0.7, using the same API). Its syntax is inspired by PyTorch, and its structure is as follows:
40 |
41 | ```mermaid
42 | graph LR
43 | N(numpy/cupy.ndarray)--Backend--> A(Tensor) --> ds(Dataset) ---> Data(DataLoader)---> Mission
44 | A --Eager execution--> B(Basic operators:<br> add, exp, etc)
45 | B -.Autograd-.-> A
46 |
47 | B --> CO(Complex<br>operators)
48 | --> f(Function:<br> img2col, etc)
49 | --> M(Basic Module:<br> Linear, etc)
50 | --> CM(Advanced Module: CNN, RNN, Transformer, etc)
51 | --> Mission(Learning task)
52 | A --> GD(Optimizer:<br> SGD, Adam, etc) ---> LS(lr_scheduler:<br> StepLR, etc)---> Mission
53 | ```
54 |
55 | Dashed lines indicate that users can disable automatic differentiation using `no_grad`.
56 |
57 | ## Install
58 |
59 | Just
60 |
61 | ```bash
62 | pip install pydynet
63 | ```
64 |
65 | or
66 |
67 | ```bash
68 | git clone https://github.com/Kaslanarian/PyDyNet
69 | cd PyDyNet
70 | python setup.py install
71 | ```
72 |
73 | ## Example
74 |
75 | Examples can be found in the [examples/pydynet](./examples/pydynet) directory, with equivalent PyTorch implementations in [examples/pytorch](./examples/pytorch). To run an example, use:
76 |
77 | ```bash
78 | python -m examples.pydynet.xxx
79 | ```
80 |
81 | ### Automatic Differentiation
82 |
83 | The example [autograd1d.py](examples/pydynet/autograd1d.py) demonstrates automatic differentiation by performing gradient descent on a one-dimensional convex function:
84 |
85 |
86 |
87 | A multi-variable convex function example is provided in [autograd2d.py](examples/pydynet/autograd2d.py):
88 |
89 |
90 |
91 | ### MLP & LeNet
92 |
93 | The example [mnist.py](examples/pydynet/mnist.py) uses an MLP and LeNet to classify MNIST digits. The training and testing accuracies are shown below:
94 |
95 |
96 |
97 | ### Dropout & Batch Normalization
98 |
99 | The example [dropout_bn.py](examples/pydynet/dropout_bn.py) compares the performance of three networks on the `fetch_olivetti_faces` dataset (64×64 pixel images):
100 |
101 | 1. Three-layer MLP;
102 | 2. Three-layer MLP with Dropout;
103 | 3. Three-layer MLP with Batch Normalization.
104 |
105 |
106 |
107 | ### Recurrent Neural Network (RNN)
108 |
109 | The example [ts_prediction.py](examples/pydynet/ts_prediction.py) demonstrates time series prediction using a GRU:
110 |
111 |
112 |
113 | ### Transformer
114 |
115 | The example [transformer.py](examples/pydynet/transformer.py) shows how to train a text classification model using a Transformer. The training results are as follows:
116 |
117 |
118 |
119 | > Dataset (CoLA) link:
120 |
121 | ## CUDA Acceleration
122 |
123 | PyDyNet supports CUDA acceleration through CuPy. To use it, simply install CuPy and use the same API as NumPy. We compare the performance of PyDyNet with CuPy and NumPy as follows on **Nvidia GeForce RTX 4090**:
124 |
125 | | Network structure | Dataset | CPU time (s) per epoch | GPU time (s) per epoch |
126 | | :-----------------: | :---------------: | :--------------------: | :--------------------: |
127 | | 3-layer MLP | MNIST (80000×574) | 7.256±0.138 | 1.203±.0181 |
128 | | LeNet | MNIST (80000×574) | 239.664±2.108 | 2.841±0.026 |
129 | | 1-layer Transformer (dim=512, head=4) | CoLA (8551×45×64) | 17.503±0.251 | 1.075±0.002 |
130 |
--------------------------------------------------------------------------------
/llm/clip/tokenizer.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import html
3 | import os
4 | import re
5 | import typing
6 | from functools import lru_cache
7 |
8 |
@lru_cache()
def default_bpe() -> str:
    """Return the path of the BPE merge table in ``data/`` next to this module."""
    module_dir = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(module_dir, "data/bpe_simple_vocab_16e6.txt.gz")
13 |
14 |
@lru_cache()
def bytes_to_unicode() -> typing.Dict[int, str]:
    """Map each of the 256 byte values to a unique unicode character.

    Bytes in the ranges ``!``..``~``, ``¡``..``¬`` and ``®``..``ÿ`` map to
    the character with the same code point. Every other byte is mapped, in
    increasing byte order, to the code points 256, 257, ... so that no byte
    maps to the whitespace/control characters the BPE code cannot handle.
    The dictionary preserves that insertion order (direct ranges first).
    """
    direct = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    table = {code: chr(code) for code in direct}
    shifted = 0
    for code in range(2**8):
        if code in table:
            continue
        # Remaining bytes get fresh code points above the byte range.
        table[code] = chr(2**8 + shifted)
        shifted += 1
    return table
41 |
42 |
def get_pairs(
        word: typing.Tuple[str, ...]) -> typing.Set[typing.Tuple[str, str]]:
    """Return the set of adjacent symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length
    strings). A word with fewer than two symbols, including an empty one,
    yields an empty set.
    """
    # zip stops at the shorter operand, so empty and single-symbol words
    # need no special case.
    return set(zip(word, word[1:]))
54 |
55 |
def basic_clean(text: str) -> str:
    """Pass text through ``ftfy.fix_text``, unescape HTML entities twice, and strip it."""
    # Imported lazily so ftfy is only required when basic cleaning is used.
    import ftfy

    repaired = ftfy.fix_text(text)
    # Unescape twice to undo doubly-escaped entities such as "&amp;lt;".
    unescaped = html.unescape(html.unescape(repaired))
    return unescaped.strip()
62 |
63 |
def whitespace_clean(text: str) -> str:
    """Collapse every whitespace run to one space and trim both ends."""
    return re.sub(r"\s+", " ", text).strip()
68 |
69 |
def read_text(path: str) -> str:
    """Return the full contents of the UTF-8 text file at ``path``."""
    with open(path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents
73 |
74 |
class SimpleTokenizer(object):
    """Byte-level BPE tokenizer used by the CLIP text encoder.

    Text is split into tokens by a regular expression, each token is mapped
    byte by byte to unicode characters (see ``bytes_to_unicode``), and the
    characters are merged according to the ranked merge table in
    ``bpe_path``. The last symbol of every token carries the end-of-word
    marker ``</w>``.

    Parameters
    ----------
    bpe_path : str
        Path to the gzip-compressed merge table. Defaults to
        ``default_bpe()``.
    """

    def __init__(self, bpe_path: str = default_bpe()) -> None:
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        with gzip.open(bpe_path) as f:
            lines = f.read().decode("utf-8").split("\n")
        # Skip the header line and keep as many merges as the vocabulary
        # has room for: 49152 entries minus 256 byte symbols, minus their
        # 256 end-of-word variants, minus the 2 special tokens.
        lines = lines[1:49152 - 256 - 2 + 1]
        merges = [tuple(line.split()) for line in lines]
        vocab = list(bytes_to_unicode().values())
        # Every byte symbol also exists in an end-of-word variant.
        vocab = vocab + [v + "</w>" for v in vocab]
        for merge in merges:
            vocab.append("".join(merge))
        vocab.extend(["<|startoftext|>", "<|endoftext|>"])
        self.encoder: typing.Dict[str,
                                  int] = dict(zip(vocab, range(len(vocab))))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {
            "<|startoftext|>": "<|startoftext|>",
            "<|endoftext|>": "<|endoftext|>"
        }
        pattern = r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+"""
        # The stdlib `re` module has no \p{..} classes, so they are replaced
        # by the contents of pN.txt / pL.txt. The files are located relative
        # to this module (like the merge table), not the working directory.
        data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                "data")
        pattern = pattern.replace(r"\p{N}",
                                  read_text(os.path.join(data_dir, "pN.txt")))
        pattern = pattern.replace(r"\p{L}",
                                  read_text(os.path.join(data_dir, "pL.txt")))
        self.pat = re.compile(pattern, re.IGNORECASE)

    def bpe(self, token: str) -> str:
        """Return the space-separated BPE symbols of one byte-encoded token."""
        if token in self.cache:
            return self.cache[token]
        # Split into characters; the last one carries the end-of-word marker.
        word = tuple(token[:-1]) + (token[-1] + "</w>", )
        pairs = get_pairs(word)

        if not pairs:
            return token + "</w>"

        while True:
            # Merge the adjacent pair with the lowest (best) rank first.
            bigram = min(
                pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word: typing.List[str] = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    # No further occurrence of `first`: copy the rest.
                    new_word.extend(word[i:])
                    break
                new_word.extend(word[i:j])
                i = j

                if word[i] == first and i < len(word) - 1 and word[
                        i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            word = tuple(new_word)
            if len(word) == 1:
                break
            pairs = get_pairs(word)
        joined_word = " ".join(word)
        self.cache[token] = joined_word
        return joined_word

    def encode(self,
               text: str,
               basic_cleaning: bool = False) -> typing.List[int]:
        """Convert text to a list of BPE token ids.

        The text is lower-cased and whitespace-normalized first; with
        ``basic_cleaning=True`` it is also passed through ``basic_clean``.
        """
        bpe_tokens: typing.List[int] = []
        if basic_cleaning:
            text = basic_clean(text)
        text = whitespace_clean(text).lower()
        for token in re.findall(self.pat, text):
            token = "".join(self.byte_encoder[b]
                            for b in token.encode("utf-8"))
            bpe_tokens.extend(self.encoder[bpe_token]
                              for bpe_token in self.bpe(token).split(" "))
        return bpe_tokens

    def decode(self, tokens: typing.Iterable[int]) -> str:
        """Convert token ids back to text; end-of-word markers become spaces."""
        text = "".join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text
                          ]).decode("utf-8",
                                    errors="replace").replace("</w>", " ")
        return text
164 |
--------------------------------------------------------------------------------
/examples/pytorch/mnist.py:
--------------------------------------------------------------------------------
1 | import gzip, argparse
2 | from os.path import join
3 | from tqdm import tqdm
4 |
5 | import numpy as np
6 | import torch
7 | from torch import nn
8 | import torch.nn.functional as F
9 | from torch.optim import Adam
10 | from pydynet.data import data_loader
11 |
12 |
class MNISTDataset:
    """Load the gzip-compressed MNIST IDX files found under ``root``."""

    def __init__(self, root) -> None:
        self.root = root
        self.train_images_path = join(root, 'train-images-idx3-ubyte.gz')
        self.train_labels_path = join(root, 'train-labels-idx1-ubyte.gz')
        self.test_images_path = join(root, 't10k-images-idx3-ubyte.gz')
        self.test_labels_path = join(root, 't10k-labels-idx1-ubyte.gz')

    def load_train(self):
        """Return ``(images, labels)`` of the training split."""
        images = MNISTDataset.load_mnist_images(self.train_images_path)
        labels = MNISTDataset.load_mnist_labels(self.train_labels_path)
        return images, labels

    def load_test(self):
        """Return ``(images, labels)`` of the test split."""
        images = MNISTDataset.load_mnist_images(self.test_images_path)
        labels = MNISTDataset.load_mnist_labels(self.test_labels_path)
        return images, labels

    @staticmethod
    def load_mnist_images(file_path):
        """Read an IDX image file into a float32 tensor of shape (N, 1, 28, 28)."""
        with gzip.open(file_path, 'r') as stream:
            # Header: 4-byte magic number followed by three 4-byte dimensions.
            stream.read(16)
            payload = stream.read()
        pixels = np.frombuffer(payload, dtype=np.uint8).astype(np.float32)
        # Scale the pixel values to [0, 1] and add a channel axis.
        images = (pixels / 255.0).reshape(-1, 1, 28, 28)
        return torch.tensor(images)

    @staticmethod
    def load_mnist_labels(file_path):
        """Read an IDX label file into a 1-D integer tensor."""
        with gzip.open(file_path, 'r') as stream:
            # Header: 4-byte magic number followed by the 4-byte item count.
            stream.read(8)
            payload = stream.read()
        return torch.tensor(np.frombuffer(payload, dtype=np.uint8), dtype=int)
57 |
58 |
class Flatten(nn.Module):
    """Flatten every dimension after the first (batch) one."""

    def forward(self, x):  # for batch only
        batch_size = x.shape[0]
        return x.reshape(batch_size, -1)
63 |
64 |
class MLP(nn.Module):
    """Three-layer perceptron (784 -> 1024 -> 1024 -> 10) with ReLU activations."""

    def __init__(self) -> None:
        super().__init__()
        # layer1 flattens the image before its linear projection.
        self.layer1 = nn.Sequential(Flatten(), nn.Linear(28 * 28, 1024))
        self.layer2 = nn.Linear(1024, 1024)
        self.layer3 = nn.Linear(1024, 10)

    def forward(self, x):
        hidden = F.relu(self.layer1(x))
        hidden = F.relu(self.layer2(hidden))
        # No activation on the output layer: it returns raw logits.
        return self.layer3(hidden)
80 |
81 |
class ConvNet(nn.Module):
    """LeNet-style CNN: two conv + max-pool stages, then two linear layers."""

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 20, 3, 1, 1)
        self.conv2 = nn.Conv2d(20, 50, 3, 1, 1)
        self.fc1 = nn.Linear(7 * 7 * 50, 500)
        self.fc2 = nn.Linear(500, 10)

    def forward(self, x):
        # Each stage: 3x3 convolution -> ReLU -> 2x2 max pooling.
        for conv in (self.conv1, self.conv2):
            x = F.max_pool2d(F.relu(conv(x)), 2, 2)
        features = x.reshape(-1, 7 * 7 * 50)
        return self.fc2(F.relu(self.fc1(features)))
99 |
100 |
# ---- Command-line options -------------------------------------------------
parser = argparse.ArgumentParser()
parser.add_argument("--network",
                    help="Network structure",
                    choices=['mlp', 'conv'],
                    default='conv')
parser.add_argument('--batch-size',
                    type=int,
                    default=256,
                    help='input batch size for training (default: 256)')
parser.add_argument('--test-batch-size',
                    type=int,
                    default=1024,
                    metavar='N',
                    help='input batch size for testing (default: 1024)')
parser.add_argument('--epochs',
                    type=int,
                    default=20,
                    help='number of epochs to train (default: 20)')
parser.add_argument('--lr',
                    type=float,
                    default=1e-4,
                    help='learning rate (default: 1e-4)')
parser.add_argument('--no-cuda',
                    action='store_true',
                    default=False,
                    help='disables CUDA training')
parser.add_argument('--seed',
                    type=int,
                    default=42,
                    help='random seed (default: 42)')
args = parser.parse_args()

# Seed from the command line so that --seed actually takes effect.
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)

# Use the last visible GPU unless CUDA is unavailable or disabled.
device = f'cuda:{torch.cuda.device_count() - 1}' if torch.cuda.is_available(
) and not args.no_cuda else 'cpu'

# Instantiate only the requested network (argparse restricts the choices).
net = {'mlp': MLP, 'conv': ConvNet}[args.network]().to(device)
print(net)

optimizer = Adam(net.parameters(), lr=args.lr)

dataset = MNISTDataset(r'./examples/data/MNIST/raw')
train_loader = data_loader(
    *dataset.load_train(),
    shuffle=True,
    batch_size=args.batch_size,
)
test_loader = data_loader(
    *dataset.load_test(),
    shuffle=False,
    batch_size=args.test_batch_size,
)

# ---- Training loop ----------------------------------------------------------
bar = tqdm(range(args.epochs))
for epoch in bar:

    net.train()

    for batch_X, batch_y in train_loader:
        input_, label = batch_X.to(device), batch_y.to(device)
        loss = F.cross_entropy(net(input_), label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    net.eval()

    # Accuracy on both splits, evaluated without building autograd graphs.
    train_right, train_size = 0, 0
    test_right, test_size = 0, 0
    with torch.no_grad():
        for batch_X, batch_y in train_loader:
            input_, label = batch_X.to(device), batch_y.to(device)
            pred: torch.Tensor = net(input_).argmax(-1)
            train_right += pred.eq(label).sum().item()
            train_size += batch_X.shape[0]

        for batch_X, batch_y in test_loader:
            input_, label = batch_X.to(device), batch_y.to(device)
            pred = net(input_).argmax(-1)
            test_right += pred.eq(label).sum().item()
            test_size += batch_X.shape[0]

    train_acc, test_acc = train_right / train_size, test_right / test_size
    # LOSS is the loss of the last training batch of this epoch.
    bar.set_postfix(TEST_ACC="{:.4f}".format(test_acc),
                    TRAIN_ACC="{:.4f}".format(train_acc),
                    LOSS="{:.6f}".format(loss.item()))
190 |
--------------------------------------------------------------------------------
/examples/pydynet/mnist.py:
--------------------------------------------------------------------------------
1 | import gzip, argparse
2 | from os.path import join
3 | from tqdm import tqdm
4 |
5 | import numpy as np
6 | import pydynet as pdn
7 | from pydynet import nn
8 | import pydynet.nn.functional as F
9 | from pydynet.optim import Adam
10 | from pydynet.data import data_loader
11 |
12 |
class MNISTDataset:
    """Load the gzip-compressed MNIST IDX files found under ``root``."""

    def __init__(self, root) -> None:
        self.root = root
        self.train_images_path = join(root, 'train-images-idx3-ubyte.gz')
        self.train_labels_path = join(root, 'train-labels-idx1-ubyte.gz')
        self.test_images_path = join(root, 't10k-images-idx3-ubyte.gz')
        self.test_labels_path = join(root, 't10k-labels-idx1-ubyte.gz')

    def load_train(self):
        """Return ``(images, labels)`` of the training split."""
        images = MNISTDataset.load_mnist_images(self.train_images_path)
        labels = MNISTDataset.load_mnist_labels(self.train_labels_path)
        return images, labels

    def load_test(self):
        """Return ``(images, labels)`` of the test split."""
        images = MNISTDataset.load_mnist_images(self.test_images_path)
        labels = MNISTDataset.load_mnist_labels(self.test_labels_path)
        return images, labels

    @staticmethod
    def load_mnist_images(file_path):
        """Read an IDX image file into a tensor of shape (N, 1, 28, 28).

        Pixel values are scaled to [0, 1] and the result is cast to the
        module-level ``DTYPE``.
        """
        with gzip.open(file_path, 'r') as stream:
            # Header: 4-byte magic number followed by three 4-byte dimensions.
            stream.read(16)
            payload = stream.read()
        pixels = np.frombuffer(payload, dtype=np.uint8)
        images = (pixels / 255.0).reshape(-1, 1, 28, 28)
        return pdn.Tensor(images).astype(DTYPE)

    @staticmethod
    def load_mnist_labels(file_path):
        """Read an IDX label file into a 1-D integer tensor."""
        with gzip.open(file_path, 'r') as stream:
            # Header: 4-byte magic number followed by the 4-byte item count.
            stream.read(8)
            payload = stream.read()
        return pdn.Tensor(np.frombuffer(payload, dtype=np.uint8), dtype=int)
57 |
58 |
class Flatten(nn.Module):
    """Flatten every dimension after the first (batch) one."""

    def forward(self, x):  # for batch only
        batch_size = x.shape[0]
        return x.reshape(batch_size, -1)
63 |
64 |
class MLP(nn.Module):
    """Three-layer perceptron (784 -> 1024 -> 1024 -> 10) with ReLU activations.

    All layers are created with the module-level ``DTYPE``.
    """

    def __init__(self) -> None:
        super().__init__()
        # layer1 flattens the image before its linear projection.
        self.layer1 = nn.Sequential(
            Flatten(),
            nn.Linear(28 * 28, 1024, dtype=DTYPE),
        )
        self.layer2 = nn.Linear(1024, 1024, dtype=DTYPE)
        self.layer3 = nn.Linear(1024, 10, dtype=DTYPE)

    def forward(self, x):
        hidden = F.relu(self.layer1(x))
        hidden = F.relu(self.layer2(hidden))
        # No activation on the output layer: it returns raw logits.
        return self.layer3(hidden)
80 |
81 |
class ConvNet(nn.Module):
    """Two conv + max-pool stages followed by two linear layers (10 outputs)."""

    def __init__(self):
        super().__init__()
        # NOTE(review): positional Conv2d arguments are presumably
        # (in_channels, out_channels, kernel_size, stride, padding) — confirm.
        self.conv1 = nn.Conv2d(1, 20, 3, 1, 1, dtype=DTYPE)
        self.conv2 = nn.Conv2d(20, 50, 3, 1, 1, dtype=DTYPE)
        self.fc1 = nn.Linear(7 * 7 * 50, 500, dtype=DTYPE)
        self.fc2 = nn.Linear(500, 10, dtype=DTYPE)

    def forward(self, x):
        """Apply conv -> ReLU -> 2x2 pool twice, flatten to 7*7*50 features, then classify."""
        for conv in (self.conv1, self.conv2):
            x = F.max_pool2d(F.relu(conv(x)), 2, 2)
        features = x.reshape(-1, 7 * 7 * 50)
        hidden = F.relu(self.fc1(features))
        return self.fc2(hidden)
99 |
100 |
# ---------------------------------------------------------------------------
# Command-line options
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser()
parser.add_argument("--network",
                    help="Network structure",
                    choices=['mlp', 'conv'],
                    default='conv')
parser.add_argument('--batch-size',
                    type=int,
                    default=256,
                    help='input batch size for training (default: 256)')
parser.add_argument('--test-batch-size',
                    type=int,
                    default=1024,
                    metavar='N',
                    help='input batch size for testing (default: 1024)')
parser.add_argument('--epochs',
                    type=int,
                    default=20,
                    help='number of epochs to train (default: 20)')
parser.add_argument('--lr',
                    type=float,
                    default=1e-4,
                    help='learning rate (default: 1e-4)')
parser.add_argument('--no-cuda',
                    action='store_true',
                    default=False,
                    help='disables CUDA training')
# The help text previously advertised a default of 1 while the real default is 42.
parser.add_argument('--seed',
                    type=int,
                    default=42,
                    help='random seed (default: 42)')
args = parser.parse_args()

# DTYPE is read by the model classes when they are instantiated below.
DTYPE = np.float32
np.random.seed(args.seed)
device = f'cuda:{pdn.cuda.device_count() - 1}' if pdn.cuda.is_available(
) and not args.no_cuda else 'cpu'

# Instantiate only the requested network. Building both and discarding one
# wasted memory and consumed random numbers before the chosen model was built.
net = {'mlp': MLP, 'conv': ConvNet}[args.network]().to(device)
print(net)

optimizer = Adam(net.parameters(), lr=args.lr)

dataset = MNISTDataset(r'./examples/data/MNIST/raw')
train_loader = data_loader(
    *dataset.load_train(),
    shuffle=True,
    batch_size=args.batch_size,
)
test_loader = data_loader(
    *dataset.load_test(),
    shuffle=False,
    batch_size=args.test_batch_size,
)

bar = tqdm(range(args.epochs))
for epoch in bar:

    # ---- optimisation pass over the training set ----
    net.train()

    for batch_X, batch_y in train_loader:
        input_, label = batch_X.to(device), batch_y.to(device)
        loss = F.cross_entropy_loss(net(input_), label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # ---- accuracy on both splits, without recording gradients ----
    net.eval()

    train_right, train_size = 0, 0
    test_right, test_size = 0, 0
    with pdn.no_grad():
        for batch_X, batch_y in train_loader:
            input_, label = batch_X.to(device), batch_y.to(device)
            pred: pdn.Tensor = net(input_).argmax(-1)
            train_right += pred.eq(label).sum().item()
            train_size += batch_X.shape[0]

        for batch_X, batch_y in test_loader:
            input_, label = batch_X.to(device), batch_y.to(device)
            pred = net(input_).argmax(-1)
            test_right += pred.eq(label).sum().item()
            test_size += batch_X.shape[0]

    train_acc, test_acc = train_right / train_size, test_right / test_size
    # LOSS is the loss of the last training batch of this epoch.
    bar.set_postfix(TEST_ACC="{:.4f}".format(test_acc),
                    TRAIN_ACC="{:.4f}".format(train_acc),
                    LOSS="{:.6f}".format(loss.item()))
189 |
--------------------------------------------------------------------------------
/llm/clip/model.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import numpy as np
4 | import pydynet as pdn
5 | from pydynet import nn
6 | import pydynet.nn.functional as F
7 |
8 |
def build_attention_mask(context_length: int):
    """Build a square additive attention mask.

    Entries strictly above the main diagonal are ``-inf``; the diagonal and
    everything below it are ``0``.

    Args:
        context_length: Side length of the mask.

    Returns:
        A float32 ``pdn.Tensor`` of shape ``(context_length, context_length)``.
    """
    side = (context_length, context_length)
    blocked = np.full(side, fill_value=-np.inf, dtype=np.float32)
    # triu(k=1) zeroes the diagonal and the lower triangle.
    upper_only = np.triu(blocked, 1)
    return pdn.Tensor(upper_only, dtype=np.float32)
15 |
16 |
def patch_project(x: pdn.Tensor, kernel: pdn.Tensor):
    """Cut images into non-overlapping patches and project each with one matmul.

    The patch size is taken from the kernel's last two axes.

    Args:
        x: 4-axis input unpacked as ``(n, c, h, w)``.
        kernel: 4-axis weight unpacked as ``(d, c, ph, pw)``; ``h`` and ``w``
            must be multiples of ``ph`` and ``pw``.

    Returns:
        Tensor reshaped to ``(n, (h // ph) * (w // pw), d)``.
    """
    batch, channels, height, width = x.shape
    out_dim, k_channels, patch_h, patch_w = kernel.shape
    patch_len = k_channels * patch_h * patch_w
    grid_h = height // patch_h
    grid_w = width // patch_w

    assert channels == k_channels and height % patch_h == 0 and width % patch_w == 0

    # Flatten the kernel to a (patch_len, out_dim) projection matrix.
    weight = kernel.transpose(1, 2, 3, 0).reshape(patch_len, out_dim)

    # Split both spatial axes into (grid, within-patch) and gather each patch
    # into one row of length patch_len.
    patches = x.reshape(batch, channels, grid_h, patch_h, grid_w, patch_w)
    patches = patches.transpose(0, 2, 4, 1, 3, 5)
    patches = patches.reshape(batch, grid_h, grid_w, patch_len)

    projected = patches @ weight
    return projected.reshape(batch, grid_h * grid_w, out_dim)
33 |
34 |
class MultiHeadAttention(nn.Module):
    """Multi-head self-attention with a fused query/key/value projection."""

    def __init__(self, n_dim: int, n_heads: int):
        super().__init__()
        self.n_dim = n_dim
        self.n_heads = n_heads
        self.head_dim = n_dim // n_heads

        # One linear layer emits Q, K and V side by side; O is the output projection.
        self.QKV = nn.Linear(self.n_dim, self.n_dim * 3, dtype=np.float32)
        self.O = nn.Linear(self.n_dim, self.n_dim, dtype=np.float32)

    def forward(self, x, mask):
        """Attend over ``x`` (unpacked as ``(B, L, _)``).

        Args:
            x: Input with three axes.
            mask: Additive mask added to the attention scores, or ``None``.

        Returns:
            Output of the ``O`` projection applied to the merged heads.
        """
        B, L, _ = x.shape
        per_head = (B, L, self.n_heads, self.head_dim)
        query, key, value = (
            part.reshape(*per_head) for part in pdn.split(self.QKV(x), 3, -1))

        # Move the head axis forward; the key is also transposed for the matmul.
        query = query.transpose(0, 2, 1, 3)
        key_t = key.transpose(0, 2, 3, 1)
        scores = query @ key_t / math.sqrt(self.head_dim)

        if mask is not None:
            scores = scores + mask

        weights = F.softmax(scores, axis=-1)
        mixed = weights @ value.transpose(0, 2, 1, 3)

        # Back to (B, L, heads, head_dim), then merge the heads.
        merged = mixed.transpose(0, 2, 1, 3).reshape(B, L, -1)
        return self.O(merged)
64 |
65 |
class CLIPLayerNorm(nn.LayerNorm):
    """Layer norm over the last axis with an explicitly written forward pass."""

    def __init__(self,
                 normalized_shape,
                 eps=0.000001,
                 momentum=0.1,
                 device=None,
                 dtype=None):
        # Forward all arguments positionally to the base class.
        super().__init__(normalized_shape, eps, momentum, device, dtype)

    def forward(self, x):
        """Normalise ``x`` along its last axis, then apply ``scale`` and ``shift``.

        NOTE(review): ``eps``, ``scale`` and ``shift`` are presumably set by
        ``nn.LayerNorm.__init__`` — confirm.
        """
        mean = x.mean(axis=-1, keepdims=True)
        # Mean of squared deviations along the last axis.
        var = pdn.square(x - mean).mean(axis=-1, keepdims=True)
        normalised = (x - mean) / pdn.sqrt(var + self.eps)
        return normalised * self.scale + self.shift
81 |
82 |
class MLP(nn.Module):
    """Two-layer feed-forward block using ``x * sigmoid(1.702 * x)`` as activation."""

    def __init__(self, d_in: int, d_proj: int):
        super().__init__()
        self.d_in = d_in
        self.d_proj = d_proj
        self.fc1 = nn.Linear(d_in, d_proj, dtype=np.float32)
        self.fc2 = nn.Linear(d_proj, d_in, dtype=np.float32)

    def forward(self, x):
        """Project to ``d_proj``, apply the sigmoid-gated activation, project back."""
        hidden = self.fc1(x)
        gate = pdn.sigmoid(1.702 * hidden)
        return self.fc2(hidden * gate)
96 |
97 |
class Transformer(nn.Module):
    """Pre-norm transformer layer: attention and MLP, each with a residual connection."""

    def __init__(self, n_dim: int, n_head: int, mlp_dim: int):
        super().__init__()
        self.mha = MultiHeadAttention(n_dim, n_head)
        self.mlp = MLP(n_dim, mlp_dim)
        self.layer_norm1 = CLIPLayerNorm((n_dim, ), eps=1e-5, dtype=np.float32)
        self.layer_norm2 = CLIPLayerNorm((n_dim, ), eps=1e-5, dtype=np.float32)

    def forward(self, x, mask):
        """Return ``x`` after the attention and MLP residual updates.

        Args:
            x: Layer input.
            mask: Passed through to the attention module (may be ``None``).
        """
        attended = self.mha(self.layer_norm1(x), mask)
        x = x + attended
        transformed = self.mlp(self.layer_norm2(x))
        return x + transformed
111 |
112 |
class ImageEncoder(nn.Module):
    """Patch-based image encoder built from a stack of ``Transformer`` layers."""

    def __init__(self, n_dim, n_head, mlp_dim, kernel_size, n_layer,
                 final_dim):
        super().__init__()
        # Patch-projection weight with 3 input channels and square patches.
        patch_weight = pdn.randn(n_dim,
                                 3,
                                 kernel_size,
                                 kernel_size,
                                 dtype=np.float32)
        self.kernel = nn.Parameter(patch_weight)

        self.pre_norm = CLIPLayerNorm((n_dim, ), 1e-5, dtype=np.float32)
        layers = [Transformer(n_dim, n_head, mlp_dim) for _ in range(n_layer)]
        self.transformers: list[Transformer] = nn.ModuleList(layers)

        self.post_norm = CLIPLayerNorm((n_dim, ), 1e-5, dtype=np.float32)
        self.proj = nn.Linear(n_dim, final_dim, bias=False, dtype=np.float32)

    def forward(self, x, class_emb, position_emb):
        """Encode images into ``final_dim`` features.

        Args:
            x: Image batch handed to ``patch_project``.
            class_emb: Embedding concatenated in front of the patch tokens
                along axis ``-2``.
            position_emb: Added to the concatenated token sequence.

        Returns:
            Projection of the normalised first token of each sequence.
        """
        tokens = patch_project(x, self.kernel)
        tokens = pdn.concat([class_emb, tokens], axis=-2) + position_emb

        tokens = self.pre_norm(tokens)
        for layer in self.transformers:
            # No attention mask is used on the image side.
            tokens = layer(tokens, None)

        first_token = self.post_norm(tokens[:, 0])
        return self.proj(first_token)
138 |
139 |
class TextEncoder(nn.Module):
    """Token-sequence encoder built from masked ``Transformer`` layers."""

    def __init__(self, n_dim, n_head, mlp_dim, n_layer, final_dim, vocab_size):
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, n_dim, dtype=np.float32)
        layers = [Transformer(n_dim, n_head, mlp_dim) for _ in range(n_layer)]
        self.transformers: list[Transformer] = nn.ModuleList(layers)

        self.post_norm = CLIPLayerNorm((n_dim, ), 1e-5, dtype=np.float32)
        self.proj = nn.Linear(n_dim, final_dim, bias=False, dtype=np.float32)

    def forward(self, idx, position_emb):
        """Encode token ids into ``final_dim`` features.

        Args:
            idx: Token ids; embedded and also used to pick the output position.
            position_emb: Added to the token embeddings.

        Returns:
            Projection of one position per sequence: the position where
            ``idx`` is largest along its last axis.
        """
        hidden = self.token_embed(idx) + position_emb
        mask = build_attention_mask(hidden.shape[1])

        for layer in self.transformers:
            hidden = layer(hidden, mask)

        hidden = self.post_norm(hidden)

        # NOTE(review): the arg-max id is presumably the end-of-text token — confirm.
        rows = np.arange(hidden.shape[0])
        cols = hidden.xp.argmax(idx, axis=-1)
        return self.proj(hidden[rows, cols])
161 |
162 |
class CLIP(nn.Module):
    """Image and text encoders whose normalised features are compared by dot product."""

    def __init__(self):
        super().__init__()
        self.class_embed = nn.Parameter(pdn.randn(1, 1, 768, dtype=np.float32))
        self.v_pos_emb = nn.Parameter(pdn.randn(50, 768, dtype=np.float32))
        self.t_pos_emb = nn.Parameter(pdn.randn(77, 512, dtype=np.float32))
        self.image_encoder = ImageEncoder(768, 12, 3072, 32, 12, 512)
        self.text_encoder = TextEncoder(512, 8, 2048, 12, 512, 49408)
        self.scale = 1

    def forward(self, img, idx):
        """Return image-to-text similarity logits.

        Args:
            img: Image batch for the image encoder.
            idx: Token ids for the text encoder.

        Returns:
            ``scale * img_feature @ txt_feature.T`` with both feature sets
            divided by their L2 norm along axis 1.
        """
        img_feature = self.image_encoder(img, self.class_embed, self.v_pos_emb)
        txt_feature = self.text_encoder(idx, self.t_pos_emb)

        # L2 norm of every feature row, kept as a column for broadcasting.
        img_norm = pdn.sqrt(pdn.square(img_feature).sum(1, keepdims=True))
        txt_norm = pdn.sqrt(pdn.square(txt_feature).sum(1, keepdims=True))

        img_feature = img_feature / img_norm
        txt_feature = txt_feature / txt_norm
        return self.scale * img_feature @ txt_feature.T
185 |
--------------------------------------------------------------------------------
/examples/pytorch/dropout_bn.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | from sklearn.datasets import fetch_olivetti_faces
3 | from sklearn.preprocessing import MinMaxScaler
4 | from sklearn.model_selection import train_test_split
5 | from tqdm import tqdm
6 |
7 | import numpy as np
8 | import torch
9 | import torch.nn.functional as F
10 | import torch.nn as nn
11 | from torch.optim import Adam
12 | from pydynet.data import data_loader
13 |
# Fetch the Olivetti faces as a (features, labels) pair and report the feature shape.
data_X, data_y = fetch_olivetti_faces(return_X_y=True)
print(data_X.shape)

# Stratified 80/20 split with a fixed random state.
splits = train_test_split(data_X,
                          data_y,
                          train_size=0.8,
                          stratify=data_y,
                          random_state=42)
train_X, test_X, train_y, test_y = splits

# Fit the scaler on the training features only, then reuse it for the test features.
scaler = MinMaxScaler()
train_X = scaler.fit_transform(train_X)
test_X = scaler.transform(test_X)
26 |
27 |
class DNN(nn.Module):
    """Plain fully connected classifier: 4096 -> 512 -> 128 -> 40."""

    def __init__(self) -> None:
        super().__init__()
        self.fc1 = nn.Linear(4096, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, 40)

    def forward(self, x):
        """Apply ``fc1`` and ``fc2`` with ReLU, then ``fc3`` without activation."""
        for hidden_layer in (self.fc1, self.fc2):
            x = F.relu(hidden_layer(x))
        return self.fc3(x)
40 |
41 |
class DNN_dropout(DNN):
    """``DNN`` with dropout (p=0.05) applied to both hidden pre-activations."""

    def __init__(self) -> None:
        super().__init__()
        self.dropout = nn.Dropout(p=0.05)

    def forward(self, x):
        """Apply linear -> dropout -> ReLU for ``fc1`` and ``fc2``, then ``fc3``."""
        for hidden_layer in (self.fc1, self.fc2):
            x = F.relu(self.dropout(hidden_layer(x)))
        return self.fc3(x)
52 |
53 |
class DNN_BN(DNN):
    """``DNN`` with batch normalisation after each hidden linear layer."""

    def __init__(self) -> None:
        super().__init__()
        self.bn1 = nn.BatchNorm1d(512)
        self.bn2 = nn.BatchNorm1d(128)

    def forward(self, x):
        """Apply linear -> batch norm -> ReLU for both hidden layers, then ``fc3``."""
        stages = ((self.fc1, self.bn1), (self.fc2, self.bn2))
        for linear, norm in stages:
            x = F.relu(norm(linear(x)))
        return self.fc3(x)
65 |
66 |
np.random.seed(42)
# np.random.seed does not touch torch's generator, which drives the weight
# initialisation and the dropout masks below, so seed it as well.
torch.manual_seed(42)
device = f'cuda:{torch.cuda.device_count() - 1}' if torch.cuda.is_available(
) else 'cpu'

net1 = DNN().to(device)
net2 = DNN_dropout().to(device)
net3 = DNN_BN().to(device)
print(net1)
print(net2)
print(net3)
optim1 = Adam(net1.parameters(), lr=5e-5)
optim2 = Adam(net2.parameters(), lr=5e-5)
optim3 = Adam(net3.parameters(), lr=5e-5)
loss = nn.CrossEntropyLoss()
EPOCHES = 50
BATCH_SIZE = 40

train_loader = data_loader(torch.tensor(train_X), torch.tensor(train_y),
                           BATCH_SIZE, True)

train_accs, test_accs = [], []
test_X_cuda = torch.tensor(test_X, device=device)
test_y_cuda = torch.tensor(test_y, device=device)

bar = tqdm(range(EPOCHES))

for epoch in bar:
    # Train the three networks on the same batches.
    net1.train()
    net2.train()
    net3.train()

    for batch_X, batch_y in train_loader:
        input_, label = batch_X.to(device), batch_y.to(device)

        output1 = net1(input_)
        l1 = loss(output1, label)
        output2 = net2(input_)
        l2 = loss(output2, label)
        output3 = net3(input_)
        l3 = loss(output3, label)

        optim1.zero_grad()
        optim2.zero_grad()
        optim3.zero_grad()
        # The networks share no parameters, so one backward pass over the
        # summed loss produces each network's own gradients.
        (l1 + l2 + l3).backward()
        optim1.step()
        optim2.step()
        optim3.step()

    net1.eval()
    net2.eval()
    net3.eval()

    # Accuracy on the training and test data; no gradients are needed here.
    train_right = [0, 0, 0]
    with torch.no_grad():
        for batch_X, batch_y in train_loader:
            input_, label = batch_X.to(device), batch_y.to(device)
            pred1 = net1(input_).argmax(-1)
            pred2 = net2(input_).argmax(-1)
            pred3 = net3(input_).argmax(-1)

            train_right[0] += pred1.eq(label).sum().item()
            train_right[1] += pred2.eq(label).sum().item()
            train_right[2] += pred3.eq(label).sum().item()

        train_acc = np.array(train_right) / len(train_X)

        pred1, pred2, pred3 = (
            net1(test_X_cuda).argmax(-1),
            net2(test_X_cuda).argmax(-1),
            net3(test_X_cuda).argmax(-1),
        )
        test_acc = np.array([
            pred1.eq(test_y_cuda).float().mean().item(),
            pred2.eq(test_y_cuda).float().mean().item(),
            pred3.eq(test_y_cuda).float().mean().item(),
        ])

    bar.set_postfix(
        TRAIN_ACC="{:.3f}, {:.3f}, {:.3f}".format(*train_acc),
        TEST_ACC="{:.3f}, {:.3f}, {:.3f}".format(*test_acc),
    )
    train_accs.append(train_acc)
    test_accs.append(test_acc)

train_accs = np.array(train_accs)
test_accs = np.array(test_accs)

plt.figure(figsize=(9, 3))

plt.rcParams['font.sans-serif'] = ['Times New Roman']
plt.rcParams['mathtext.fontset'] = 'stix'
plt.rcParams['xtick.direction'] = 'in'
plt.rcParams['ytick.direction'] = 'in'
plt.rcParams['axes.linewidth'] = 0.5

plot_kwargs = {'linewidth': 0.7, 'zorder': 10}

# Every second epoch is plotted. The x positions, axis limit and ticks are
# derived from EPOCHES so the figure stays consistent if the constant changes.
x = np.arange(0, EPOCHES, 2) + 2

# (legend label, line colour, marker) for net1, net2, net3 in column order.
curves = (
    ("MLP", 'blue', '^'),
    ("MLP with Dropout", 'green', 's'),
    ("MLP with BN", 'red', '*'),
)
# (subplot position, accuracy history, upper y limit, title)
panels = (
    (1, train_accs, 1.05, "Training Accuracy on Olivetti Faces Dataset"),
    (2, test_accs, 1., "Test Accuracy on Olivetti Faces Dataset"),
)

for position, accs, y_max, title in panels:
    plt.subplot(1, 2, position)
    plt.grid(zorder=-10)

    plt.xlim(2, EPOCHES)
    plt.ylim(0, y_max)

    for column, (label, color, marker) in enumerate(curves):
        plt.plot(x,
                 accs[::2, column],
                 label=label,
                 color=color,
                 marker=marker,
                 **plot_kwargs)

    plt.yticks([0, .2, .4, .6, .8, 1], size=13)
    plt.xticks(list(range(10, EPOCHES + 1, 10)), size=13)
    plt.xlabel("Epochs", size=13)
    plt.title(title)
    plt.legend()
    plt.tight_layout()

plt.savefig("imgs/dropout_bn.png")
234 |
--------------------------------------------------------------------------------
/examples/pydynet/dropout_bn.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | from sklearn.datasets import fetch_olivetti_faces
3 | from sklearn.preprocessing import MinMaxScaler
4 | from sklearn.model_selection import train_test_split
5 | from tqdm import tqdm
6 |
7 | import numpy as np
8 | import pydynet as pdn
9 | import pydynet.nn.functional as F
10 | import pydynet.nn as nn
11 | from pydynet.optim import Adam
12 | from pydynet.data import data_loader
13 |
# Fetch the Olivetti faces as a (features, labels) pair and report the feature shape.
data_X, data_y = fetch_olivetti_faces(return_X_y=True)
print(data_X.shape)

# Stratified 80/20 split with a fixed random state.
splits = train_test_split(data_X,
                          data_y,
                          train_size=0.8,
                          stratify=data_y,
                          random_state=42)
train_X, test_X, train_y, test_y = splits

# Fit the scaler on the training features only, then reuse it for the test features.
scaler = MinMaxScaler()
train_X = scaler.fit_transform(train_X)
test_X = scaler.transform(test_X)
26 |
27 |
class DNN(nn.Module):
    """Plain fully connected classifier: 4096 -> 512 -> 128 -> 40 (float32 layers)."""

    def __init__(self) -> None:
        super().__init__()
        self.fc1 = nn.Linear(4096, 512, dtype=np.float32)
        self.fc2 = nn.Linear(512, 128, dtype=np.float32)
        self.fc3 = nn.Linear(128, 40, dtype=np.float32)

    def forward(self, x):
        """Apply ``fc1`` and ``fc2`` with ReLU, then ``fc3`` without activation."""
        for hidden_layer in (self.fc1, self.fc2):
            x = F.relu(hidden_layer(x))
        return self.fc3(x)
40 |
41 |
class DNN_dropout(DNN):
    """``DNN`` with dropout (p=0.05) applied to both hidden pre-activations."""

    def __init__(self) -> None:
        super().__init__()
        self.dropout = nn.Dropout(p=0.05)

    def forward(self, x):
        """Apply linear -> dropout -> ReLU for ``fc1`` and ``fc2``, then ``fc3``."""
        for hidden_layer in (self.fc1, self.fc2):
            x = F.relu(self.dropout(hidden_layer(x)))
        return self.fc3(x)
52 |
53 |
class DNN_BN(DNN):
    """``DNN`` with batch normalisation after each hidden linear layer."""

    def __init__(self) -> None:
        super().__init__()
        self.bn1 = nn.BatchNorm1d(512, dtype=np.float32)
        self.bn2 = nn.BatchNorm1d(128, dtype=np.float32)

    def forward(self, x):
        """Apply linear -> batch norm -> ReLU for both hidden layers, then ``fc3``."""
        stages = ((self.fc1, self.bn1), (self.fc2, self.bn2))
        for linear, norm in stages:
            x = F.relu(norm(linear(x)))
        return self.fc3(x)
65 |
66 |
np.random.seed(42)
device = f'cuda:{pdn.cuda.device_count() - 1}' if pdn.cuda.is_available(
) else 'cpu'

net1 = DNN().to(device)
net2 = DNN_dropout().to(device)
net3 = DNN_BN().to(device)
print(net1)
print(net2)
print(net3)
optim1 = Adam(net1.parameters(), lr=5e-5)
optim2 = Adam(net2.parameters(), lr=5e-5)
optim3 = Adam(net3.parameters(), lr=5e-5)
loss = nn.CrossEntropyLoss()
EPOCHES = 50
BATCH_SIZE = 40

train_loader = data_loader(pdn.Tensor(train_X), pdn.Tensor(train_y),
                           BATCH_SIZE, True)

train_accs, test_accs = [], []
test_X_cuda = pdn.Tensor(test_X, device=device)
test_y_cuda = pdn.Tensor(test_y, device=device)

bar = tqdm(range(EPOCHES))

for epoch in bar:
    # Train the three networks on the same batches.
    net1.train()
    net2.train()
    net3.train()

    for batch_X, batch_y in train_loader:
        input_, label = batch_X.to(device), batch_y.to(device)

        output1 = net1(input_)
        l1 = loss(output1, label)
        output2 = net2(input_)
        l2 = loss(output2, label)
        output3 = net3(input_)
        l3 = loss(output3, label)

        optim1.zero_grad()
        optim2.zero_grad()
        optim3.zero_grad()
        # The networks share no parameters, so one backward pass over the
        # summed loss produces each network's own gradients.
        (l1 + l2 + l3).backward()
        optim1.step()
        optim2.step()
        optim3.step()

    net1.eval()
    net2.eval()
    net3.eval()

    # Accuracy on the training and test data; no gradients are needed here.
    train_right = [0, 0, 0]
    with pdn.no_grad():
        for batch_X, batch_y in train_loader:
            input_, label = batch_X.to(device), batch_y.to(device)
            pred1 = net1(input_).argmax(-1)
            pred2 = net2(input_).argmax(-1)
            pred3 = net3(input_).argmax(-1)

            train_right[0] += pred1.eq(label).sum().item()
            train_right[1] += pred2.eq(label).sum().item()
            train_right[2] += pred3.eq(label).sum().item()

        train_acc = np.array(train_right) / len(train_X)

        pred1, pred2, pred3 = (
            net1(test_X_cuda).argmax(-1),
            net2(test_X_cuda).argmax(-1),
            net3(test_X_cuda).argmax(-1),
        )
        test_acc = np.array([
            pred1.eq(test_y_cuda.data).mean().item(),
            pred2.eq(test_y_cuda.data).mean().item(),
            pred3.eq(test_y_cuda.data).mean().item(),
        ])

    bar.set_postfix(
        TRAIN_ACC="{:.3f}, {:.3f}, {:.3f}".format(*train_acc),
        TEST_ACC="{:.3f}, {:.3f}, {:.3f}".format(*test_acc),
    )
    train_accs.append(train_acc)
    test_accs.append(test_acc)

train_accs = np.array(train_accs)
test_accs = np.array(test_accs)

plt.figure(figsize=(9, 3))

plt.rcParams['font.sans-serif'] = ['Times New Roman']
plt.rcParams['mathtext.fontset'] = 'stix'
plt.rcParams['xtick.direction'] = 'in'
plt.rcParams['ytick.direction'] = 'in'
plt.rcParams['axes.linewidth'] = 0.5

plot_kwargs = {'linewidth': 0.7, 'zorder': 10}

# Every second epoch is plotted. The x positions, axis limit and ticks are
# derived from EPOCHES so the figure stays consistent if the constant changes.
x = np.arange(0, EPOCHES, 2) + 2

# (legend label, line colour, marker) for net1, net2, net3 in column order.
curves = (
    ("MLP", 'blue', '^'),
    ("MLP with Dropout", 'green', 's'),
    ("MLP with BN", 'red', '*'),
)
# (subplot position, accuracy history, upper y limit, title)
panels = (
    (1, train_accs, 1.05, "Training Accuracy on Olivetti Faces Dataset"),
    (2, test_accs, 1., "Test Accuracy on Olivetti Faces Dataset"),
)

for position, accs, y_max, title in panels:
    plt.subplot(1, 2, position)
    plt.grid(zorder=-10)

    plt.xlim(2, EPOCHES)
    plt.ylim(0, y_max)

    for column, (label, color, marker) in enumerate(curves):
        plt.plot(x,
                 accs[::2, column],
                 label=label,
                 color=color,
                 marker=marker,
                 **plot_kwargs)

    plt.yticks([0, .2, .4, .6, .8, 1], size=13)
    plt.xticks(list(range(10, EPOCHES + 1, 10)), size=13)
    plt.xlabel("Epochs", size=13)
    plt.title(title)
    plt.legend()
    plt.tight_layout()

plt.savefig("imgs/dropout_bn.png")
234 |
--------------------------------------------------------------------------------
/pydynet/core/function.py:
--------------------------------------------------------------------------------
1 | from .tensor import Tensor, swapaxes
2 |
3 |
def sqrt(x: Tensor):
    '''Square root, computed by raising ``x`` to the power 0.5.'''
    return pow(x, 0.5)
7 |
8 |
def square(x: Tensor):
    '''Square, computed as ``x`` multiplied by itself.'''
    product = x * x
    return product
12 |
13 |
def _division_points(total: int, indices_or_sections) -> list:
    '''Return the cut positions (including both ends) for an axis of length ``total``.

    Shared by ``vsplit``, ``hsplit``, ``dsplit`` and ``split``.

    Args:
        total: Length of the axis being split.
        indices_or_sections: A sized sequence of interior cut indices, or a
            scalar number of equal sections.

    Returns:
        A list ``[0, ..., total]``; consecutive pairs delimit one piece each.

    Raises:
        ValueError: If a scalar section count is not positive.
        AssertionError: If a scalar section count does not divide ``total``.
    '''
    try:
        n_interior = len(indices_or_sections)
    except TypeError:
        n_interior = None

    if n_interior is not None:
        # Explicit cut indices were given.
        return [0, *indices_or_sections, total]

    # Scalar case. Validate positivity first so that 0 raises the intended
    # ValueError instead of a ZeroDivisionError from the modulo below.
    n_sections = int(indices_or_sections)
    if n_sections <= 0:
        raise ValueError('number sections must be larger than 0.')
    assert total % indices_or_sections == 0, 'array split does not result in an equal division'

    each, extras = divmod(total, n_sections)
    sizes = extras * [each + 1] + (n_sections - extras) * [each]
    # Cumulative sum in plain Python ints; no backend array is needed for
    # slice bounds.
    points = [0]
    for size in sizes:
        points.append(points[-1] + size)
    return points


def vsplit(x: Tensor, indices_or_sections: int | tuple) -> list[Tensor]:
    '''Split ``x`` along axis 0.

    Args:
        x: Tensor to split; other inputs are wrapped in ``Tensor``.
        indices_or_sections: Number of equal sections, or a sequence of cut
            indices.

    Returns:
        List of slices ``x[start:end]``.
    '''
    if not isinstance(x, Tensor):
        x = Tensor(x)

    points = _division_points(x.shape[0], indices_or_sections)
    return [x[st:end] for st, end in zip(points, points[1:])]


def hsplit(x: Tensor, indices_or_sections: int | tuple) -> list[Tensor]:
    '''Split ``x`` along axis 1.

    Args:
        x: Tensor to split; other inputs are wrapped in ``Tensor``.
        indices_or_sections: Number of equal sections, or a sequence of cut
            indices.

    Returns:
        List of slices ``x[:, start:end]``.
    '''
    if not isinstance(x, Tensor):
        x = Tensor(x)

    points = _division_points(x.shape[1], indices_or_sections)
    return [x[:, st:end] for st, end in zip(points, points[1:])]


def dsplit(x: Tensor, indices_or_sections: int | tuple) -> list[Tensor]:
    '''Split ``x`` along axis 2.

    Args:
        x: Tensor to split; other inputs are wrapped in ``Tensor``.
        indices_or_sections: Number of equal sections, or a sequence of cut
            indices.

    Returns:
        List of slices ``x[:, :, start:end]``.
    '''
    if not isinstance(x, Tensor):
        x = Tensor(x)

    points = _division_points(x.shape[2], indices_or_sections)
    return [x[:, :, st:end] for st, end in zip(points, points[1:])]


def split(
    x: Tensor,
    indices_or_sections: int | tuple,
    axis: int = 0,
) -> list[Tensor]:
    '''Split ``x`` along ``axis``.

    Axes 0, 1 and 2 (or their negative equivalents) are delegated to
    ``vsplit``, ``hsplit`` and ``dsplit``. Any other axis is swapped to the
    front, sliced, and swapped back.

    Args:
        x: Tensor to split; other inputs are wrapped in ``Tensor``.
        indices_or_sections: Number of equal sections, or a sequence of cut
            indices.
        axis: Axis to split along.

    Returns:
        List of sub-tensors in order along ``axis``.
    '''
    if not isinstance(x, Tensor):
        x = Tensor(x)

    if axis == 0 or axis == -x.ndim:
        return vsplit(x, indices_or_sections)
    elif axis == 1 or axis == -x.ndim + 1:
        return hsplit(x, indices_or_sections)
    elif axis == 2 or axis == -x.ndim + 2:
        return dsplit(x, indices_or_sections)

    points = _division_points(x.shape[axis], indices_or_sections)
    front = swapaxes(x, 0, axis)
    return [
        swapaxes(front[st:end], axis, 0)
        for st, end in zip(points, points[1:])
    ]
167 |
168 |
def unsqueeze(x: Tensor, axis):
    '''Insert size-1 axes into ``x``; equivalent to ``numpy.expand_dims``.

    The implementation follows NumPy's ``expand_dims`` source.

    Args:
        x: Object exposing ``ndim``, ``shape`` and ``reshape``.
        axis: Position, or tuple/list of positions, of the new axes in the
            result. Negative values count from the end of the result.

    Returns:
        ``x`` reshaped with the extra unit axes.

    Raises:
        Whatever ``normalize_axis_tuple`` raises for out-of-range or
        repeated axes.
    '''
    # NumPy 2 exposes the helper publicly in numpy.lib.array_utils and
    # deprecates the private numpy.core path; older releases only have the latter.
    try:
        from numpy.lib.array_utils import normalize_axis_tuple
    except ImportError:
        from numpy.core.numeric import normalize_axis_tuple

    if not isinstance(axis, (tuple, list)):
        axis = (axis, )

    out_ndim = len(axis) + x.ndim
    axis = normalize_axis_tuple(axis, out_ndim)

    # Walk the output axes: emit 1 at each requested position, otherwise
    # take the next original dimension.
    shape_it = iter(x.shape)
    shape = [1 if ax in axis else next(shape_it) for ax in range(out_ndim)]
    return x.reshape(*shape)
182 |
183 |
def squeeze(x: Tensor, axis=None):
    '''Remove size-1 axes from ``x``.

    Args:
        x: Object exposing ``shape`` and ``reshape``.
        axis: ``None`` to drop every size-1 axis, or an int / iterable of
            ints naming the axes to drop. Negative values count from the end.

    Returns:
        ``x`` reshaped without the selected axes.

    Raises:
        ValueError: If an axis is out of range or does not have size 1.
    '''
    shape = x.shape
    ndim = len(shape)
    if axis is None:
        new_shape = tuple(dim for dim in shape if dim != 1)
    else:
        if isinstance(axis, int):
            axis = (axis, )

        dropped = set()
        for ax in axis:
            if ax >= ndim or ax < -ndim:
                raise ValueError("Axis out of range")
            if shape[ax] != 1:
                raise ValueError(
                    f"Cannot squeeze axis {ax} with size {shape[ax]}")
            # Store the non-negative index. The filter below compares against
            # enumerate() positions, so a raw negative axis would never match
            # and the axis would silently be kept.
            dropped.add(ax % ndim)

        # Build the new shape without the selected axes.
        new_shape = tuple(dim for i, dim in enumerate(shape)
                          if i not in dropped)

    # Return the reshaped array.
    return x.reshape(*new_shape)
205 |
--------------------------------------------------------------------------------
/llm/llama/model.py:
--------------------------------------------------------------------------------
1 | import math
2 | import numpy as np
3 |
4 | import pydynet as pdn
5 | from pydynet.core.tensor import Tensor
6 | import pydynet.nn as nn
7 | import pydynet.nn.functional as F
8 |
9 |
def compute_cos_sin_cache(head_dim: int,
                          max_seq_len: int,
                          base: int = 10000,
                          dtype=None):
    """Precompute cosine and sine tables for rotary position embeddings.

    Args:
        head_dim: Per-head feature size; ``head_dim // 2`` frequencies are used.
        max_seq_len: Number of positions (rows) in the tables.
        base: Base of the geometric frequency progression.
        dtype: dtype the angle table is cast to before taking cos/sin.

    Returns:
        ``(cos, sin)`` tensors, each of shape ``(max_seq_len, head_dim // 2)``.
    """
    half = head_dim // 2
    exponents = np.arange(0, head_dim, 2)[:half] / head_dim
    inv_freq = 1.0 / (base**exponents)
    positions = np.arange(max_seq_len)
    # Outer product gives angle[p, i] = position p * frequency i.
    angles = np.outer(positions, inv_freq).astype(dtype)

    return Tensor(np.cos(angles)), Tensor(np.sin(angles))
20 |
21 |
def apply_rotary_emb(xq: Tensor, xk: Tensor, freqs_cos, freqs_sin):
    """Rotate query and key features pairwise by the given cos/sin tables.

    The last axis of each input is viewed as consecutive (real, imaginary)
    pairs, and each pair is rotated by the matching angle.

    Args:
        xq: Query tensor.
        xk: Key tensor.
        freqs_cos: Cosine table; a unit axis is inserted at position ``-2``
            before it is broadcast against the inputs.
        freqs_sin: Sine table, handled like ``freqs_cos``.

    Returns:
        ``(xq_out, xk_out)`` with the same shapes as the inputs.
    """
    cos = pdn.unsqueeze(freqs_cos, axis=-2)
    sin = pdn.unsqueeze(freqs_sin, axis=-2)

    def rotate(t):
        # View the last axis as (pairs, 2) and separate the two components.
        pairs = t.reshape(*(t.shape[:-1] + (-1, 2)))
        real, imag = pairs[..., 0], pairs[..., 1]

        # Apply the rotation using real arithmetic only.
        out_real = pdn.unsqueeze(real * cos - imag * sin, -1)
        out_imag = pdn.unsqueeze(real * sin + imag * cos, -1)

        # Interleave the components again and flatten the last two axes.
        out = pdn.concat([out_real, out_imag], axis=-1)
        return out.reshape(*(out.shape[:-2] + (-1, )))

    return rotate(xq), rotate(xk)
44 |
45 |
class FeedForward(nn.Module):
    """Gated feed-forward block: ``down(silu(gate(x)) * up(x))``, all bias-free."""

    def __init__(self, dim, up_dim, dtype=None):
        super().__init__()
        self.dim, self.up_dim = dim, up_dim
        self.up = nn.Linear(dim, up_dim, bias=False, dtype=dtype)
        self.gate = nn.Linear(dim, up_dim, bias=False, dtype=dtype)
        self.down = nn.Linear(up_dim, dim, bias=False, dtype=dtype)

    def forward(self, x):
        """Return the gated projection of ``x`` mapped back to ``dim`` features."""
        gated = F.silu(self.gate(x))
        lifted = self.up(x)
        return self.down(gated * lifted)
58 |
59 |
class Attention(nn.Module):
    """Multi-head self-attention with rotary embeddings and a key/value cache."""

    def __init__(
        self,
        dim: int,
        n_heads: int,
        max_seq_len: int,
        max_batch_size: int = None,
        dtype=None,
    ):
        super().__init__()
        self.dim = dim
        self.n_heads = n_heads

        # Each head receives an equal slice of the feature axis.
        assert dim % n_heads == 0
        self.head_dim = dim // n_heads

        # Bias-free projections for query, key, value and output.
        self.Q = nn.Linear(self.dim, self.dim, bias=False, dtype=dtype)
        self.K = nn.Linear(self.dim, self.dim, bias=False, dtype=dtype)
        self.V = nn.Linear(self.dim, self.dim, bias=False, dtype=dtype)
        self.O = nn.Linear(self.dim, self.dim, bias=False, dtype=dtype)

        self.max_seq_len = max_seq_len
        # A missing batch size falls back to 1.
        self.max_batch_size = max_batch_size if max_batch_size is not None else 1

        # Zero-initialised key/value caches of shape
        # (max_batch_size, max_seq_len, n_heads, head_dim), excluded from gradients.
        self.cache_k = nn.Parameter(pdn.special.zeros(
            (self.max_batch_size, max_seq_len, self.n_heads, self.head_dim),
            dtype=dtype),
                                    requires_grad=False)
        self.cache_v = nn.Parameter(pdn.special.zeros(
            (self.max_batch_size, max_seq_len, self.n_heads, self.head_dim),
            dtype=dtype),
                                    requires_grad=False)

    # NOTE(review): this class defines __call__ directly rather than forward —
    # confirm that bypassing the base-class call path is intended.
    def __call__(self, x, start_pos: int, mask, freqs_cos, freqs_sin):
        """Attend over ``x`` (unpacked as ``(B, L, _)``).

        Args:
            x: Input with three axes.
            start_pos: Offset into the cache at which this chunk is written.
            mask: Additive mask added to the attention scores, or ``None``.
            freqs_cos: Cosine table passed to ``apply_rotary_emb``.
            freqs_sin: Sine table passed to ``apply_rotary_emb``.

        Returns:
            Output of the ``O`` projection applied to the merged heads.
        """
        B, L, _ = x.shape
        xq, xk, xv = (
            self.Q(x).reshape(B, L, self.n_heads, self.head_dim),
            self.K(x).reshape(B, L, self.n_heads, self.head_dim),
            self.V(x).reshape(B, L, self.n_heads, self.head_dim),
        )

        xq, xk = apply_rotary_emb(xq, xk, freqs_cos, freqs_sin)

        if not self._train:
            # Outside training: store this chunk's keys/values in the cache ...
            self.cache_k[:B, start_pos:start_pos + L] = xk
            self.cache_v[:B, start_pos:start_pos + L] = xv

            # ... and attend over everything cached up to the current position.
            # NOTE(review): the read-back is assumed to belong to this branch;
            # the dump lost indentation — confirm against upstream.
            xk = self.cache_k[:B, :start_pos + L]
            xv = self.cache_v[:B, :start_pos + L]

        # Move the head axis forward; the key is also transposed for the matmul.
        xq, xkT = xq.transpose(0, 2, 1, 3), xk.transpose(0, 2, 3, 1)
        attention = xq @ xkT / math.sqrt(self.head_dim)

        if mask is not None:
            attention = attention + mask
        attention = F.softmax(attention, axis=-1)
        output = attention @ xv.transpose(0, 2, 1, 3)

        # Back to (B, L, heads, head_dim), then merge the heads.
        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
        return self.O(output)
121 |
122 |
class TransformerBlock(nn.Module):
    """Pre-norm decoder block: attention and feed-forward, each with a residual.

    Parameters
    ----------
    dim : int
        Model width.
    n_heads : int
        Number of attention heads.
    ffn_dim : int
        Hidden width of the feed-forward sub-layer.
    max_seq_len : int
        Forwarded to :class:`Attention`.
    max_batch_size : int, default=None
        Forwarded to :class:`Attention`.
    dtype : default=None
        Data type forwarded to every sub-module.
    """

    def __init__(
        self,
        dim: int,
        n_heads: int,
        ffn_dim: int,
        max_seq_len: int,
        max_batch_size: int = None,
        dtype=None,
    ):
        super().__init__()
        self.attention = Attention(
            dim,
            n_heads,
            max_seq_len,
            max_batch_size,
            dtype,
        )
        self.ffn = FeedForward(dim, ffn_dim, dtype)
        self.input_norm = nn.RMSNorm(dim, dtype=dtype)
        self.post_attn_norm = nn.RMSNorm(dim, dtype=dtype)

    def forward(self, x, start_pos: int, mask, freqs_cos, freqs_sin):
        """Return ``x`` after the attention and feed-forward residual updates."""
        attn_out = self.attention(
            self.input_norm(x),
            start_pos,
            mask,
            freqs_cos,
            freqs_sin,
        )
        after_attn = x + attn_out

        ffn_out = self.ffn(self.post_attn_norm(after_attn))
        return after_attn + ffn_out
150 |
151 |
class Llama(nn.Module):
    """Decoder-only Llama-style language model.

    Parameters
    ----------
    vocab_size : int
        Number of rows of the token embedding and size of the output logits.
    embed_dim : int
        Model width.
    n_heads : int
        Number of attention heads in every block.
    ffn_dim : int
        Hidden width of the feed-forward sub-layers.
    max_seq_len : int
        Number of positions for which rotary tables are precomputed; also
        forwarded to every block.
    max_batch_size : int, default=None
        Forwarded to every block.
    n_layers : int, default=6
        Number of transformer blocks.
    dtype : default=None
        Data type forwarded to every sub-module.
    """

    def __init__(
        self,
        vocab_size,
        embed_dim,
        n_heads,
        ffn_dim: int,
        max_seq_len: int,
        max_batch_size: int = None,
        n_layers: int = 6,
        dtype=None,
    ):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.n_heads = n_heads
        self.ffn_dim = ffn_dim
        self.max_seq_len = max_seq_len
        self.max_batch_size = max_batch_size
        self.n_layers = n_layers

        self.tok_embedding = nn.Embedding(vocab_size, embed_dim, dtype=dtype)

        # Rotary tables are computed once and stored without gradients.
        freqs_cos, freqs_sin = compute_cos_sin_cache(embed_dim // n_heads,
                                                     max_seq_len,
                                                     dtype=dtype)
        self.freqs_cos = nn.parameter.Parameter(freqs_cos, False)
        self.freqs_sin = nn.parameter.Parameter(freqs_sin, False)

        self.layers = nn.ModuleList([
            TransformerBlock(embed_dim, n_heads, ffn_dim, max_seq_len,
                             max_batch_size, dtype)
            for _ in range(self.n_layers)
        ])

        self.norm = nn.RMSNorm(embed_dim, dtype=dtype)
        self.lm_head = nn.Linear(embed_dim, vocab_size, dtype=dtype)

    def forward(self, input_ids, start_pos: int):
        """Return the logits of the last position of ``input_ids``.

        ``start_pos`` is the sequence position of the first token of
        ``input_ids``; it selects the rotary table rows and is forwarded to
        every block.
        """
        L = input_ids.shape[-1]
        h = self.tok_embedding(input_ids)

        freqs_cos = self.freqs_cos[start_pos:start_pos + L]
        freqs_sin = self.freqs_sin[start_pos:start_pos + L]

        mask = None
        if L > 1:
            # Causal mask for the new tokens, left-padded with zeros so the
            # new tokens may attend to all ``start_pos`` earlier positions.
            mask = np.triu(np.full((L, L), float("-inf")), k=1)
            mask = np.concatenate([np.zeros((L, start_pos)), mask], axis=1)
            mask = pdn.Tensor(mask, device=h.device, dtype=h.dtype)

        for layer in self.layers:
            h = layer(h, start_pos, mask, freqs_cos, freqs_sin)

        # Only the final position is projected to the vocabulary.
        logit = self.lm_head(self.norm(h)[:, [-1], :])
        return logit

    def generate(self, input_ids, max_new_tokens: int):
        """Greedily yield one token id tensor per decoding step.

        The loop runs over ``range(L, max_new_tokens)`` where ``L`` is the
        prompt length, so ``max_new_tokens`` bounds the total sequence length
        (prompt plus generated tokens), not the number of generated tokens.
        """
        _, L = input_ids.shape
        next_id = None
        for i, curr_pos in enumerate(range(L, max_new_tokens)):
            if i == 0:  # Prefill phase: the whole prompt, positions 0..L-1.
                inputs = input_ids
                pos = 0
            else:  # Decode phase: feed back the token produced last step.
                inputs = next_id
                # That token occupies sequence position ``curr_pos - 1`` (the
                # first generated token sits at position L). Using
                # ``curr_pos`` would skip cache slot L, leaving it zero, and
                # shift every later rotary position by one.
                pos = curr_pos - 1
            logits = self(inputs, pos)
            next_id = logits[:, -1, :].argmax(-1, True)
            yield next_id
222 |
--------------------------------------------------------------------------------
/pydynet/nn/modules/norm.py:
--------------------------------------------------------------------------------
1 | from .module import Module
2 | from ..parameter import Parameter
3 | from .. import init
4 | from ...special import empty
5 | from ... import core
6 | from ...cuda import Device
7 |
8 |
class BatchNorm1d(Module):
    '''
    One-dimensional Batch Normalization layer.

    Parameters
    ----------
    num_features : int
        Number of input features.
    eps : float, default=1e-6
        Small term added to the variance to avoid division by zero.
    momentum : float, default=0.1
        Weight of the current batch when updating the running mean/variance.
    device : Optional[Device], default=None
        Device on which the layer data lives.
    dtype : default=None
        Data type of the layer data.
    '''

    def __init__(
        self,
        num_features: int,
        eps: float = 1e-6,
        momentum: float = 0.1,
        device=None,
        dtype=None,
    ) -> None:
        super().__init__()
        factory_kwargs = {"device": Device(device), "dtype": dtype}
        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum

        # Running statistics are stored as parameters without gradients.
        self.running_mean = Parameter(
            empty(self.num_features, **factory_kwargs),
            requires_grad=False,
        )
        self.running_var = Parameter(
            empty(self.num_features, **factory_kwargs),
            requires_grad=False,
        )
        # Learnable affine transform applied after normalization.
        self.scale = Parameter(empty(self.num_features, **factory_kwargs))
        self.shift = Parameter(empty(self.num_features, **factory_kwargs))
        self.reset_parameters()

    def reset_parameters(self):
        '''Reset running statistics and the affine transform to identity.'''
        init.zeros_(self.running_mean)
        init.ones_(self.running_var)
        init.zeros_(self.shift)
        init.ones_(self.scale)

    def forward(self, x):
        '''Normalize ``x`` over axis 0 (training) or with running statistics.'''
        if not self._train:
            scaled = (x - self.running_mean) * self.scale
            return scaled / core.sqrt(self.running_var + self.eps) + self.shift

        batch_mean = x.mean(0)
        centered = x - batch_mean
        batch_var = core.mean(core.square(centered), 0)
        normalized = centered / core.sqrt(batch_var + self.eps)

        # Exponential moving average of the batch statistics.
        self.running_mean *= (1 - self.momentum)
        self.running_mean += self.momentum * batch_mean
        self.running_var *= (1 - self.momentum)
        self.running_var += self.momentum * batch_var

        return normalized * self.scale + self.shift

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return (f"{cls_name}(num_features={self.num_features}, "
                f"momentum={self.momentum})")
81 |
82 |
class BatchNorm2d(Module):
    '''
    Two-dimensional Batch Normalization layer.

    Parameters
    ----------
    num_features : int
        Number of input features (channels).
    eps : float, default=1e-6
        Small term added to the variance to avoid division by zero.
    momentum : float, default=0.1
        Weight of the current batch when updating the running mean/variance.
    device : Optional[Device], default=None
        Device on which the layer data lives.
    dtype : default=None
        Data type of the layer data.
    '''

    def __init__(
        self,
        num_features: int,
        eps: float = 1e-6,
        momentum: float = 0.1,
        device=None,
        dtype=None,
    ) -> None:
        super().__init__()
        factory_kwargs = {"device": Device(device), "dtype": dtype}
        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum

        # Every statistic broadcasts against 4-axis input along axis 1.
        stat_shape = (1, self.num_features, 1, 1)
        self.running_mean = Parameter(
            empty(stat_shape, **factory_kwargs),
            requires_grad=False,
        )
        self.running_var = Parameter(
            empty(stat_shape, **factory_kwargs),
            requires_grad=False,
        )
        # The scale is created with unpacked dimensions, as in the original.
        self.scale = Parameter(empty(*stat_shape, **factory_kwargs))
        self.shift = Parameter(empty(stat_shape, **factory_kwargs))
        self.reset_parameters()

    def reset_parameters(self):
        '''Reset running statistics and the affine transform to identity.'''
        init.zeros_(self.running_mean)
        init.ones_(self.running_var)
        init.zeros_(self.shift)
        init.ones_(self.scale)

    def forward(self, x):
        '''Normalize ``x`` over axes (0, 2, 3) or with running statistics.'''
        if not self._train:
            scaled = (x - self.running_mean) * self.scale
            return scaled / core.sqrt(self.running_var + self.eps) + self.shift

        reduce_axes = (0, 2, 3)
        batch_mean = x.mean(reduce_axes, keepdims=True)
        centered = x - batch_mean
        batch_var = core.mean(core.square(centered),
                              reduce_axes,
                              keepdims=True)
        normalized = centered / core.sqrt(batch_var + self.eps)

        # Exponential moving average of the batch statistics.
        self.running_mean *= (1 - self.momentum)
        self.running_mean += self.momentum * batch_mean
        self.running_var *= (1 - self.momentum)
        self.running_var += self.momentum * batch_var

        return normalized * self.scale + self.shift

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return (f"{cls_name}(num_features={self.num_features}, "
                f"momentum={self.momentum})")
155 |
156 |
class LayerNorm(Module):
    '''
    Layer Normalization

    Each sample is normalized with the mean and variance computed over its
    trailing ``normalized_shape`` axes, then scaled and shifted element-wise.
    The computation is the same in training and evaluation mode.

    Parameters
    ----------
    normalized_shape : int or Tuple[int]
        Shape of the trailing axes to normalize over; an int is treated as a
        one-element tuple.
    eps : float, default=1e-6
        Small term added to the variance to avoid division by zero.
    momentum : float, default=0.1
        Accepted and stored for backward compatibility; layer normalization
        keeps no running statistics, so it does not affect the output.
    device : Optional[Device], default=None
        Device on which the layer data lives.
    dtype : default=None
        Data type of the layer data.
    '''

    def __init__(
        self,
        normalized_shape: int,
        eps: float = 1e-6,
        momentum: float = 0.1,
        device=None,
        dtype=None,
    ) -> None:
        super().__init__()
        kwargs = {"device": Device(device), "dtype": dtype}
        if isinstance(normalized_shape, int):
            normalized_shape = (normalized_shape, )
        self.normalized_shape = tuple(normalized_shape)
        # Negative indices of the trailing axes that are normalized.
        self.norm_axis = tuple(
            -(i + 1) for i in range(len(self.normalized_shape)))
        self.eps = eps
        self.momentum = momentum
        # Kept only so the set of parameters of existing models is unchanged;
        # they are initialised but no longer read or updated by forward().
        self.running_mean = Parameter(
            empty(normalized_shape, **kwargs),
            requires_grad=False,
        )
        self.running_var = Parameter(
            empty(normalized_shape, **kwargs),
            requires_grad=False,
        )
        self.scale = Parameter(empty(*normalized_shape, **kwargs))
        self.shift = Parameter(empty(normalized_shape, **kwargs))
        self.reset_parameters()

    def reset_parameters(self):
        '''Reset the legacy statistics and the affine transform to identity.'''
        init.zeros_(self.running_mean)
        init.ones_(self.running_var)
        init.zeros_(self.shift)
        init.ones_(self.scale)

    def forward(self, x):
        '''Normalize ``x`` over its trailing ``normalized_shape`` axes.'''
        # keepdims=True lets the per-sample statistics broadcast against x.
        mean = x.mean(self.norm_axis, keepdims=True)
        center_data = x - mean
        var = core.square(center_data).mean(self.norm_axis, keepdims=True)
        std_data = center_data / core.sqrt(var + self.eps)
        return std_data * self.scale + self.shift
219 |
220 |
class RMSNorm(Module):
    '''
    Root-mean-square normalization with a learnable element-wise weight.

    Parameters
    ----------
    normalized_shape : int or tuple
        Shape of the trailing axes to normalize over; an int is treated as a
        one-element tuple.
    eps : float, default=1e-6
        Small term added to the mean square to avoid division by zero.
    device : Optional[Device], default=None
        Device on which the layer data lives.
    dtype : default=None
        Data type of the layer data.
    '''

    def __init__(
        self,
        normalized_shape: tuple,
        eps: float = 1e-6,
        device=None,
        dtype=None,
    ):
        super().__init__()
        factory_kwargs = {"device": Device(device), "dtype": dtype}
        if isinstance(normalized_shape, int):
            normalized_shape = (normalized_shape, )
        self.normalized_shape = tuple(normalized_shape)
        # (-1, -2, ..., -k) for a k-dimensional normalized shape.
        self.sum_axis = tuple(
            -(offset + 1) for offset in range(len(self.normalized_shape)))
        self.eps = eps

        self.weight = Parameter(empty(self.normalized_shape, **factory_kwargs))
        self.reset_parameters()

    def reset_parameters(self):
        '''Initialise the weight to ones.'''
        init.ones_(self.weight)

    def forward(self, x):
        '''Divide ``x`` by its root mean square and apply the weight.'''
        mean_square = core.square(x).mean(self.sum_axis, keepdims=True)
        normalized = x / core.sqrt(mean_square + self.eps)
        return normalized * self.weight
249 |
--------------------------------------------------------------------------------
/examples/pytorch/transformer.py:
--------------------------------------------------------------------------------
1 | from os.path import join
2 | from tqdm import tqdm
3 |
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 | from torch.optim import Adam
8 | from pydynet.data import data_loader
9 |
10 | import numpy as np
11 |
12 | import numpy as np
13 | import matplotlib.pyplot as plt
14 | from sklearn.model_selection import train_test_split
15 |
# Seed NumPy's global random number generator.
np.random.seed(42)

# Directory that load_data() reads 'in_domain_train.tsv' from.
path = r'examples/data/CoLA/tokenized'
19 |
20 |
def extract(line: str):
    """Split one tab-separated record into ``(tokens, label)``.

    The label is the integer in the second field and the sentence is the last
    field. ``str.split()`` without arguments already discards the trailing
    newline, so a final line without a newline keeps its last character.
    """
    fields = line.split('\t')
    y = int(fields[1])
    return fields[-1].split(), y
26 |
27 |
def load_data():
    """Read the training file and encode every sentence as word ids.

    Returns
    -------
    X : numpy.ndarray of int, shape (number of lines, longest sentence)
        Word ids starting at 1; 0 marks unused trailing positions.
    y : numpy.ndarray
        Label of each line.
    """
    with open(join(path, 'in_domain_train.tsv'), 'r', encoding='utf-8') as f:
        lines = f.readlines()

    sens, ys = [], []
    max_len = -1
    vocab = set()
    for line in tqdm(lines):
        x, y = extract(line)
        vocab.update(x)
        max_len = max(max_len, len(x))
        sens.append(x)
        ys.append(y)

    # Sorting makes the word -> id mapping reproducible between runs, and the
    # dict gives O(1) lookups instead of a list scan per token.
    word_to_id = {word: idx for idx, word in enumerate(sorted(vocab), start=1)}

    X = np.zeros((len(lines), max_len), dtype=int)
    for i, sentence in enumerate(tqdm(sens)):
        for j, word in enumerate(sentence):
            X[i, j] = word_to_id[word]
    y = np.array(ys)

    return X, y
51 |
52 |
class SelfAttention(nn.Module):
    """Multi-head scaled dot-product attention with bias-free projections.

    Parameters
    ----------
    embed_size : int
        Model width; must be divisible by ``heads``.
    heads : int
        Number of attention heads.
    """

    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert (self.head_dim * heads == embed_size
                ), "Embedding size needs to be divisible by heads"

        self.Q = nn.Linear(self.embed_size, self.embed_size, bias=False)
        self.K = nn.Linear(self.embed_size, self.embed_size, bias=False)
        self.V = nn.Linear(self.embed_size, self.embed_size, bias=False)
        self.O = nn.Linear(self.embed_size, self.embed_size, bias=False)

    def forward(self, values, keys, query, mask):
        """Attend from ``query`` over ``keys``/``values``.

        ``mask`` may be ``None``; otherwise it must broadcast against the
        ``(batch, heads, query_len, key_len)`` scores, and every non-zero
        (or ``True``) entry marks a key position that is excluded. The mask
        is not modified.
        """
        N = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[
            1], query.shape[1]

        # Each projection reads its own input (identical for self-attention).
        xq = self.Q(query).reshape(N, query_len, self.heads, self.head_dim)
        xk = self.K(keys).reshape(N, key_len, self.heads, self.head_dim)
        xv = self.V(values).reshape(N, value_len, self.heads, self.head_dim)

        # Split the embedding into self.heads different pieces
        xq, xkT = xq.permute(0, 2, 1, 3), xk.permute(0, 2, 3, 1)
        attention = xq @ xkT / self.head_dim**.5

        if mask is not None:
            # Writing -inf into a boolean mask silently stores True, and
            # adding it would shift the scores by +1 instead of masking.
            # masked_fill works for boolean and for 0/1 float masks alike.
            attention = attention.masked_fill(mask.to(torch.bool),
                                              float("-inf"))

        attention = F.softmax(attention, dim=-1)
        output = attention @ xv.permute(0, 2, 1, 3)

        output = output.permute(0, 2, 1, 3).reshape(N, query_len, -1)
        return self.O(output)
93 |
94 |
class TransformerBlock(nn.Module):
    """Post-norm encoder block: attention and a ReLU MLP, each with a residual.

    Parameters
    ----------
    embed_size : int
        Model width.
    heads : int
        Number of attention heads.
    dropout : float
        Accepted but not used by this block.
    forward_expansion : int
        The MLP hidden width is ``forward_expansion * embed_size``.
    """

    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super().__init__()
        hidden_size = forward_expansion * embed_size

        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, embed_size),
        )

    def forward(self, value, key, query, mask):
        """Return the block output for the given inputs and mask."""
        attended = self.attention(value, key, query, mask)
        hidden = self.norm1(attended + query)
        expanded = self.feed_forward(hidden)
        return self.norm2(expanded + hidden)
121 |
122 |
def sinusoidal_positional_encoding(max_len: int, d_model: int):
    """Return the ``(max_len, d_model)`` sinusoidal position table.

    Even columns hold sines and odd columns hold cosines of
    ``position * 10000 ** (-2i / d_model)``. The result is a float32
    ``torch.Tensor``.
    """
    position = np.arange(max_len)[:, np.newaxis]
    div_term = np.exp(np.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
    angles = position * div_term

    pe = np.zeros((max_len, d_model))
    pe[:, 0::2] = np.sin(angles)
    # An odd d_model has one cosine column fewer than sine columns; trimming
    # is a no-op for even widths and avoids a shape mismatch for odd ones.
    pe[:, 1::2] = np.cos(angles[:, :d_model // 2])

    return torch.tensor(pe.astype(np.float32))
131 |
132 |
@torch.no_grad()
def construct_mask(x: torch.Tensor, padding_idx=0):
    """Return a boolean mask that is True where ``x`` equals ``padding_idx``.

    Two singleton axes are inserted after the first axis, so a
    ``[batch_size, seq_len]`` input gives ``[batch_size, 1, 1, seq_len]``.
    """
    is_padding = x == padding_idx  # [batch_size, seq_len]
    return is_padding[:, None, None]  # [batch_size, 1, 1, seq_len]
137 |
138 |
class Transformer(nn.Module):
    """Transformer encoder producing one score per input sequence.

    Parameters
    ----------
    embed_size : int
        Model width.
    num_layers : int
        Number of encoder blocks.
    heads : int
        Number of attention heads per block.
    forward_expansion : int
        Expansion factor of the feed-forward sub-layers.
    dropout : float
        Forwarded to every block.
    vocab_size : int
        Number of rows of the word embedding (id 0 is the padding id).
    max_length : int
        Number of positions in the fixed sinusoidal position table.
    """

    def __init__(
        self,
        embed_size,
        num_layers,
        heads,
        forward_expansion,
        dropout,
        vocab_size,
        max_length,
    ):
        super(Transformer, self).__init__()
        self.embed_size = embed_size
        self.word_embedding = nn.Embedding(
            vocab_size,
            embed_size,
            padding_idx=0,
        )
        # Fixed (non-trainable) sinusoidal position table.
        self.position_embedding = nn.Parameter(
            sinusoidal_positional_encoding(max_length, embed_size), False)

        self.layers = nn.ModuleList([
            TransformerBlock(
                embed_size,
                heads,
                dropout=dropout,
                forward_expansion=forward_expansion,
            ) for _ in range(num_layers)
        ])

        self.fc_out = nn.Linear(embed_size, 1)

    def forward(self, x, mask):
        """Return the score computed from the first position of each sequence."""
        a = self.word_embedding(x)
        # Use only as many position rows as the input has tokens, so inputs
        # shorter than ``max_length`` are accepted; a full-length input sees
        # the whole table exactly as before.
        out = a + self.position_embedding[:x.shape[1]]

        for layer in self.layers:
            out = layer(out, out, out, mask)

        out = out[:, 0, :]
        return self.fc_out(out)
181 |
182 |
if __name__ == "__main__":
    # Training script: fit the Transformer on the data returned by
    # load_data() and save the accuracy curves to imgs/transformer.png.
    LR = 5e-4
    EPOCHES = 100
    TRAIN_BATCH_SIZE = 128
    TEST_BATCH_SIZE = 512
    use_cuda = True

    device = 'cuda' if torch.cuda.is_available() and use_cuda else 'cpu'

    X, y = load_data()
    y[y == 0] = -1  # labels become -1 / +1 for the logistic loss below

    train_X, test_X, train_y, test_y = train_test_split(
        torch.tensor(X),
        torch.tensor(y),
        train_size=0.8,
        stratify=y,
        shuffle=True,
    )

    # Fraction of positive labels, used to re-weight the two classes.
    ratio_pos = (train_y.float().mean() + 1) / 2

    train_loader = data_loader(
        train_X,
        train_y,
        shuffle=False,
        batch_size=TRAIN_BATCH_SIZE,
    )
    test_loader = data_loader(
        test_X,
        test_y,
        shuffle=False,
        batch_size=TEST_BATCH_SIZE,
    )

    net = Transformer(512, 1, 4, 3, 0.05, X.max() + 1, 44).to(device)
    optimizer = Adam(net.parameters(), lr=LR)
    bar = tqdm(range(EPOCHES))
    info_list = []
    for epoch in bar:

        net.train()

        for batch_X, batch_y in train_loader:
            input_, label = batch_X.to(device), batch_y.to(device)
            output = net(input_, construct_mask(input_))
            # Create the weights on the same device as ``label``: indexing a
            # CPU tensor with a CUDA boolean mask raises a RuntimeError.
            weight = torch.ones(label.shape, device=device)
            weight[label == -1] = 1 / (1 - ratio_pos)
            weight[label == 1] = 1 / ratio_pos
            loss = (weight *
                    torch.log(1 + torch.exp(-label * torch.squeeze(output)))
                    ).mean()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        net.eval()
        train_right, train_size = 0, 0
        test_right, test_size = 0, 0

        with torch.no_grad():
            for batch_X, batch_y in train_loader:
                input_, label = batch_X.to(device), batch_y.to(device)
                pred = torch.sign(
                    torch.squeeze(net(input_, construct_mask(input_))))
                train_right += (pred.data == label.data).sum()
                train_size += batch_X.shape[0]

            for batch_X, batch_y in test_loader:
                input_, label = batch_X.to(device), batch_y.to(device)
                pred = torch.sign(
                    torch.squeeze(net(input_, construct_mask(input_))))
                test_right += (pred.data == label.data).sum()
                test_size += batch_X.shape[0]

        train_acc, test_acc = train_right / train_size, test_right / test_size
        bar.set_postfix(
            Loss="{:.6f}".format(loss.item()),
            TEST_ACC="{:.4f}".format(test_acc),
            TRAIN_ACC="{:.4f}".format(train_acc),
        )
        info_list.append([train_acc.item(), test_acc.item()])

    info_list = np.array(info_list)

    plt.figure(figsize=(5, 3))

    plt.rcParams['font.sans-serif'] = ['Times New Roman']
    plt.rcParams['mathtext.fontset'] = 'stix'
    plt.rcParams['xtick.direction'] = 'in'
    plt.rcParams['ytick.direction'] = 'in'
    plt.rcParams['axes.linewidth'] = 0.5

    plt.grid(zorder=-10)
    plot_kwargs = {'linewidth': 0.7, 'zorder': 10}

    # Every 4th epoch is plotted; deriving the x positions from EPOCHES keeps
    # them the same length as ``info_list[::4]`` if EPOCHES is changed.
    x = np.arange(0, EPOCHES, 4) + 2
    plt.plot(x,
             info_list[::4, 0],
             label="Training accuracy",
             color='blue',
             marker='^',
             **plot_kwargs,
             linestyle='-')
    plt.plot(x,
             info_list[::4, 1],
             label="Test accuracy",
             color='red',
             marker='*',
             **plot_kwargs,
             linestyle='--')

    plt.xlim(0, EPOCHES)
    plt.ylim(.4, 1)

    plt.yticks([.4, .6, .8, 1], size=13)
    plt.xticks([20, 40, 60, 80, 100], size=13)
    plt.xlabel("Epochs", size=13)
    plt.legend()
    plt.tight_layout()
    plt.savefig("imgs/transformer.png")
304 |
--------------------------------------------------------------------------------
/examples/pydynet/transformer.py:
--------------------------------------------------------------------------------
1 | from os.path import join
2 | from tqdm import tqdm
3 |
4 | import pydynet as pdn
5 | import pydynet.nn as nn
6 | import pydynet.nn.functional as F
7 | from pydynet.optim import Adam
8 | from pydynet.data import data_loader
9 |
10 | import numpy as np
11 |
12 | import numpy as np
13 | import matplotlib.pyplot as plt
14 | from sklearn.model_selection import train_test_split
15 |
# Seed NumPy's global random number generator.
np.random.seed(42)

# Directory that load_data() reads 'in_domain_train.tsv' from.
path = r'examples/data/CoLA/tokenized'
19 |
20 |
def extract(line: str):
    """Split one tab-separated record into ``(tokens, label)``.

    The label is the integer in the second field and the sentence is the last
    field. ``str.split()`` without arguments already discards the trailing
    newline, so a final line without a newline keeps its last character.
    """
    fields = line.split('\t')
    y = int(fields[1])
    return fields[-1].split(), y
26 |
27 |
def load_data():
    """Read the training file and encode every sentence as word ids.

    Returns
    -------
    X : numpy.ndarray of int, shape (number of lines, longest sentence)
        Word ids starting at 1; 0 marks unused trailing positions.
    y : numpy.ndarray
        Label of each line.
    """
    with open(join(path, 'in_domain_train.tsv'), 'r', encoding='utf-8') as f:
        lines = f.readlines()

    sens, ys = [], []
    max_len = -1
    vocab = set()
    for line in tqdm(lines):
        x, y = extract(line)
        vocab.update(x)
        max_len = max(max_len, len(x))
        sens.append(x)
        ys.append(y)

    # Sorting makes the word -> id mapping reproducible between runs, and the
    # dict gives O(1) lookups instead of a list scan per token.
    word_to_id = {word: idx for idx, word in enumerate(sorted(vocab), start=1)}

    X = np.zeros((len(lines), max_len), dtype=int)
    for i, sentence in enumerate(tqdm(sens)):
        for j, word in enumerate(sentence):
            X[i, j] = word_to_id[word]
    y = np.array(ys)

    return X, y
51 |
52 |
class SelfAttention(nn.Module):
    """Multi-head scaled dot-product attention with bias-free projections.

    Parameters
    ----------
    embed_size : int
        Model width; must be divisible by ``heads``.
    heads : int
        Number of attention heads.
    """

    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert (self.head_dim * heads == embed_size
                ), "Embedding size needs to be divisible by heads"

        self.Q = nn.Linear(self.embed_size,
                           self.embed_size,
                           bias=False,
                           dtype=np.float32)
        self.K = nn.Linear(self.embed_size,
                           self.embed_size,
                           bias=False,
                           dtype=np.float32)
        self.V = nn.Linear(self.embed_size,
                           self.embed_size,
                           bias=False,
                           dtype=np.float32)
        self.O = nn.Linear(self.embed_size,
                           self.embed_size,
                           bias=False,
                           dtype=np.float32)

    def forward(self, values, keys, query, mask):
        """Attend from ``query`` over ``keys``/``values``.

        ``mask`` may be ``None``; otherwise entries equal to 1 are replaced by
        ``-inf`` in place (the caller's tensor is modified) and the mask is
        added to the attention scores.
        """
        N = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[
            1], query.shape[1]

        # Each projection reads its own input (identical for self-attention);
        # previously all three read ``values`` and ignored ``keys``/``query``.
        xq = self.Q(query).reshape(N, query_len, self.heads, self.head_dim)
        xk = self.K(keys).reshape(N, key_len, self.heads, self.head_dim)
        xv = self.V(values).reshape(N, value_len, self.heads, self.head_dim)

        # Split the embedding into self.heads different pieces
        xq, xkT = xq.transpose(0, 2, 1, 3), xk.transpose(0, 2, 3, 1)
        attention = xq @ xkT / self.head_dim**.5

        if mask is not None:
            mask[mask.eq(1)] = np.float32('-inf')
            attention = attention + mask

        attention = F.softmax(attention, axis=-1)
        output = attention @ xv.transpose(0, 2, 1, 3)

        output = output.transpose(0, 2, 1, 3).reshape(N, query_len, -1)
        return self.O(output)
105 |
106 |
class TransformerBlock(nn.Module):
    """Post-norm encoder block: attention and a ReLU MLP, each with a residual.

    Parameters
    ----------
    embed_size : int
        Model width.
    heads : int
        Number of attention heads.
    dropout : float
        Accepted but not used by this block.
    forward_expansion : int
        The MLP hidden width is ``forward_expansion * embed_size``.
    """

    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super().__init__()
        hidden_size = forward_expansion * embed_size

        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size, dtype=np.float32)
        self.norm2 = nn.LayerNorm(embed_size, dtype=np.float32)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, hidden_size, dtype=np.float32),
            nn.ReLU(),
            nn.Linear(hidden_size, embed_size, dtype=np.float32),
        )

    def forward(self, value, key, query, mask):
        """Return the block output for the given inputs and mask."""
        attended = self.attention(value, key, query, mask)
        hidden = self.norm1(attended + query)
        expanded = self.feed_forward(hidden)
        return self.norm2(expanded + hidden)
131 |
132 |
def sinusoidal_positional_encoding(max_len: int, d_model: int):
    """Return the ``(max_len, d_model)`` sinusoidal position table.

    Even columns hold sines and odd columns hold cosines of
    ``position * 10000 ** (-2i / d_model)``. The result is a ``pdn.Tensor``
    built from a float32 array.
    """
    position = np.arange(max_len)[:, np.newaxis]
    div_term = np.exp(np.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
    angles = position * div_term

    pe = np.zeros((max_len, d_model))
    pe[:, 0::2] = np.sin(angles)
    # An odd d_model has one cosine column fewer than sine columns; trimming
    # is a no-op for even widths and avoids a shape mismatch for odd ones.
    pe[:, 1::2] = np.cos(angles[:, :d_model // 2])

    return pdn.Tensor(pe.astype(np.float32))
141 |
142 |
@pdn.no_grad()
def construct_mask(x: pdn.Tensor, padding_idx=0):
    """Return a float32 mask marking where ``x`` equals ``padding_idx``.

    Axes 1 and 2 are inserted as singleton axes, so a
    ``[batch_size, seq_len]`` input gives ``[batch_size, 1, 1, seq_len]``.
    """
    is_padding = x.eq(padding_idx)  # [batch_size, seq_len]
    expanded = pdn.unsqueeze(is_padding, (1, 2))  # [batch_size, 1, 1, seq_len]
    return expanded.astype(np.float32)
148 |
149 |
class Transformer(nn.Module):
    """Transformer encoder producing one score per input sequence.

    Parameters
    ----------
    embed_size : int
        Model width.
    num_layers : int
        Number of encoder blocks.
    heads : int
        Number of attention heads per block.
    forward_expansion : int
        Expansion factor of the feed-forward sub-layers.
    dropout : float
        Forwarded to every block.
    vocab_size : int
        Number of rows of the word embedding (id 0 is the padding id).
    max_length : int
        Number of positions in the fixed sinusoidal position table.
    """

    def __init__(
        self,
        embed_size,
        num_layers,
        heads,
        forward_expansion,
        dropout,
        vocab_size,
        max_length,
    ):
        super(Transformer, self).__init__()
        self.embed_size = embed_size
        self.word_embedding = nn.Embedding(
            vocab_size,
            embed_size,
            padding_idx=0,
            dtype=np.float32,
        )
        # Fixed (non-trainable) sinusoidal position table.
        self.position_embedding = nn.Parameter(
            sinusoidal_positional_encoding(max_length, embed_size), False)

        self.layers = nn.ModuleList([
            TransformerBlock(
                embed_size,
                heads,
                dropout=dropout,
                forward_expansion=forward_expansion,
            ) for _ in range(num_layers)
        ])

        self.fc_out = nn.Linear(embed_size, 1, dtype=np.float32)

    def forward(self, x, mask):
        """Return the score computed from the first position of each sequence."""
        a = self.word_embedding(x)
        # Use only as many position rows as the input has tokens, so inputs
        # shorter than ``max_length`` are accepted; a full-length input sees
        # the whole table exactly as before.
        out = a + self.position_embedding[:x.shape[1]]

        for layer in self.layers:
            out = layer(out, out, out, mask)

        out = out[:, 0, :]
        return self.fc_out(out)
193 |
194 |
if __name__ == "__main__":
    # Training script: fit the Transformer on the data returned by
    # load_data() and save the accuracy curves to imgs/transformer.png.
    LR = 5e-4
    EPOCHES = 100
    TRAIN_BATCH_SIZE = 128
    TEST_BATCH_SIZE = 512
    use_cuda = True

    device = 'cuda' if pdn.cuda.is_available() and use_cuda else 'cpu'

    X, y = load_data()
    y[y == 0] = -1  # labels become -1 / +1 for the logistic loss below

    train_X, test_X, train_y, test_y = train_test_split(
        pdn.Tensor(X),
        pdn.Tensor(y),
        train_size=0.8,
        stratify=y,
        shuffle=True,
    )

    # Fraction of positive labels, used to re-weight the two classes.
    ratio_pos = (train_y.mean() + 1) / 2

    train_loader = data_loader(
        train_X,
        train_y,
        shuffle=False,
        batch_size=TRAIN_BATCH_SIZE,
    )
    test_loader = data_loader(
        test_X,
        test_y,
        shuffle=False,
        batch_size=TEST_BATCH_SIZE,
    )

    # NOTE(review): the last argument (44) is the length of the position
    # table and presumably matches the longest sentence in the data; confirm.
    net = Transformer(512, 1, 4, 3, 0.05, X.max() + 1, 44).to(device)
    optimizer = Adam(net.parameters(), lr=LR)
    bar = tqdm(range(EPOCHES))
    info_list = []
    for epoch in bar:

        net.train()

        for batch_X, batch_y in train_loader:
            input_, label = batch_X.to(device), batch_y.to(device)
            output = net(input_, construct_mask(input_))
            # Per-sample weights: each class is weighted by the inverse of
            # its frequency in the training labels.
            # NOTE(review): ``weight`` is created without a device and is
            # indexed with ``label`` before ``.to(device)``; confirm this
            # works when ``device`` is 'cuda'.
            weight = pdn.ones(label.shape, dtype=np.float32)
            weight[label == -1] = 1 / (1 - ratio_pos)
            weight[label == 1] = 1 / ratio_pos
            # Weighted logistic loss log(1 + exp(-label * score)).
            loss = (weight.to(device) *
                    pdn.log(1 + pdn.exp(-label * pdn.squeeze(output)))).mean()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        net.eval()
        train_right, train_size = 0, 0
        test_right, test_size = 0, 0

        # Accuracy on both splits, computed without building a graph.
        with pdn.no_grad():
            for batch_X, batch_y in train_loader:
                input_, label = batch_X.to(device), batch_y.to(device)
                pred = pdn.sign(
                    pdn.squeeze(net(input_, construct_mask(input_))))
                train_right += (pred.data == label.data).sum()
                train_size += batch_X.shape[0]

            for batch_X, batch_y in test_loader:
                input_, label = batch_X.to(device), batch_y.to(device)
                pred = pdn.sign(
                    pdn.squeeze(net(input_, construct_mask(input_))))
                test_right += (pred.data == label.data).sum()
                test_size += batch_X.shape[0]

        train_acc, test_acc = train_right / train_size, test_right / test_size
        bar.set_postfix(
            Loss="{:.6f}".format(loss.item()),
            TEST_ACC="{:.4f}".format(test_acc),
            TRAIN_ACC="{:.4f}".format(train_acc),
        )
        info_list.append([train_acc.item(), test_acc.item()])

    info_list = np.array(info_list)

    plt.figure(figsize=(5, 3))

    plt.rcParams['font.sans-serif'] = ['Times New Roman']
    plt.rcParams['mathtext.fontset'] = 'stix'
    plt.rcParams['xtick.direction'] = 'in'
    plt.rcParams['ytick.direction'] = 'in'
    plt.rcParams['axes.linewidth'] = 0.5

    plt.grid(zorder=-10)
    plot_kwargs = {'linewidth': 0.7, 'zorder': 10}

    # Every 4th epoch is plotted.
    # NOTE(review): the x positions hard-code 100 epochs; they only match
    # ``info_list[::4]`` while EPOCHES == 100.
    x = np.arange(0, 100, 4) + 2
    plt.plot(x,
             info_list[::4, 0],
             label="Training accuracy",
             color='blue',
             marker='^',
             **plot_kwargs,
             linestyle='-')
    plt.plot(x,
             info_list[::4, 1],
             label="Test accuracy",
             color='red',
             marker='*',
             **plot_kwargs,
             linestyle='--')

    plt.xlim(0, 100)
    plt.ylim(.4, 1)

    plt.yticks([.4, .6, .8, 1], size=13)
    plt.xticks([20, 40, 60, 80, 100], size=13)
    plt.xlabel("Epochs", size=13)
    plt.legend()
    plt.tight_layout()
    plt.savefig("imgs/transformer.png")
315 |
--------------------------------------------------------------------------------
/llm/clip/infer.py:
--------------------------------------------------------------------------------
1 | import os, json, urllib, zipfile
2 | import urllib.request
3 | from PIL import Image
4 |
5 | import numpy as np
6 | import pydynet as pdn
7 | import pydynet.nn.functional as F
8 |
9 | from .tokenizer import SimpleTokenizer
10 | from .model import CLIP
11 |
12 |
def download(url: str, filename: str, chunk_size: int = 10**6) -> None:
    """Download ``url`` to ``filename``, printing progress after each chunk.

    The data is streamed to ``filename + ".part"`` and renamed on success, so
    ``filename`` only ever exists as a complete file and memory use stays at
    one chunk. Missing parent directories are created.

    Parameters
    ----------
    url : str
        Address passed to ``urllib.request.urlopen``.
    filename : str
        Destination path.
    chunk_size : int, default=10**6
        Number of bytes requested per read.
    """
    # Create directories if they don't exist yet
    directories = os.path.dirname(filename)
    if directories:
        os.makedirs(directories, exist_ok=True)

    partial = f"{filename}.part"
    try:
        with urllib.request.urlopen(url) as response:
            # The header may be absent (or zero); then only the byte count
            # can be reported instead of a percentage.
            length = response.info()["Content-Length"]
            total = int(length) if length is not None else 0

            received = 0
            with open(partial, "wb") as f:
                while True:
                    data = response.read(chunk_size)
                    if not data:
                        break
                    f.write(data)
                    received += len(data)
                    if total > 0:
                        print(f"Downloading {filename} "
                              f"{received / total * 100:.2f} %")
                    else:
                        print(f"Downloading {filename} {received} bytes")
    except BaseException:
        # Do not leave a truncated file behind; the error is re-raised.
        if os.path.exists(partial):
            os.remove(partial)
        raise

    os.replace(partial, filename)
34 |
35 |
def load_zip(path: str):
    """Return ``{member name: raw bytes}`` for every entry of the zip at ``path``."""
    with zipfile.ZipFile(path) as archive:
        return {
            member.filename: archive.read(member)
            for member in archive.infolist()
        }
46 |
47 |
class Params:
    """Read-only access to the tensors of a downloaded CLIP checkpoint.

    The checkpoint archive is loaded into ``self.files`` and the JSON index
    found next to it into ``self.info``; indexing the object by tensor name
    returns that tensor as a float32 NumPy array.
    """

    def __init__(self, name: str, download_root: str = None) -> None:
        """Load model ``name``, downloading the checkpoint if it is missing.

        The ``CLIP_DIR`` environment variable, when set, takes precedence
        over ``download_root``; without either, ``~/.cache/clip`` is used.
        """
        assert name == "ViT-B/32", f"Model {name} not supported yet. Only ViT-B-32 currently supported."

        url_prefix = "https://openaipublic.azureedge.net/clip/models/"
        model_urls = {
            "RN50":
            url_prefix +
            "afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt",
            "RN101":
            url_prefix +
            "8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt",
            "RN50x4":
            url_prefix +
            "7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt",
            "RN50x16":
            url_prefix +
            "52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt",
            "RN50x64":
            url_prefix +
            "be1cfb55d75a9666199fb2206c106743da0f6468c9d327f3e0d0a543a9919d9c/RN50x64.pt",
            "ViT-B/32":
            url_prefix +
            "40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
            "ViT-B/16":
            url_prefix +
            "5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
            "ViT-L/14":
            url_prefix +
            "b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt",
            "ViT-L/14@336px":
            url_prefix +
            "3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt",
        }
        model_url = model_urls[name]

        # "/" cannot appear in a file name.
        name = name.replace("/", "-")

        fallback_root = download_root
        if fallback_root is None:
            fallback_root = os.path.expanduser("~/.cache/clip")
        download_root = os.environ.get("CLIP_DIR", fallback_root)

        model_path = os.path.join(download_root, f"{name}.pt")
        if not os.path.isfile(model_path):
            print(f"Downloading {model_path} from {model_url}")
            download(model_url, model_path)

        self.files = load_zip(model_path)

        # The JSON index is expected to already exist beside the checkpoint.
        with open(f"{download_root}/{name}.json") as f:
            self.info = json.load(f)

    def get_int(self, name: str) -> int:
        """Return the ``"value"`` entry stored for ``name`` in the index."""
        value: int = self.info[name]["value"]
        return value

    def __getitem__(self, name: str):
        """Return tensor ``name`` as a float32 array of its recorded shape."""
        meta = self.info[name]

        path = meta["path"]
        dtype = meta["dtype"]
        shape = meta["shape"]
        start = meta["start"]
        end = meta["end"]

        assert dtype in ["float16", "float32"]

        # The tensor occupies bytes [start, end) of the archive member.
        raw = self.files[path][start:end]
        return np.frombuffer(raw, dtype=dtype).reshape(shape).astype(
            np.float32)
117 |
118 |
def tokenize(texts: list[str], context_length: int = 77):
    """Encode ``texts`` into a zero-padded int64 array of token ids.

    Each text is wrapped in the start-of-text and end-of-text tokens. The
    result has shape ``(len(texts), context_length)``; positions after the
    end-of-text token stay 0.

    Raises
    ------
    RuntimeError
        If an encoded text, including the two marker tokens, is longer than
        ``context_length``.
    """
    tokenizer = SimpleTokenizer()
    start = tokenizer.encoder["<|startoftext|>"]
    stop = tokenizer.encoder["<|endoftext|>"]

    # Encode everything first; lengths are checked afterwards
    encoded = [[start] + tokenizer.encode(text) + [stop] for text in texts]

    result = np.zeros((len(encoded), context_length), dtype=np.int64)
    for i, (row, tokens) in enumerate(zip(result, encoded)):
        n_tokens = len(tokens)
        if n_tokens > context_length:
            raise RuntimeError(
                f"Input {texts[i]} is too long for context length {context_length}"
            )
        # ``row`` is a view, so this writes into ``result``
        row[:n_tokens] = tokens

    return result
139 |
140 |
def preprocess(image: Image.Image, image_size: int = 224):
    """Turn a PIL image into a normalized channel-first tensor.

    Steps: bicubic resize so the shorter side is about ``image_size``, center
    crop to ``image_size`` x ``image_size``, convert to RGB, scale to [0, 1],
    normalize per channel, and move channels first. The result is a
    ``pdn.Tensor`` of shape ``(3, image_size, image_size)``.
    """
    # Scale so that the smaller side becomes image_size
    width, height = image.size
    scale = image_size / min(width, height)
    width, height = int(scale * width), int(scale * height)

    # Some Pillow versions keep the filter constants on Image.Resampling,
    # others directly on Image
    filters = getattr(Image, "Resampling", Image)
    image = image.resize((width, height), filters.BICUBIC)

    # Crop center
    left = round((width - image_size) / 2)
    top = round((height - image_size) / 2)
    box = (left, top, left + image_size, top + image_size)
    image = image.crop(box).convert("RGB")

    # Normalize
    pixels = np.array(image, dtype=np.float32) / 255.0
    mean = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32)
    std = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32)
    pixels = (pixels - mean) / std

    # (H, W, C) -> (C, H, W)
    return pdn.Tensor(pixels.transpose(2, 0, 1), copy=None)
171 |
172 |
@pdn.no_grad()
def load_model(model: CLIP, param: Params):
    """Copy the checkpoint arrays exposed by ``param`` into ``model``.

    Fills the logit scale, the class and positional embeddings, the patch
    convolution kernel, the pre/post layer norms, both projections, the token
    embedding, and 12 transformer blocks in each of the image and text
    encoders. Returns ``model``.
    """
    model.scale = pdn.exp(param["logit_scale"].astype(np.float32))
    model.class_embed.data[0, 0] = param["visual.class_embedding"]
    model.v_pos_emb.data[...] = param["visual.positional_embedding"]
    model.t_pos_emb.data[...] = param["positional_embedding"]

    vision = model.image_encoder
    vision.kernel.data[...] = param["visual.conv1.weight"]
    vision.pre_norm.scale[...] = param["visual.ln_pre.weight"]
    vision.pre_norm.shift[...] = param["visual.ln_pre.bias"]
    vision.post_norm.scale[...] = param["visual.ln_post.weight"]
    vision.post_norm.shift[...] = param["visual.ln_post.bias"]
    vision.proj.weight[...] = param["visual.proj"]

    language = model.text_encoder
    language.token_embed.weight[...] = param["token_embedding.weight"]
    language.post_norm.scale[...] = param["ln_final.weight"]
    language.post_norm.shift[...] = param["ln_final.bias"]
    language.proj.weight[...] = param["text_projection"]

    def destinations(block):
        # Parameters of one transformer block, in the order ``sources`` yields
        return (
            block.mha.QKV.weight,
            block.mha.QKV.bias,
            block.mha.O.weight,
            block.mha.O.bias,
            block.layer_norm1.scale,
            block.layer_norm1.shift,
            block.layer_norm2.scale,
            block.layer_norm2.shift,
            block.mlp.fc1.weight,
            block.mlp.fc1.bias,
            block.mlp.fc2.weight,
            block.mlp.fc2.bias,
        )

    def sources(key):
        # Checkpoint arrays of one block; the four weight matrices are
        # transposed relative to how the checkpoint stores them
        return (
            param[key + "attn.in_proj_weight"].T,
            param[key + "attn.in_proj_bias"],
            param[key + "attn.out_proj.weight"].T,
            param[key + "attn.out_proj.bias"],
            param[key + "ln_1.weight"],
            param[key + "ln_1.bias"],
            param[key + "ln_2.weight"],
            param[key + "ln_2.bias"],
            param[key + "mlp.c_fc.weight"].T,
            param[key + "mlp.c_fc.bias"],
            param[key + "mlp.c_proj.weight"].T,
            param[key + "mlp.c_proj.bias"],
        )

    prefix = "transformer.resblocks."
    for i in range(12):
        # Read every array of layer i before writing any of them
        arrays = sources(f"visual.{prefix}{i}.") + sources(f"{prefix}{i}.")
        slots = (destinations(vision.transformers[i]) +
                 destinations(language.transformers[i]))
        for slot, array in zip(slots, arrays):
            slot.data[...] = array
    return model
250 |
251 |
# Demo executed at import time: score one picture against three captions.
# The paths are relative, so this resolves against the current working directory.
# The indexing adds a leading axis of length 1 in front of the preprocessed image.
image = preprocess(Image.open("llm/clip/picture.png"))[np.newaxis, :, :, :]
text = tokenize(["a fish", "a dog", "a cat"])
# Params downloads ViT-B-32.pt into llm/clip/data when it is missing and then
# reads ViT-B-32.json from the same directory.
clip = load_model(CLIP(), Params("ViT-B/32", download_root='llm/clip/data'))

with pdn.no_grad():
    logits_per_image = clip(image, text)
    # Softmax over the last axis, then print the first row of the result
    probs = F.softmax(logits_per_image, axis=-1)
    print("Label probs:", probs.numpy()[0])
260 |
--------------------------------------------------------------------------------
/pydynet/nn/functional.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from ..core import tensor, function
4 | from .. import unsqueeze, no_grad
5 |
6 |
def linear(x: tensor.Tensor, weight: tensor.Tensor, bias: tensor.Tensor):
    '''Affine map ``x @ weight``, plus ``bias`` unless ``bias`` is None.'''
    projected = x @ weight
    if bias is None:
        return projected
    return projected + bias
12 |
13 |
def embedding(x: tensor.Tensor, weight: tensor.Tensor, padding_idx: int):
    '''Gather rows of ``weight`` at the indices in ``x``.

    When ``padding_idx`` is not None, the vectors looked up for positions
    equal to ``padding_idx`` are multiplied by zero.
    '''
    looked_up = weight[x]
    if padding_idx is None:
        return looked_up

    # The mask is built without gradient tracking; the multiplication is tracked
    with tensor.no_grad():
        keep = unsqueeze(x.ne(padding_idx), -1)
    return looked_up * keep
21 |
22 |
def sigmoid(x: tensor.Tensor):
    '''Apply ``tensor.sigmoid`` to ``x``.'''
    activated = tensor.sigmoid(x)
    return activated
25 |
26 |
def tanh(x: tensor.Tensor):
    '''Apply ``tensor.tanh`` to ``x``.'''
    activated = tensor.tanh(x)
    return activated
29 |
30 |
def relu(x: tensor.Tensor):
    '''Rectified linear unit: element-wise maximum of 0 and ``x``.'''
    floor = 0.
    return tensor.maximum(floor, x)
33 |
34 |
def leaky_relu(x: tensor.Tensor, alpha: float):
    '''Element-wise maximum of ``x`` and ``alpha * x``.

    This equals the usual leaky ReLU only for ``0 <= alpha <= 1``.
    '''
    scaled = alpha * x
    return tensor.maximum(x, scaled)
37 |
38 |
def silu(x: tensor.Tensor):
    '''SiLU activation, computed as ``x / (1 + exp(-x))``.'''
    denominator = 1 + tensor.exp(-x)
    return x / denominator
41 |
42 |
def softmax(x: tensor.Tensor, axis=None):
    '''Softmax of ``x`` along ``axis`` (over all elements when ``axis`` is None).'''
    # The shift is computed without gradient tracking; it is subtracted
    # before exponentiating so the largest exponent is 0
    with no_grad():
        peak = x.max(axis, keepdims=True)
    weights = tensor.exp(x - peak)
    normalizer = tensor.sum(weights, axis=axis, keepdims=True)
    return weights / normalizer
50 |
51 |
def log_softmax(x: tensor.Tensor, axis=None, keepdims=False):
    '''Log-softmax of ``x`` along ``axis`` (all elements when ``axis`` is None).

    Parameters
    ----------
    x : Tensor
        Input values;
    axis : int or None, default=None
        Axis that is normalized;
    keepdims : bool, default=False
        Accepted for backward compatibility and otherwise ignored: the result
        always has the shape of ``x``.
    '''
    with no_grad():
        max_ = x.max(axis, keepdims=True)
    x_sub_max = x - max_
    # The reduced axis has to be kept. Dropping it made the subtraction below
    # broadcast the per-slice normalizer along the wrong axis (or fail on
    # mismatching sizes) whenever an explicit axis was used with keepdims=False.
    log_sum_exp = tensor.log(
        tensor.sum(tensor.exp(x_sub_max), axis=axis, keepdims=True))
    return x_sub_max - log_sum_exp
59 |
60 |
class __im2col1d(tensor._UnaryOperator):
    '''Sliding-window (im2col) expansion of a 3-D input.

    For an input of shape (N, in_channels, n_features) the result has shape
    (N, in_channels, n_output, kernel_size) with
    ``out[n, c, o, k] = x[n, c, o * stride + k]``.

    The kernel axis is last because every consumer in this module needs it
    there: ``conv1d`` matrix-multiplies the result with a
    (in_channels, kernel_size, out_channels) kernel, and the 1-D pooling
    functions reduce over axis -1.
    '''

    def __init__(
        self,
        x: tensor.Tensor,
        kernel_size: int,
        stride: int,
    ) -> None:
        self.N, self.in_channels, self.n_features = x.shape
        self.kernel_size = kernel_size
        self.stride = stride
        # Number of window positions that fit into the input
        self.n_output = (self.n_features - self.kernel_size) // stride + 1
        # Set the attributes above first; presumably the base constructor
        # already runs forward_ -- TODO confirm in _UnaryOperator
        super().__init__(x)

    def forward_(self, x: tensor.Tensor) -> np.ndarray:
        s0, s1, s2 = x.strides
        shape = (x.shape[0], self.in_channels, self.n_output, self.kernel_size)
        # Moving to the next window skips ``stride`` elements; moving inside
        # a window skips one element
        self.__strides = (s0, s1, s2 * self.stride, s2)

        # copy() turns the aliasing view into an independent array
        col = self.xp.lib.stride_tricks.as_strided(
            x.data,
            shape=shape,
            strides=self.__strides,
        ).copy()
        return col

    def grad_fn(self, x: tensor.Tensor, grad: np.ndarray) -> np.ndarray:
        grad_x = self.xp.zeros(x.shape, dtype=self.dtype)
        # Same windowed view over the gradient buffer. NOTE(review): this
        # reuses strides taken from x, so it assumes grad_x has the same
        # memory layout and item size as x.
        view = self.xp.lib.stride_tricks.as_strided(
            grad_x,
            shape=self.shape,
            strides=self.__strides,
        )
        # Overlapping windows alias the same input element; add.at
        # accumulates every contribution instead of overwriting
        self.xp.add.at(view, (..., ), grad)
        return grad_x
96 |
97 |
class __pad1d(tensor._UnaryOperator):
    '''Pad the last axis of a 3-D input with ``pad_width`` constant entries on both sides.'''

    def __init__(self, x: tensor.Tensor, pad_width=0) -> None:
        self.pad_width = pad_width
        super().__init__(x)

    def forward_(self, x: tensor.Tensor) -> np.ndarray:
        p = self.pad_width
        # The first two axes are left untouched
        widths = [(0, 0), (0, 0), (p, p)]
        return self.xp.pad(x.data, widths, 'constant')

    def grad_fn(self, x: tensor.Tensor, grad: np.ndarray) -> np.ndarray:
        p = self.pad_width
        # A slice ending at -0 would be empty, hence the separate zero case
        return grad[...] if p == 0 else grad[..., p:-p]
113 |
114 |
def conv1d(
    x: tensor.Tensor,
    kernel: tensor.Tensor,
    padding: int = 0,
    stride: int = 1,
):
    '''One-dimensional convolution.

    Implemented with im2col: the padded input is expanded into sliding
    windows, matrix-multiplied with the kernel and summed over axis 1.

    Parameters
    ----------
    x : Tensor
        Input of shape (N, in_channels, n_features);
    kernel : Tensor
        Kernel of shape (out_channels, in_channels, kernel_size);
    padding : int, default=0
        Number of entries padded on each side of the feature axis;
    stride : int, default=1
        Convolution stride.
    '''
    kernel_size = kernel.shape[-1]
    windows = __im2col1d(__pad1d(x, padding), kernel_size, stride)
    # (out, in, k) -> (in, k, out) so the matmul contracts the kernel axis
    per_channel = windows @ kernel.transpose(1, 2, 0)
    return per_channel.sum(1).swapaxes(1, 2)
140 |
141 |
def max_pool1d(
    x: tensor.Tensor,
    kernel_size: int,
    stride: int,
    padding: int = 0,
):
    '''One-dimensional max pooling.

    Implemented with im2col: the maximum is taken over the last axis of the
    sliding-window expansion of the padded input.

    Parameters
    ----------
    x : Tensor
        Input of shape (N, in_channels, n_features);
    kernel_size : int
        Pooling window size;
    stride : int
        Pooling stride;
    padding : int, default=0
        Number of entries padded on each side of the feature axis.
    '''
    padded = __pad1d(x, padding)
    windows = __im2col1d(padded, kernel_size, stride)
    return windows.max(-1)
166 |
167 |
def avg_pool1d(
    x: tensor.Tensor,
    kernel_size: int,
    stride: int,
    padding: int = 0,
):
    '''One-dimensional average pooling.

    Implemented with im2col: the mean is taken over the last axis of the
    sliding-window expansion of the padded input, so padded entries count
    towards the average.

    Parameters
    ----------
    x : Tensor
        Input of shape (N, in_channels, n_features);
    kernel_size : int
        Pooling window size;
    stride : int
        Pooling stride;
    padding : int, default=0
        Number of entries padded on each side of the feature axis.
    '''
    padded = __pad1d(x, padding)
    windows = __im2col1d(padded, kernel_size, stride)
    return windows.mean(-1)
192 |
193 |
class __im2col2d(tensor._UnaryOperator):
    '''Sliding-window (im2col) expansion of a 4-D input.

    For an input of shape (N, in_channels, n_h, n_w) the forward result has
    shape (N, in_channels, kernel_size, kernel_size, out_h, out_w), where
    entry [n, c, i, j, p, q] is read from input position
    [n, c, i + p * stride, j + q * stride].
    '''

    def __init__(
        self,
        x: tensor.Tensor,
        kernel_size: int,
        stride: int,
    ) -> None:
        _, self.in_channels, self.n_h, self.n_w = x.shape
        self.kernel_size = kernel_size
        self.stride = stride
        # Number of window positions that fit along height and width
        self.out_h, self.out_w = (
            self.n_h - self.kernel_size) // self.stride + 1, (
                self.n_w - self.kernel_size) // self.stride + 1

        # Called last; presumably the base constructor runs forward_, which
        # needs the attributes above -- TODO confirm in _UnaryOperator
        super().__init__(x)

    def forward_(self, x: tensor.Tensor) -> np.ndarray:
        s0, s1, s2, s3 = x.strides
        shape = (x.shape[0], self.in_channels, self.kernel_size,
                 self.kernel_size, self.out_h, self.out_w)
        # Axes 2/3 step inside a window, axes 4/5 step from window to window
        self.__strides = (s0, s1, s2, s3, s2 * self.stride, s3 * self.stride)

        # copy() turns the aliasing strided view into an independent array
        col = self.xp.lib.stride_tricks.as_strided(
            x.data,
            shape=shape,
            strides=self.__strides,
        ).copy()
        return col

    def grad_fn(self, x: tensor.Tensor, grad: np.ndarray) -> np.ndarray:
        # self.dtype and self.shape are not set in this class; presumably the
        # base class provides them for the forward output -- verify
        grad_x = self.xp.zeros(x.shape, dtype=self.dtype)
        # Same windowed view as in forward_, now over the gradient buffer.
        # NOTE(review): reuses strides taken from x, so this assumes grad_x
        # has the same memory layout and item size as x.
        view = self.xp.lib.stride_tricks.as_strided(
            grad_x,
            shape=self.shape,
            strides=self.__strides,
        )
        # Windows can overlap, so several view entries alias one element of
        # grad_x; add.at accumulates all of them
        self.xp.add.at(view, (..., ), grad)
        return grad_x
233 |
234 |
class __pad2d(tensor._UnaryOperator):
    '''Pad the last two axes of a 4-D input with ``pad_width`` constant entries on every side.'''

    def __init__(self, x: tensor.Tensor, pad_width=0) -> None:
        self.pad_width = pad_width
        super().__init__(x)

    def forward_(self, x: tensor.Tensor) -> np.ndarray:
        p = self.pad_width
        # The first two axes are left untouched
        widths = [(0, 0), (0, 0), (p, p), (p, p)]
        return self.xp.pad(x.data, widths, 'constant')

    def grad_fn(self, x: tensor.Tensor, grad: np.ndarray) -> np.ndarray:
        p = self.pad_width
        # A slice ending at -0 would be empty, hence the separate zero case
        return grad[...] if p == 0 else grad[..., p:-p, p:-p]
252 |
253 |
def conv2d(x: tensor.Tensor,
           kernel: tensor.Tensor,
           padding: int = 0,
           stride: int = 1):
    '''Two-dimensional convolution.

    Implemented with im2col. For simplicity the kernel, the stride and the
    padding are the same along height and width.

    Parameters
    ----------
    x : Tensor
        Input of shape (N, in_channels, n_height, n_width);
    kernel : Tensor
        Kernel of shape (out_channels, in_channels, kernel_height, kernel_width);
    padding : int, default=0
        Number of entries padded around the image;
    stride : int, default=1
        Convolution stride.
    '''
    batch, _, _, _ = x.shape
    out_channels, _, kernel_size, _ = kernel.shape

    windows = __im2col2d(__pad2d(x, padding), kernel_size, stride)
    out_h, out_w = windows.shape[-2:]

    # One row per output pixel, one column per (channel, ky, kx) tap
    patches = windows.transpose(0, 4, 5, 1, 2, 3).reshape(
        batch * out_h * out_w, -1)
    # One column per output channel
    weights = kernel.reshape(out_channels, -1).T

    flat = patches @ weights
    # Back to channel-first layout
    return flat.reshape(batch, out_h, out_w, -1).transpose(0, 3, 1, 2)
282 |
283 |
def max_pool2d(x: tensor.Tensor, kernel_size: int, stride: int, padding=0):
    '''Two-dimensional max pooling.

    Implemented with im2col. For simplicity the window, the stride and the
    padding are the same along height and width.

    Parameters
    ----------
    x : Tensor
        Input of shape (N, in_channels, n_height, n_width);
    kernel_size : int
        Pooling window size;
    stride : int
        Pooling stride;
    padding : int, default=0
        Number of entries padded around the image;
    '''
    batch, channels, _, _ = x.shape
    windows = __im2col2d(__pad2d(x, padding), kernel_size, stride)
    out_h, out_w = windows.shape[-2:]

    # One row per (sample, output pixel, channel) holding its whole window
    flat_windows = windows.transpose(0, 4, 5, 1, 2, 3).reshape(
        -1,
        kernel_size * kernel_size,
    )
    pooled = flat_windows.max(1)
    # Back to channel-first layout
    return pooled.reshape(batch, out_h, out_w, channels).transpose(0, 3, 1, 2)
311 |
312 |
def avg_pool2d(x: tensor.Tensor, kernel_size: int, stride: int, padding=0):
    '''Two-dimensional average pooling.

    Implemented with im2col. For simplicity the window, the stride and the
    padding are the same along height and width.

    Parameters
    ----------
    x : Tensor
        Input of shape (N, in_channels, n_height, n_width);
    kernel_size : int
        Pooling window size;
    stride : int
        Pooling stride;
    padding : int, default=0
        Number of entries padded around the image;
    '''
    batch, channels, _, _ = x.shape
    windows = __im2col2d(__pad2d(x, padding), kernel_size, stride)
    out_h, out_w = windows.shape[-2:]

    # One row per (sample, output pixel, channel) holding its whole window
    flat_windows = windows.transpose(0, 4, 5, 1, 2, 3).reshape(
        -1,
        kernel_size * kernel_size,
    )
    pooled = flat_windows.mean(1)
    # Back to channel-first layout
    return pooled.reshape(batch, out_h, out_w, channels).transpose(0, 3, 1, 2)
340 |
341 |
def mse_loss(y_pred, y_true, reduction='mean'):
    '''Squared error between ``y_pred`` and ``y_true``, averaged or summed.

    Raises ValueError when ``reduction`` is neither 'mean' nor 'sum'.
    '''
    squared_error = function.square(y_pred - y_true)
    if reduction not in ('mean', 'sum'):
        raise ValueError("reduction must be mean or sum.")
    reducer = tensor.mean if reduction == 'mean' else tensor.sum
    return reducer(squared_error)
351 |
352 |
def nll_loss(y_pred, y_true, reduction='mean'):
    '''Negative log-likelihood: ``-y_pred * y_true``, averaged or summed.

    The product is element-wise, so 'mean' averages over every element of
    the product. Raises ValueError when ``reduction`` is neither 'mean' nor
    'sum'.
    '''
    weighted = -y_pred * y_true
    if reduction not in ('mean', 'sum'):
        raise ValueError("reduction must be mean or sum.")
    reducer = tensor.mean if reduction == 'mean' else tensor.sum
    return reducer(weighted)
362 |
363 |
def cross_entropy_loss(y_pred, y_true, reduction='mean'):
    '''Cross-entropy between the logits ``y_pred`` and the labels ``y_true``.

    Parameters
    ----------
    y_pred : Tensor
        Logits; the softmax is taken along axis 1;
    y_true : Tensor
        Either 1-D class indices (one per row of ``y_pred``) or an array that
        is multiplied element-wise with the negative log-probabilities,
        e.g. one-hot labels;
    reduction : {'mean', 'sum'}, default='mean'
        How the per-element losses are combined. With element-wise labels
        'mean' averages over every element of the product, not per sample.

    Raises
    ------
    ValueError
        If ``reduction`` is neither 'mean' nor 'sum'.
    '''
    # Shift every row by its own maximum. Shifting by the global maximum is
    # mathematically equivalent, but a row lying far below it underflows
    # entirely in exp and ends in log(0). The shift is a constant for
    # autograd, exactly as in softmax.
    with no_grad():
        row_max = y_pred.max(1, keepdims=True)
    update_y_pred = y_pred - row_max
    log_sum_exp = tensor.log(
        tensor.sum(tensor.exp(update_y_pred), 1, keepdims=True))

    neg_log_sm = log_sum_exp - update_y_pred
    if y_true.ndim == 1:
        # Pick, in every row, the entry of the labelled class
        nll = neg_log_sm[range(len(neg_log_sm)), y_true]
    else:
        nll = neg_log_sm * y_true

    if reduction == 'mean':
        return tensor.mean(nll)
    elif reduction == 'sum':
        return tensor.sum(nll)
    else:
        raise ValueError("reduction must be mean or sum.")
382 |
--------------------------------------------------------------------------------
/pydynet/nn/modules/rnn.py:
--------------------------------------------------------------------------------
1 | from .module import Module
2 | from .. import init
3 | from .. import functional as F
4 | from ..parameter import Parameter
5 | from ...special import empty, zeros
6 | from ... import core
7 | from ...cuda import Device
8 |
9 | from typing import Literal, Optional, Tuple, List
10 | import math
11 |
12 |
class RNNCell(Module):
    """One recurrent step: ``fn(x @ Wx + h @ Wh [+ bias])``.

    ``fn`` is tanh or ReLU, selected by ``nonlinearity``. ``x`` is either a
    single 1-D sample or a 2-D batch with one sample per row.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        bias: bool = True,
        nonlinearity: Literal['tanh', 'relu'] = 'tanh',
        device=None,
        dtype=None,
    ) -> None:
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        # Forwarded to every tensor this cell creates
        self.kwargs = {"device": Device(device), "dtype": dtype}
        self.nonlinearity = nonlinearity
        activations = {'tanh': F.tanh, 'relu': F.relu}
        self.fn = activations[nonlinearity]

        self.Wx = Parameter(empty((input_size, hidden_size), **self.kwargs))
        self.Wh = Parameter(empty((hidden_size, hidden_size), **self.kwargs))
        if bias:
            self.bias = Parameter(empty(self.hidden_size, **self.kwargs))
        self.has_bias = bias
        self.reset_paramters()

    def forward(self, x, h=None):
        """Return the next hidden state; ``h`` defaults to zeros."""
        if h is None:
            h = self.init_hidden(x)
        else:
            if x.ndim == 1:
                shape_ok = h.shape == (self.hidden_size, )
            elif x.ndim == 2:
                shape_ok = h.shape == (x.shape[0], self.hidden_size)
            else:
                shape_ok = False
            assert shape_ok, "Wrong hidden state input!"

        pre_activation = x @ self.Wx + h @ self.Wh
        if self.has_bias:
            pre_activation = pre_activation + self.bias
        return self.fn(pre_activation)

    def reset_paramters(self):
        """Re-draw all parameters uniformly from ``±sqrt(1 / hidden_size)``."""
        limit = math.sqrt(1 / self.hidden_size)
        targets = [self.Wx, self.Wh]
        if self.has_bias:
            targets.append(self.bias)
        for target in targets:
            init.uniform_(target, -limit, limit)

    def init_hidden(self, x):
        """Zero hidden state matching a 1-D sample or a 2-D batch ``x``."""
        assert x.ndim in {1, 2}
        if x.ndim == 1:
            shape = self.hidden_size
        else:
            shape = (x.shape[0], self.hidden_size)
        return zeros(shape, **self.kwargs)

    def __repr__(self) -> str:
        template = "{}({}, {}, bias={}, nonlinearity={})"
        return template.format(
            self.__class__.__name__,
            self.input_size,
            self.hidden_size,
            self.has_bias,
            self.nonlinearity,
        )

    def move(self, device):
        # Keep the factory kwargs in sync so later zero states use the new device
        self.kwargs.update(device=device)
        return super().move(device)
78 |
79 |
class RNN(Module):
    """Stack of ``RNNCell`` layers unrolled over a sequence.

    The input is time-major, (seq_len, input_size) or
    (seq_len, batch, input_size), unless ``batch_first`` is set and the input
    is 3-D. With ``bidirectional`` a second, independent stack processes the
    time-reversed sequence and both outputs are concatenated on the last axis.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        num_layers: int = 1,
        nonlinearity: Literal['tanh', 'relu'] = 'tanh',
        bias: bool = True,
        batch_first: bool = False,
        bidirectional: bool = False,
        device=None,
        dtype=None,
    ) -> None:
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.nonlinearity = nonlinearity
        self.has_bias = bias
        self.batch_first = batch_first
        self.bidirectional = bidirectional
        self.kwargs = {"device": Device(device), "dtype": dtype}

        assert num_layers > 0
        # The first layer reads the input, every later layer reads hidden states
        layer_inputs = [input_size] + [hidden_size] * (num_layers - 1)

        def new_cell(in_features):
            return RNNCell(
                in_features,
                hidden_size,
                bias,
                nonlinearity,
                **self.kwargs,
            )

        self.RNNCells: List[RNNCell] = []
        for layer in range(num_layers):
            forward_cell = new_cell(layer_inputs[layer])
            # Each cell is also set as an attribute of this module
            setattr(self, 'rnn_{}'.format(layer), forward_cell)
            self.RNNCells.append(forward_cell)

        if self.bidirectional:
            self.rRNNCells: List[RNNCell] = []
            for layer in range(num_layers):
                backward_cell = new_cell(layer_inputs[layer])
                setattr(self, 'rrnn_{}'.format(layer), backward_cell)
                self.rRNNCells.append(backward_cell)

    def forward(self, x, h=None):
        """Run the stack over ``x`` and return ``(output, hn)``.

        ``h`` holds one initial state per layer and direction, forward layers
        first, and defaults to zeros. ``output`` collects the last layer's
        hidden states for every time step; ``hn`` collects the final state of
        every layer.
        """
        if self.batch_first and x.ndim == 3:
            x = x.swapaxes(0, 1)

        if h is None:
            h = self.init_hidden(x)
        else:
            n_states = (2 if self.bidirectional else 1) * self.num_layers
            if x.ndim == 2:
                shape_ok = h.shape == (n_states, self.hidden_size)
            elif x.ndim == 3:
                shape_ok = h.shape == (n_states, x.shape[1], self.hidden_size)
            else:
                shape_ok = False
            assert shape_ok, "Wrong hidden state input!"

        finals, reverse_finals = [], []
        for i in range(self.num_layers):
            # Layer i consumes the full output sequence of layer i - 1
            h_list = self.cell_forward(
                self.RNNCells[i],
                x if i == 0 else core.concat(h_list),
                h[i],
            )
            finals.append(h_list[-1])
            if self.bidirectional:
                # The reverse stack works on reversed time throughout; its
                # initial states follow the forward ones in ``h``
                hr_list = self.cell_forward(
                    self.rRNNCells[i],
                    x[::-1] if i == 0 else core.concat(hr_list),
                    h[i + self.num_layers],
                )
                reverse_finals.append(hr_list[-1])

        if self.bidirectional:
            # Put the reverse outputs back into forward time order
            output = core.concat(
                [core.concat(h_list),
                 core.concat(hr_list[::-1])],
                axis=-1,
            )
            hn = core.concat(finals + reverse_finals)
        else:
            output = core.concat(h_list)
            # A single state already has its leading axis
            hn = finals[0] if self.num_layers == 1 else core.concat(finals)

        if self.batch_first and x.ndim == 3:
            output = output.swapaxes(0, 1)
            hn = hn.swapaxes(0, 1)
        return output, hn

    def reset_parameters(self):
        """Re-initialize every cell, forward stack first."""
        for cell in self.RNNCells[:self.num_layers]:
            cell.reset_paramters()
        if self.bidirectional:
            for cell in self.rRNNCells[:self.num_layers]:
                cell.reset_paramters()

    def init_hidden(self, x):
        """Zero initial states for a time-major 2-D or 3-D input ``x``."""
        assert x.ndim in {2, 3}
        n_states = (2 if self.bidirectional else 1) * self.num_layers
        if x.ndim == 2:
            shape = (n_states, self.hidden_size)
        else:
            shape = (n_states, x.shape[1], self.hidden_size)
        return zeros(shape, **self.kwargs)

    def cell_forward(self, cell: RNNCell, x, h):
        """Unroll ``cell`` over axis 0 of ``x``; return the list of states.

        Every state gets a leading axis of length 1 so the list can be
        concatenated along time.
        """
        states = []
        for t in range(x.shape[0]):
            h = cell(x[t], h)
            states.append(core.unsqueeze(h, axis=0))
        return states

    def __repr__(self) -> str:
        template = "{}({}, {}, num_layers={}, nonlinearity={}, bias={}, batch_first={}, bidirectional={})"
        return template.format(
            self.__class__.__name__,
            self.input_size,
            self.hidden_size,
            self.num_layers,
            self.nonlinearity,
            self.has_bias,
            self.batch_first,
            self.bidirectional,
        )

    def move(self, device):
        # Keep the factory kwargs in sync so later zero states use the new device
        self.kwargs.update(device=device)
        return super().move(device)
242 |
243 |
class LSTMCell(Module):
    """One LSTM step.

    The four gate pre-activations come from one fused product
    ``x @ Wx + h @ Wh [+ bias]`` of width ``4 * hidden_size``: the first three
    quarters are the sigmoid gates (forget, input, output), the last quarter
    is the tanh candidate.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        bias: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        # Forwarded to every tensor this cell creates
        self.kwargs = {"device": Device(device), "dtype": dtype}

        gate_width = 4 * hidden_size
        self.Wx = Parameter(empty((input_size, gate_width), **self.kwargs))
        self.Wh = Parameter(empty((hidden_size, gate_width), **self.kwargs))
        if bias:
            self.bias = Parameter(empty(4 * self.hidden_size, **self.kwargs))
        self.has_bias = bias
        self.reset_paramters()

    def forward(self, x, hx: Optional[Tuple] = None):
        """Return the next ``(h, c)``; both default to zeros when ``hx`` is None."""
        if hx is None:
            h = self.init_hidden(x)
            c = self.init_hidden(x)
        else:
            h, c = hx
            if x.ndim == 1:
                expected = (self.hidden_size, )
            elif x.ndim == 2:
                expected = (x.shape[0], self.hidden_size)
            else:
                expected = None
            assert (expected is not None
                    and h.shape == expected), "Wrong hidden state input!"
            assert (expected is not None
                    and c.shape == expected), "Wrong cell state input!"

        gates = x @ self.Wx + h @ self.Wh
        if self.has_bias:
            gates = gates + self.bias

        # First 3/4: sigmoid gates, last 1/4: tanh candidate
        sigmoid_part, candidate_part = core.hsplit(gates,
                                                   [3 * self.hidden_size])
        squashed, candidate = F.sigmoid(sigmoid_part), F.tanh(candidate_part)
        forget_gate, input_gate, output_gate = core.hsplit(squashed, 3)

        c = forget_gate * c + input_gate * candidate
        h = output_gate * F.tanh(c)
        return h, c

    def init_hidden(self, x):
        """Zero state matching a 1-D sample or a 2-D batch ``x``."""
        assert x.ndim in {1, 2}
        if x.ndim == 1:
            shape = self.hidden_size
        else:
            shape = (x.shape[0], self.hidden_size)
        return zeros(shape, **self.kwargs)

    def reset_paramters(self):
        """Re-draw all parameters uniformly from ``±sqrt(1 / hidden_size)``."""
        limit = math.sqrt(1 / self.hidden_size)
        targets = [self.Wx, self.Wh]
        if self.has_bias:
            targets.append(self.bias)
        for target in targets:
            init.uniform_(target, -limit, limit)

    def __repr__(self) -> str:
        template = "{}({}, {}, bias={})"
        return template.format(
            self.__class__.__name__,
            self.input_size,
            self.hidden_size,
            self.has_bias,
        )

    def move(self, device):
        # Keep the factory kwargs in sync so later zero states use the new device
        self.kwargs.update(device=device)
        return super().move(device)
316 |
317 |
318 | class LSTM(Module):
319 |
320 | def __init__(
321 | self,
322 | input_size: int,
323 | hidden_size: int,
324 | num_layers: int = 1,
325 | bias: bool = True,
326 | batch_first: bool = False,
327 | bidirectional: bool = False,
328 | device=None,
329 | dtype=None,
330 | ) -> None:
331 | super().__init__()
332 | self.input_size = input_size
333 | self.hidden_size = hidden_size
334 | self.num_layers = num_layers
335 | self.has_bias = bias
336 | self.batch_first = batch_first
337 | self.bidirectional = bidirectional
338 | self.kwargs = {"device": Device(device), "dtype": dtype}
339 |
340 | assert num_layers > 0
341 | size_list = [input_size] + [hidden_size] * (num_layers - 1)
342 | self.LSTMCells: List[LSTMCell] = []
343 | for i in range(num_layers):
344 | cell = LSTMCell(
345 | size_list[i],
346 | hidden_size,
347 | bias,
348 | **self.kwargs,
349 | )
350 | setattr(self, 'lstm_{}'.format(i), cell)
351 | self.LSTMCells.append(cell)
352 | if self.bidirectional:
353 | self.rLSTMCells: List[LSTMCell] = []
354 | for i in range(num_layers):
355 | cell = LSTMCell(
356 | size_list[i],
357 | hidden_size,
358 | bias,
359 | **self.kwargs,
360 | )
361 | setattr(self, 'rlstm_{}'.format(i), cell)
362 | self.rLSTMCells.append(cell)
363 |
364 | def forward(self, x, hx: Optional[Tuple] = None):
365 | if self.batch_first and x.ndim == 3:
366 | x = x.swapaxes(0, 1)
367 |
368 | if hx is None:
369 | h = self.init_hidden(x)
370 | c = self.init_hidden(x)
371 | else:
372 | d = 2 if self.bidirectional else 1
373 | h, c = hx
374 | assert (x.ndim == 2
375 | and h.shape == (d * self.num_layers, self.hidden_size)
376 | ) or (x.ndim == 3 and h.shape
377 | == (d * self.num_layers, x.shape[1],
378 | self.hidden_size)), "Wrong hidden state input!"
379 | assert (x.ndim == 2
380 | and c.shape == (d * self.num_layers, self.hidden_size)
381 | ) or (x.ndim == 3 and c.shape
382 | == (d * self.num_layers, x.shape[1],
383 | self.hidden_size)), "Wrong cell state input!"
384 |
385 | if self.num_layers == 1 and not self.bidirectional:
386 | h_list, c_list = self.cell_forward(
387 | self.LSTMCells[0],
388 | x,
389 | (h[0], c[0]),
390 | )
391 | output = core.concat(h_list)
392 | hn = h_list[-1]
393 | cn = c_list[-1]
394 | elif self.num_layers == 1 and self.bidirectional:
395 | h_list, c_list = self.cell_forward(
396 | self.LSTMCells[0],
397 | x,
398 | (h[0], c[0]),
399 | )
400 | hr_list, cr_list = self.cell_forward(
401 | self.rLSTMCells[0],
402 | x[::-1],
403 | (h[1], c[1]),
404 | )
405 | output = core.concat(
406 | [core.concat(h_list),
407 | core.concat(hr_list[::-1])], axis=-1)
408 | hn = core.concat([h_list[-1], hr_list[-1]])
409 | cn = core.concat([c_list[-1], cr_list[-1]])
410 | elif self.num_layers > 1 and not self.bidirectional:
411 | hn_list, cn_list = [], []
412 | for i in range(self.num_layers):
413 | h_list, c_list = self.cell_forward(
414 | self.LSTMCells[i],
415 | x if i == 0 else core.concat(h_list),
416 | (h[i], c[i]),
417 | )
418 | hn_list.append(h_list[-1])
419 | cn_list.append(c_list[-1])
420 | output = core.concat(h_list)
421 | hn = core.concat(hn_list)
422 | cn = core.concat(cn_list)
423 | else:
424 | hn_list, hrn_list = [], []
425 | cn_list, crn_list = [], []
426 | for i in range(self.num_layers):
427 | h_list, c_list = self.cell_forward(
428 | self.LSTMCells[i],
429 | x if i == 0 else core.concat(h_list),
430 | (h[i], c[i]),
431 | )
432 | hr_list, cr_list = self.cell_forward(
433 | self.rLSTMCells[i],
434 | x[::-1] if i == 0 else core.concat(hr_list),
435 | (h[i + self.num_layers], c[i + self.num_layers]),
436 | )
437 | hn_list.append(h_list[-1])
438 | hrn_list.append(hr_list[-1])
439 | cn_list.append(c_list[-1])
440 | crn_list.append(cr_list[-1])
441 | output = core.concat(
442 | [core.concat(h_list),
443 | core.concat(hr_list[::-1])], axis=-1)
444 | hn = core.concat(hn_list + hrn_list)
445 | cn = core.concat(cn_list + crn_list)
446 | if self.batch_first and x.ndim == 3:
447 | output = output.swapaxes(0, 1)
448 | hn = hn.swapaxes(0, 1)
449 | cn = cn.swapaxes(0, 1)
450 |
451 | return output, (hn, cn)
452 |
def reset_parameters(self):
    """Re-initialise the weights of every cell owned by this layer.

    Invokes ``reset_parameters()`` on the first ``self.num_layers`` cells
    of ``self.LSTMCells`` and, when ``self.bidirectional`` is true, on the
    first ``self.num_layers`` cells of ``self.rLSTMCells``.
    """
    # NOTE(review): the cell method is assumed to be spelled
    # ``reset_parameters`` (the spelling of this method and of
    # ``GRUCell.reset_parameters``); the call previously used the
    # misspelling ``reset_paramters`` -- confirm against the LSTM cell class.
    for i in range(self.num_layers):
        self.LSTMCells[i].reset_parameters()
    if self.bidirectional:
        for i in range(self.num_layers):
            self.rLSTMCells[i].reset_parameters()
459 |
def init_hidden(self, x):
    """Create an all-zero initial state sized for the input ``x``.

    ``x`` must be 2-D or 3-D. The leading dimension of the result is
    ``num_layers`` (doubled when the layer is bidirectional) and the
    trailing dimension is ``hidden_size``; for 3-D input ``x.shape[1]`` is
    inserted between them as the batch dimension. The tensor is created
    with the device/dtype stored in ``self.kwargs``.
    """
    assert x.ndim in {2, 3}
    directions = 2 if self.bidirectional else 1
    leading = directions * self.num_layers
    if x.ndim == 3:
        shape = (leading, x.shape[1], self.hidden_size)
    else:
        shape = (leading, self.hidden_size)
    return zeros(shape, **self.kwargs)
474 |
def cell_forward(self, cell: RNNCell, x, h: Tuple):
    """Unroll ``cell`` over the leading axis of ``x``.

    ``h`` is the initial state; the cell is called as ``cell(x[t], state)``
    and its return value becomes the state for the next step. Elements
    ``[0]`` and ``[1]`` of every returned state are collected, each with a
    new leading axis of length one.

    Returns:
        ``(h_list, c_list)`` -- two lists with one entry per step.
    """
    state = h  # a (hidden, cell) pair despite the parameter's name
    hidden_steps = []
    memory_steps = []
    for t in range(x.shape[0]):
        state = cell(x[t], state)
        hidden_steps.append(core.unsqueeze(state[0], axis=0))
        memory_steps.append(core.unsqueeze(state[1], axis=0))
    return hidden_steps, memory_steps
483 |
def __repr__(self) -> str:
    """Return a constructor-like summary of the layer's configuration."""
    template = "{}({}, {}, num_layers={}, bias={}, batch_first={}, bidirectional={})"
    fields = (
        type(self).__name__,
        self.input_size,
        self.hidden_size,
        self.num_layers,
        self.has_bias,
        self.batch_first,
        self.bidirectional,
    )
    return template.format(*fields)
494 |
def move(self, device):
    """Record ``device`` for later state creation, then defer to the parent.

    The stored ``self.kwargs`` entry is what ``init_hidden`` forwards when
    it builds fresh state tensors; the actual move is done by the parent
    class's ``move`` and its result is returned unchanged.
    """
    self.kwargs.update(device=device)
    return super().move(device)
498 |
499 |
class GRUCell(Module):
    """One step of a gated recurrent unit.

    For an input ``x`` and previous hidden state ``h`` the cell computes::

        z, r = split(sigmoid(x @ Wx1 + h @ Wh1 [+ bias1]), 2)
        n    = tanh(x @ Wx2 + (r * h) @ Wh2 [+ bias2])
        h'   = (1 - z) * h + z * n

    ``x`` may be a single sample (1-D) or a batch (2-D, batch first).
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        bias: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        """Allocate the weights and initialise them.

        Args:
            input_size: size of the last axis of the input.
            hidden_size: size of the hidden state.
            bias: when true, additive bias vectors are created and used.
            device: forwarded (wrapped in ``Device``) to tensor creation.
            dtype: forwarded to tensor creation.
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        # Reused by ``init_hidden`` so new states match the parameters.
        self.kwargs = {"device": Device(device), "dtype": dtype}

        # Wx1 / Wh1 hold both gates side by side (hence 2 * hidden_size);
        # Wx2 / Wh2 produce the candidate state.
        self.Wx1 = Parameter(
            empty((input_size, 2 * hidden_size), **self.kwargs))
        self.Wh1 = Parameter(
            empty((hidden_size, 2 * hidden_size), **self.kwargs))
        self.Wx2 = Parameter(empty((input_size, hidden_size), **self.kwargs))
        self.Wh2 = Parameter(empty((hidden_size, hidden_size), **self.kwargs))

        if bias:
            self.bias1 = Parameter(empty(2 * self.hidden_size, **self.kwargs))
            self.bias2 = Parameter(empty(self.hidden_size, **self.kwargs))

        self.has_bias = bias
        self.reset_parameters()

    def forward(self, x, h=None):
        """Advance the hidden state by one step.

        Args:
            x: 1-D input, or 2-D input whose first axis is the batch.
            h: previous hidden state of shape ``(hidden_size,)`` for 1-D
                input or ``(batch, hidden_size)`` for 2-D input. When
                ``None`` an all-zero state is used.

        Returns:
            The new hidden state, same shape as ``h``.

        Raises:
            AssertionError: if a supplied ``h`` does not have the shape
                described above, or ``x`` is not 1-D/2-D when ``h`` is
                omitted.
        """
        if h is None:
            h = self.init_hidden(x)
        else:
            assert (x.ndim == 1 and h.shape == (self.hidden_size, )) or (
                x.ndim == 2 and h.shape
                == (x.shape[0], self.hidden_size)), "Wrong hidden state input!"

        lin1 = x @ self.Wx1 + h @ self.Wh1
        if self.has_bias:
            lin1 = lin1 + self.bias1
        # Split on the feature (last) axis. A fixed ``axis=1`` only exists
        # for batched input; for 1-D input the features live on axis 0.
        z, r = core.split(F.sigmoid(lin1), 2, axis=x.ndim - 1)
        lin2 = x @ self.Wx2 + (r * h) @ self.Wh2
        if self.has_bias:
            lin2 = lin2 + self.bias2
        return (1 - z) * h + z * F.tanh(lin2)

    def reset_parameters(self):
        """Redraw every parameter uniformly from ``[-b, b]``,
        ``b = sqrt(1 / hidden_size)``."""
        bound = math.sqrt(1 / self.hidden_size)
        # Order is kept stable so seeded runs draw the same values.
        for weight in (self.Wx1, self.Wx2, self.Wh1, self.Wh2):
            init.uniform_(weight, -bound, bound)
        if self.has_bias:
            init.uniform_(self.bias1, -bound, bound)
            init.uniform_(self.bias2, -bound, bound)

    def init_hidden(self, x):
        """Return an all-zero hidden state matching 1-D or 2-D ``x``."""
        assert x.ndim in {1, 2}
        if x.ndim == 1:
            return zeros(self.hidden_size, **self.kwargs)
        batch_size = x.shape[0]
        return zeros((batch_size, self.hidden_size), **self.kwargs)

    def __repr__(self) -> str:
        return "{}({}, {}, bias={})".format(
            self.__class__.__name__,
            self.input_size,
            self.hidden_size,
            self.has_bias,
        )

    def move(self, device):
        """Remember ``device`` for future ``init_hidden`` calls, then let
        the parent class perform the move."""
        self.kwargs['device'] = device
        return super().move(device)
575 |
576 |
class GRU(Module):
    """Multi-layer, optionally bidirectional GRU built from ``GRUCell``.

    The forward stack lives in ``self.GRUCells``; when ``bidirectional`` is
    true an independent reverse stack lives in ``self.rGRUCells`` and is
    fed the time-reversed input. Within each stack, layer ``i > 0`` consumes
    the per-step outputs of layer ``i - 1`` of the *same* stack.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        num_layers: int = 1,
        bias: bool = True,
        batch_first: bool = False,
        bidirectional: bool = False,
        device=None,
        dtype=None,
    ) -> None:
        """Create the cells.

        Args:
            input_size: input size of the first layer's cells.
            hidden_size: hidden size of every cell (and input size of
                layers after the first).
            num_layers: number of stacked layers; must be positive.
            bias: forwarded to every ``GRUCell``.
            batch_first: when true, 3-D inputs are taken batch-major.
            bidirectional: when true, a reverse stack is created as well.
            device: forwarded (wrapped in ``Device``) to the cells.
            dtype: forwarded to the cells.
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.has_bias = bias
        self.batch_first = batch_first
        self.bidirectional = bidirectional
        self.kwargs = {"device": Device(device), "dtype": dtype}

        assert num_layers > 0
        size_list = [input_size] + [hidden_size] * (num_layers - 1)
        self.GRUCells: List[GRUCell] = []
        self._build_cells(self.GRUCells, 'gru_{}', size_list, bias)
        if self.bidirectional:
            self.rGRUCells: List[GRUCell] = []
            self._build_cells(self.rGRUCells, 'rgru_{}', size_list, bias)

    def _build_cells(self, store, name_format, size_list, bias):
        """Create one cell per entry of ``size_list``, expose each as an
        attribute named ``name_format.format(i)`` and append it to ``store``."""
        for i, in_size in enumerate(size_list):
            cell = GRUCell(
                in_size,
                self.hidden_size,
                bias,
                **self.kwargs,
            )
            setattr(self, name_format.format(i), cell)
            store.append(cell)

    def forward(self, x, h=None):
        """Run the whole sequence through the network.

        Args:
            x: 2-D (unbatched) or 3-D input. The leading axis is iterated
                as time; with ``batch_first`` a 3-D input has its first two
                axes swapped before processing.
            h: initial state of shape ``(d * num_layers, hidden_size)`` for
                2-D input or ``(d * num_layers, batch, hidden_size)`` for
                3-D input, ``d`` being 2 when bidirectional else 1. Forward
                layers use rows ``0 .. num_layers - 1``; reverse layers use
                the following ``num_layers`` rows. Zeros when ``None``.

        Returns:
            ``(output, hn)``: the last layer's per-step states (both
            directions concatenated on the last axis when bidirectional)
            and the final state of every layer.

        Raises:
            AssertionError: if a supplied ``h`` has the wrong shape.

        NOTE(review): with ``batch_first`` and 3-D input, ``hn`` is returned
        with its first two axes swapped as well, while an ``h`` passed in is
        validated against the non-swapped layout -- confirm this asymmetry
        is intended before relying on feeding ``hn`` back in.
        """
        if self.batch_first and x.ndim == 3:
            x = x.swapaxes(0, 1)

        if h is None:
            h = self.init_hidden(x)
        else:
            d = 2 if self.bidirectional else 1
            assert (x.ndim == 2
                    and h.shape == (d * self.num_layers, self.hidden_size)
                    ) or (x.ndim == 3 and h.shape
                          == (d * self.num_layers, x.shape[1],
                              self.hidden_size)), "Wrong hidden state input!"

        h_list, hn_list = self._run_stack(self.GRUCells, x, h, 0)
        if self.bidirectional:
            hr_list, hrn_list = self._run_stack(
                self.rGRUCells,
                x[::-1],
                h,
                self.num_layers,
            )
            # The reverse stack saw time backwards; flip its steps so both
            # halves of ``output`` are aligned on the same time index.
            output = core.concat(
                [core.concat(h_list),
                 core.concat(hr_list[::-1])], axis=-1)
            hn = core.concat(hn_list + hrn_list)
        else:
            output = core.concat(h_list)
            # A single final state already has the leading layer axis.
            hn = hn_list[0] if self.num_layers == 1 else core.concat(hn_list)

        if self.batch_first and x.ndim == 3:
            output = output.swapaxes(0, 1)
            hn = hn.swapaxes(0, 1)
        return output, hn

    def _run_stack(self, cells, x, h, offset):
        """Feed ``x`` through ``cells`` layer by layer.

        Layer ``i`` starts from ``h[offset + i]``; layers after the first
        take the concatenated per-step outputs of the previous layer.

        Returns:
            ``(steps, finals)``: the last layer's per-step states and the
            list of each layer's final state.
        """
        steps = None
        finals = []
        for i, cell in enumerate(cells):
            layer_input = x if steps is None else core.concat(steps)
            steps = self.cell_forward(cell, layer_input, h[offset + i])
            finals.append(steps[-1])
        return steps, finals

    def reset_parameters(self):
        """Re-initialise the weights of every cell in both stacks."""
        for cell in self.GRUCells:
            cell.reset_parameters()
        if self.bidirectional:
            for cell in self.rGRUCells:
                cell.reset_parameters()

    def init_hidden(self, x):
        """Return an all-zero initial state for 2-D or 3-D ``x``; for 3-D
        input ``x.shape[1]`` is used as the batch dimension."""
        assert x.ndim in {2, 3}
        d = 2 if self.bidirectional else 1
        if x.ndim == 2:
            return zeros(
                (d * self.num_layers, self.hidden_size),
                **self.kwargs,
            )
        return zeros(
            (d * self.num_layers, x.shape[1], self.hidden_size),
            **self.kwargs,
        )

    def cell_forward(self, cell: GRUCell, x, h):
        """Unroll ``cell`` over the leading axis of ``x`` starting from
        state ``h``; return the per-step states, each with a new leading
        axis of length one."""
        seq_len = x.shape[0]
        h_list = []
        for i in range(seq_len):
            h = cell(x[i], h)
            h_list.append(core.unsqueeze(h, axis=0))
        return h_list

    def __repr__(self) -> str:
        return "{}({}, {}, num_layers={}, bias={}, batch_first={}, bidirectional={})".format(
            self.__class__.__name__,
            self.input_size,
            self.hidden_size,
            self.num_layers,
            self.has_bias,
            self.batch_first,
            self.bidirectional,
        )

    def move(self, device):
        """Remember ``device`` for future ``init_hidden`` calls, then let
        the parent class perform the move."""
        self.kwargs['device'] = device
        return super().move(device)
724 |
--------------------------------------------------------------------------------