├── .gitignore
├── imgs
    ├── ad1d.png
    ├── ad2d.png
    ├── rnn.png
    ├── mlp_cnn.png
    ├── dropout_bn.png
    └── transformer.png
├── pytest.ini
├── .gitattributes
├── llm
    ├── clip
    │   ├── picture.png
    │   ├── tokenizer.py
    │   ├── model.py
    │   └── infer.py
    └── llama
    │   ├── tokenizer.py
    │   ├── infer.py
    │   └── model.py
├── pydynet
    ├── nn
    │   ├── __init__.py
    │   ├── parameter.py
    │   ├── modules
    │   │   ├── dropout.py
    │   │   ├── loss.py
    │   │   ├── __init__.py
    │   │   ├── activation.py
    │   │   ├── pool.py
    │   │   ├── linear.py
    │   │   ├── conv.py
    │   │   ├── module.py
    │   │   ├── norm.py
    │   │   └── rnn.py
    │   ├── init.py
    │   └── functional.py
    ├── optim
    │   ├── __init__.py
    │   ├── lr_scheduler.py
    │   └── optimizer.py
    ├── core
    │   ├── __init__.py
    │   └── function.py
    ├── __init__.py
    ├── autograd.py
    ├── special.py
    ├── cuda.py
    └── data.py
├── requirements.txt
├── tests
    ├── test_backward.py
    └── test_tensor_basic.py
├── setup.py
├── LICENSE
├── .github
    └── workflows
    │   └── python-publish.yml
├── examples
    ├── pydynet
    │   ├── autograd1d.py
    │   ├── autograd2d.py
    │   ├── ts_prediction.py
    │   ├── mnist.py
    │   ├── dropout_bn.py
    │   └── transformer.py
    └── pytorch
    │   ├── ts_prediction.py
    │   ├── mnist.py
    │   ├── dropout_bn.py
    │   └── transformer.py
├── cnREADME.md
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | __pycache__/
3 | data


--------------------------------------------------------------------------------
/imgs/ad1d.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WeltXing/PyDyNet/HEAD/imgs/ad1d.png


--------------------------------------------------------------------------------
/imgs/ad2d.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WeltXing/PyDyNet/HEAD/imgs/ad2d.png


--------------------------------------------------------------------------------
/imgs/rnn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WeltXing/PyDyNet/HEAD/imgs/rnn.png


--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | filterwarnings = 
3 |     ignore::UserWarning:pydynet


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 


--------------------------------------------------------------------------------
/imgs/mlp_cnn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WeltXing/PyDyNet/HEAD/imgs/mlp_cnn.png


--------------------------------------------------------------------------------
/imgs/dropout_bn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WeltXing/PyDyNet/HEAD/imgs/dropout_bn.png


--------------------------------------------------------------------------------
/imgs/transformer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WeltXing/PyDyNet/HEAD/imgs/transformer.png


--------------------------------------------------------------------------------
/llm/clip/picture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WeltXing/PyDyNet/HEAD/llm/clip/picture.png


--------------------------------------------------------------------------------
/pydynet/nn/__init__.py:
--------------------------------------------------------------------------------
1 | from .modules import *
2 | from .parameter import Parameter
3 | from . import init


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=2.0.0
2 | cupy-cuda12x # For Cuda 12.x; refer to https://docs.cupy.dev/en/stable/install.html
3 | 


--------------------------------------------------------------------------------
/tests/test_backward.py:
--------------------------------------------------------------------------------
 1 | import sys, pytest, random
 2 | import numpy as np
 3 | 
# Extend the import search path with '../pydynet'. The path is relative, so it
# is resolved against the current working directory at import time.
sys.path.append('../pydynet')

# Seed both NumPy's and the stdlib's global RNGs so random inputs repeat.
np.random.seed(0)
random.seed(0)

# Floating-point dtypes; presumably the tests iterate over these -- the rest
# of the file is not visible here, so confirm against the test functions.
type_list = [np.float16, np.float32, np.float64]
10 | 


--------------------------------------------------------------------------------
/pydynet/optim/__init__.py:
--------------------------------------------------------------------------------
 1 | from .optimizer import (
 2 |     Optimizer,
 3 |     SGD,
 4 |     Adagrad,
 5 |     Adadelta,
 6 |     Adam,
 7 | )
 8 | from .lr_scheduler import (
 9 |     _LRScheduler,
10 |     ExponentialLR,
11 |     StepLR,
12 |     MultiStepLR,
13 |     CosineAnnealingLR,
14 | )
15 | 


--------------------------------------------------------------------------------
/pydynet/core/__init__.py:
--------------------------------------------------------------------------------
1 | from .tensor import (Tensor, add, sub, mul, div, pow, matmul, abs, sum, mean,
2 |                      min, max, min, argmax, argmin, maximum, minimum, exp, log,
3 |                      sign, reshape, transpose, swapaxes, concat, sigmoid, tanh)
4 | from .function import sqrt, square, vsplit, hsplit, dsplit, split, unsqueeze, squeeze
5 | 


--------------------------------------------------------------------------------
/pydynet/nn/parameter.py:
--------------------------------------------------------------------------------
 1 | from ..core import Tensor
 2 | 
 3 | 
 4 | class Parameter(Tensor):
 5 | 
 6 |     def __init__(self, data: Tensor, requires_grad: bool = True) -> None:
 7 |         super().__init__(
 8 |             data=data.data,
 9 |             dtype=data.dtype,
10 |             device=data.device,
11 |             copy=False,
12 |             requires_grad=requires_grad,
13 |         )
14 | 
15 |     def __repr__(self) -> str:
16 |         return "Parameter : \n{}".format(self.data) + (",\ndevice={}".format(
17 |             self.device) if self.device.device != "cpu" else "")
18 | 


--------------------------------------------------------------------------------
/pydynet/nn/modules/dropout.py:
--------------------------------------------------------------------------------
 1 | from .module import Module
 2 | from ...core import Tensor
 3 | from ...special import rand
 4 | 
 5 | 
 6 | class Dropout(Module):
 7 | 
 8 |     def __init__(self, p: float = 0.5) -> None:
 9 |         super().__init__()
10 |         assert p >= 0 and p < 1
11 |         self.p = p
12 | 
13 |     def forward(self, x) -> Tensor:
14 |         if self._train:
15 |             mask = rand(*x.shape, device=x.device) < 1 - self.p
16 |             return x * mask.astype(x.dtype) / (1 - self.p)
17 |         return x
18 | 
19 |     def __repr__(self) -> str:
20 |         return "{}(p={})".format(self.__class__.__name__, self.p)
21 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import setuptools
 2 | 
 3 | setuptools.setup(
 4 |     name='pydynet',
 5 |     version='1.0',
 6 |     description=
 7 |     'PyDyNet: Neuron Network (MLP, CNN, RNN, Transformer, ...) implementation using Numpy with Autodiff',
 8 |     author="Cun-Yuan Xing",
 9 |     author_email="xingcy@lamda.nju.edu.cn",
10 |     maintainer="Cun-Yuan Xing",
11 |     maintainer_email="xingcy@lamad.nju.edu.cn",
12 |     packages=[
13 |         'pydynet', 'pydynet/optim', 'pydynet/nn', 'pydynet/nn/modules',
14 |         'pydynet/core'
15 |     ],
16 |     license='MIT License',
17 |     install_requires=['numpy>=2.0.0'],
18 |     long_description=open('README.md', encoding='utf-8').read(),
19 |     long_description_content_type="text/markdown",
20 |     url='https://github.com/WeltXing/PyDyNet',
21 | )
22 | 


--------------------------------------------------------------------------------
/pydynet/__init__.py:
--------------------------------------------------------------------------------
 1 | from .core import (Tensor, add, sub, mul, div, pow, matmul, abs, sum, mean,
 2 |                    min, max, min, argmax, argmin, maximum, minimum, exp, log,
 3 |                    sign, reshape, transpose, swapaxes, concat, sigmoid, tanh,
 4 |                    sqrt, square, vsplit, hsplit, dsplit, split, unsqueeze,
 5 |                    squeeze)
 6 | from .special import zeros, ones, rand, randn, empty, uniform
 7 | from .cuda import Device
 8 | from .autograd import enable_grad, no_grad
 9 | 
10 | __all__ = [
11 |     "Tensor", "add", "sub", "mul", "div", "pow", "matmul", "abs", "sum",
12 |     "mean", "min", "max", "argmax", "argmin", "maximum", "minimum", "exp",
13 |     "log", "sign", "reshape", "transpose", "swapaxes", "concat", 'sigmoid',
14 |     'tanh', "sqrt", "square", "vsplit", "hsplit", "dsplit", "split",
15 |     "unsqueeze", "squeeze", "zeros", "ones", "rand", "randn", "empty",
16 |     "uniform", "Device", "enable_grad", "no_grad"
17 | ]
18 | 


--------------------------------------------------------------------------------
/pydynet/nn/modules/loss.py:
--------------------------------------------------------------------------------
 1 | from .module import Module
 2 | from .. import functional as F
 3 | from ...core import Tensor
 4 | 
 5 | 
 6 | class Loss(Module):
 7 |     '''损失函数基类'''
 8 | 
 9 |     def __init__(self, reduction='mean') -> None:
10 |         super().__init__()
11 |         self.reduction = reduction
12 |         assert self.reduction in {'mean', 'sum'}
13 | 
14 |     def forward(self, y_pred: Tensor, y_true: Tensor) -> Tensor:
15 |         raise NotImplementedError
16 | 
17 | 
class MSELoss(Loss):
    '''Loss module that delegates to ``F.mse_loss``.'''

    def forward(self, y_pred: Tensor, y_true: Tensor) -> Tensor:
        loss = F.mse_loss(y_pred, y_true, reduction=self.reduction)
        return loss
22 | 
23 | 
class NLLLoss(Loss):
    '''Loss module that delegates to ``F.nll_loss``.'''

    def forward(self, y_pred: Tensor, y_true: Tensor) -> Tensor:
        loss = F.nll_loss(y_pred, y_true, reduction=self.reduction)
        return loss
28 | 
29 | 
class CrossEntropyLoss(Loss):
    '''Loss module that delegates to ``F.cross_entropy_loss``.'''

    def forward(self, y_pred: Tensor, y_true: Tensor) -> Tensor:
        loss = F.cross_entropy_loss(y_pred, y_true, reduction=self.reduction)
        return loss
34 | 


--------------------------------------------------------------------------------
/pydynet/nn/modules/__init__.py:
--------------------------------------------------------------------------------
 1 | from .activation import Sigmoid, Tanh, ReLU, LeakyReLU, Softmax
 2 | from .norm import BatchNorm1d, BatchNorm2d, LayerNorm, RMSNorm
 3 | from .conv import Conv1d, Conv2d
 4 | from .pool import MaxPool1d, MaxPool2d, AvgPool1d, AvgPool2d
 5 | from .dropout import Dropout
 6 | from .linear import Linear, Embedding
 7 | from .loss import MSELoss, NLLLoss, CrossEntropyLoss
 8 | from .module import Module, Sequential, ModuleList
 9 | from .rnn import RNN, LSTM, GRU, RNNCell, LSTMCell, GRUCell
10 | 
11 | __all__ = [
12 |     "Sigmoid",
13 |     "Tanh",
14 |     "ReLU",
15 |     "LeakyReLU",
16 |     "Softmax",
17 |     "BatchNorm1d",
18 |     "BatchNorm2d",
19 |     "LayerNorm",
20 |     "RMSNorm",
21 |     "Conv1d",
22 |     "Conv2d",
23 |     "MaxPool1d",
24 |     "MaxPool2d",
25 |     "AvgPool1d",
26 |     "AvgPool2d",
27 |     "Dropout",
28 |     "Linear",
29 |     "Embedding",
30 |     "MSELoss",
31 |     "NLLLoss",
32 |     "CrossEntropyLoss",
33 |     "Module",
34 |     "Sequential",
35 |     "ModuleList",
36 |     "RNN",
37 |     "LSTM",
38 |     "GRU",
39 |     "RNNCell",
40 |     "LSTMCell",
41 |     "GRUCell",
42 | ]
43 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Welt Xing
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/pydynet/autograd.py:
--------------------------------------------------------------------------------
 1 | import functools
 2 | 
 3 | grad_enable = True
 4 | 
 5 | 
 6 | def is_grad_enable():
 7 |     return grad_enable
 8 | 
 9 | 
10 | def set_grad_enabled(mode: bool):
11 |     global grad_enable
12 |     grad_enable = mode
13 | 
14 | 
class no_grad:
    '''Context manager and decorator that switches the gradient flag off.

    Entering stores the current flag value on the instance and sets the flag
    to ``False``; leaving writes the stored value back. Used as a decorator,
    every call of the wrapped function runs inside a new instance.
    '''

    def __enter__(self) -> None:
        previous = is_grad_enable()
        self.prev = previous
        set_grad_enabled(False)

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        # Returns None, so an exception raised inside the block propagates.
        set_grad_enabled(self.prev)

    def __call__(self, func):
        '''Wrap ``func`` so that each call executes with the flag off.'''

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # A fresh instance per call, so `prev` is not shared across calls.
            with __class__():
                return func(*args, **kwargs)

        return wrapper
32 | 
33 | 
class enable_grad:
    '''Context manager and decorator that switches the gradient flag on.

    Entering stores the current flag value on the instance and sets the flag
    to ``True``; leaving writes the stored value back. Used as a decorator,
    every call of the wrapped function runs inside a new instance.
    '''

    def __enter__(self) -> None:
        previous = is_grad_enable()
        self.prev = previous
        set_grad_enabled(True)

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        # Returns None, so an exception raised inside the block propagates.
        set_grad_enabled(self.prev)

    def __call__(self, func):
        '''Wrap ``func`` so that each call executes with the flag on.'''

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # A fresh instance per call, so `prev` is not shared across calls.
            with __class__():
                return func(*args, **kwargs)

        return wrapper
51 | 


--------------------------------------------------------------------------------
/pydynet/nn/modules/activation.py:
--------------------------------------------------------------------------------
 1 | from .module import Module
 2 | from .. import functional as F
 3 | from ...core import Tensor
 4 | 
 5 | 
 6 | class Sigmoid(Module):
 7 |     '''激活函数层 : Sigmoid'''
 8 | 
 9 |     def forward(self, x) -> Tensor:
10 |         return F.sigmoid(x)
11 | 
12 |     def __repr__(self) -> str:
13 |         return "{}()".format(self.__class__.__name__)
14 | 
15 | 
class Tanh(Module):
    '''Activation layer: Tanh (delegates to ``F.tanh``).'''

    def forward(self, x) -> Tensor:
        activated = F.tanh(x)
        return activated

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return "{}()".format(cls_name)
24 | 
25 | 
class ReLU(Module):
    '''Activation layer: ReLU (delegates to ``F.relu``).'''

    def forward(self, x) -> Tensor:
        activated = F.relu(x)
        return activated

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return "{}()".format(cls_name)
34 | 
35 | 
class LeakyReLU(Module):
    '''
    Activation layer: LeakyReLU (delegates to ``F.leaky_relu``).

    Parameter
    ---------
    alpha : float
        Slope applied to negative inputs; converted with ``float()`` on
        construction.
    '''

    def __init__(self, alpha: float = 0.1) -> None:
        super().__init__()
        slope = float(alpha)
        self.alpha = slope

    def forward(self, x) -> Tensor:
        activated = F.leaky_relu(x, self.alpha)
        return activated

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return "{}(alpha={})".format(cls_name, self.alpha)
55 | 
56 | 
class Softmax(Module):
    '''
    Activation layer: softmax (delegates to ``F.softmax``).

    Parameter
    ---------
    axis : Optional[Tuple[int]], default=None
        Axis along which softmax is computed; passed through to
        ``F.softmax`` unchanged.
    '''

    def __init__(self, axis=None) -> None:
        super().__init__()
        self.axis = axis

    def forward(self, x) -> Tensor:
        activated = F.softmax(x, self.axis)
        return activated

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return "{}(axis={})".format(cls_name, self.axis)
76 | 


--------------------------------------------------------------------------------
/llm/llama/tokenizer.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | import json
 3 | 
 4 | 
 5 | class Tokenizer:
 6 |     def __init__(self, model_path: str):
 7 |         with open(model_path, "r", encoding="utf-8") as f:
 8 |             model = json.load(f)
 9 |         self.vocab = model["tokens"]
10 |         self.scores = model["scores"]
11 |         self.bos_id = 1
12 |         self.eos_id = 2
13 | 
14 |     def str_lookup(self, token: str) -> int:
15 |         try:
16 |             index = self.vocab.index(token)
17 |             return index
18 |         except ValueError as err:
19 |             return -1
20 | 
21 |     def encode(
22 |             self,
23 |             text: str,
24 |             add_bos: bool = True,
25 |             add_eos: bool = False,
26 |     ) -> List[int]:
27 |         tokens = []
28 |         for pos, char in enumerate(text):
29 |             id = self.str_lookup(char)
30 |             if id >= 0:
31 |                 tokens.append(id)
32 |         while True:
33 |             best_score = -1e10
34 |             best_id = -1
35 |             best_idx = -1
36 | 
37 |             for i in range(len(tokens) - 1):
38 |                 # Check if we can merge the pair (tokens[i], tokens[i+1])
39 |                 string = self.vocab[tokens[i]] + self.vocab[tokens[i + 1]]
40 |                 id = self.str_lookup(string)
41 |                 if id != -1 and self.scores[id] > best_score:
42 |                     best_score = self.scores[id]
43 |                     best_id = id
44 |                     best_idx = i
45 | 
46 |             if best_idx == -1:
47 |                 break
48 | 
49 |             # Merge the consecutive pair (best_idx, best_idx+1) into new token best_id
50 |             tokens[best_idx] = best_id
51 |             # Delete token at position best_idx+1, shift the entire sequence back 1
52 |             tokens = tokens[0: best_idx + 1] + tokens[best_idx + 2:]
53 |         if add_bos:
54 |             tokens.insert(0, self.bos_id)
55 |         if add_eos:
56 |             tokens.append(self.eos_id)
57 |         return tokens
58 | 
59 |     def decode(self, ids: List[int]) -> str:
60 |         res = []
61 |         for i in ids:
62 |             token = self.vocab[i]
63 |             res.append(token)
64 |         text = "".join(res)
65 |         text = text.strip("<s>").strip("</s>")
66 |         return text
67 | 


--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will upload a Python Package to PyPI when a release is created
 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
 3 | 
 4 | # This workflow uses actions that are not certified by GitHub.
 5 | # They are provided by a third-party and are governed by
 6 | # separate terms of service, privacy policy, and support
 7 | # documentation.
 8 | 
 9 | name: PyDyNet
10 | 
11 | on:
12 |   release:
13 |     types: [published]
14 | 
15 | permissions:
16 |   contents: read
17 | 
18 | jobs:
19 |   release-build:
20 |     runs-on: ubuntu-latest
21 | 
22 |     steps:
23 |       - uses: actions/checkout@v4
24 | 
25 |       - uses: actions/setup-python@v5
26 |         with:
27 |           python-version: "3.x"
28 | 
29 |       - name: Build release distributions
30 |         run: |
31 |           # NOTE: put your own distribution build steps here.
32 |           python -m pip install build
33 |           python -m build
34 | 
35 |       - name: Upload distributions
36 |         uses: actions/upload-artifact@v4
37 |         with:
38 |           name: release-dists
39 |           path: dist/
40 | 
41 |   pypi-publish:
42 |     runs-on: ubuntu-latest
43 |     needs:
44 |       - release-build
45 |     permissions:
46 |       # IMPORTANT: this permission is mandatory for trusted publishing
47 |       id-token: write
48 | 
49 |     # Dedicated environments with protections for publishing are strongly recommended.
50 |     # For more information, see: https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment#deployment-protection-rules
51 |     environment:
52 |       name: pypi
53 |       # OPTIONAL: uncomment and update to include your PyPI project URL in the deployment status:
54 |       # url: https://pypi.org/p/YOURPROJECT
55 |       #
56 |       # ALTERNATIVE: if your GitHub Release name is the PyPI project version string
57 |       # ALTERNATIVE: exactly, uncomment the following line instead:
58 |       # url: https://pypi.org/project/YOURPROJECT/${{ github.event.release.name }}
59 | 
60 |     steps:
61 |       - name: Retrieve release distributions
62 |         uses: actions/download-artifact@v4
63 |         with:
64 |           name: release-dists
65 |           path: dist/
66 | 
67 |       - name: Publish release distributions to PyPI
68 |         uses: pypa/gh-action-pypi-publish@release/v1
69 |         with:
70 |           packages-dir: dist/
71 | 


--------------------------------------------------------------------------------
/examples/pydynet/autograd1d.py:
--------------------------------------------------------------------------------
 1 | import pydynet as pdn
 2 | import numpy as np
 3 | import matplotlib.pyplot as plt
 4 | 
 5 | device = f'cuda:{pdn.cuda.device_count() - 1}' if pdn.cuda.is_available(
 6 | ) else 'cpu'
 7 | 
 8 | 
 9 | def auto_grad(x: float, lr: float, n_iter: int):
10 |     x_list = [x]
11 |     x: pdn.Tensor = pdn.Tensor(float(x), requires_grad=True, device=device)
12 | 
13 |     for _ in range(n_iter):
14 |         x.zero_grad()
15 |         y = pdn.log((x - 7)**2 + 6)
16 |         y.backward()
17 | 
18 |         with x.device:
19 |             x.data -= lr * x.grad
20 |         x_list.append(x.item())
21 | 
22 |     return x_list
23 | 
24 | 
def manual_grad(x: float, lr: float, n_iter: int):
    """Minimise log((x - 7)^2 + 6) by gradient descent with a hand-derived gradient.

    Returns the list of iterates: the starting value followed by one entry
    per iteration.
    """
    trajectory = [x]
    step = 0
    for step in range(n_iter):
        offset = x - 7
        # d/dx log(offset^2 + 6) = 2 * offset / (offset^2 + 6)
        slope = 2 * offset / (offset**2 + 6)
        x -= lr * slope
        trajectory.append(x)
    return trajectory
34 | 
35 | 
# Curve of the objective f(x) = log((x - 7)^2 + 6) on [0, 10].
x_ = np.linspace(0, 10, 101)
f = np.log((x_ - 7)**2 + 6)

# Gradient-descent iterates from both implementations, and their objective
# values.
x1 = np.array(auto_grad(1., 1.5, 20))
x2 = np.array(manual_grad(1., 1.5, 20))
y1 = np.log((x1 - 7)**2 + 6)
y2 = np.log((x2 - 7)**2 + 6)

plt.figure(figsize=(9, 3))

plt.rcParams['font.sans-serif'] = ['Times New Roman']
plt.rcParams['mathtext.fontset'] = 'stix'
plt.rcParams['xtick.direction'] = 'in'
plt.rcParams['ytick.direction'] = 'in'
plt.rcParams['axes.linewidth'] = 0.5

# Left panel: iterates obtained with automatic differentiation.
plt.subplot(1, 2, 1)
plt.grid()

plt.xlim(0, 10)
plt.ylim(1.5, 4)
# The legend formula matches the plotted function (constant 6, not 10).
plt.plot(x_, f, label=r"$f(x)=\log((x-7)^2+6)$", color='blue', lw=.7)
plt.scatter(x1,
            y1,
            color='red',
            marker='^',
            s=50,
            zorder=10,
            label='Gradient descent with lr=1.5')

plt.yticks([1.5, 2, 2.5, 3, 3.5, 4], size=13)
plt.xticks([2, 4, 6, 8, 10], size=13)
plt.title("Gradient descent by AutoGrad")
plt.legend()

# Right panel: iterates obtained with the hand-derived gradient (x2, y2).
plt.subplot(1, 2, 2)

plt.grid()

plt.xlim(0, 10)
plt.ylim(1.5, 4)
plt.plot(x_, f, label=r"$f(x)=\log((x-7)^2+6)$", color='blue', lw=.7)
plt.scatter(x2,
            y2,
            color='green',
            marker='*',
            s=50,
            zorder=10,
            label='Gradient descent with lr=1.5')
plt.yticks([1.5, 2, 2.5, 3, 3.5, 4], size=13)
plt.xticks([2, 4, 6, 8, 10], size=13)
plt.title("Gradient descent by Manual calculation")
plt.legend()

plt.savefig("imgs/ad1d.png")
91 | 


--------------------------------------------------------------------------------
/examples/pydynet/autograd2d.py:
--------------------------------------------------------------------------------
 1 | import pydynet as pdn
 2 | import numpy as np
 3 | import matplotlib.pyplot as plt
 4 | 
 5 | np.random.seed(42)
 6 | 
 7 | device = f'cuda:{pdn.cuda.device_count() - 1}' if pdn.cuda.is_available(
 8 | ) else 'cpu'
 9 | 
10 | x = np.random.randn(2)
11 | A = pdn.Tensor([
12 |     [3, 1.],
13 |     [1, 2.],
14 | ]).to(device)
15 | b = pdn.Tensor([-1., 1]).to(device)
16 | 
17 | 
def auto_grad(x, lr: float, n_iter: float):
    """Minimise x^T A x / 2 + b^T x by gradient descent using autograd.

    ``n_iter`` is passed to ``range()``, so it has to be an integer despite
    its annotation. Returns ``(x0, x1, f)``: the two coordinates of every
    iterate and the objective value at each of them, as NumPy arrays.
    """
    points, values = [], []
    point = pdn.Tensor(x, requires_grad=True, device=device)

    for _ in range(n_iter):
        quadratic = point @ A @ point / 2
        obj = quadratic + b @ point
        obj.backward()

        # Record the iterate and its objective value before stepping.
        points.append(point.numpy())
        values.append(obj.item())
        with point.device:
            point.data -= lr * point.grad
        point.zero_grad()

    coords = np.array(points)
    objective = np.array(values)
    return coords[:, 0], coords[:, 1], objective
34 | 
35 | 
def manual_grad(x, lr: float, n_iter: int):
    """Minimise x^T A x / 2 + b^T x by gradient descent with a hand-derived gradient.

    Parameters
    ----------
    x : array-like
        Starting point. It is copied on entry, so the caller's array is not
        modified by the in-place updates below.
    lr : float
        Step size.
    n_iter : int
        Number of gradient steps (used as the ``range()`` bound).

    Returns
    -------
    ``(x0, x1, f)``: the two coordinates of every iterate and the objective
    value at each of them, as NumPy arrays.
    """
    # Private copy: `x -= ...` below updates the array in place and would
    # otherwise overwrite the caller's starting point.
    x = np.array(x)
    Xs, ys = [], []

    for _ in range(n_iter):
        obj = x @ A @ x / 2 + b @ x

        Xs.append(x.copy())
        ys.append(obj.item())

        # Gradient of the quadratic is A x + b (the module-level A is
        # symmetric).
        grad = A.numpy() @ x + b.numpy()
        x -= lr * grad

    Xs, ys = np.array(Xs), np.array(ys)
    return Xs[:, 0], Xs[:, 1], ys
50 | 
51 | 
plt.rcParams.update({
    'font.sans-serif': ['Times New Roman'],
    'mathtext.fontset': 'stix',
})

fig = plt.figure(figsize=(8, 4))


def _draw_trajectory(position, optimizer, color, title):
    """Add one 3-D panel showing the descent path produced by ``optimizer``.

    ``optimizer`` is called as ``optimizer(x, .1, 30)`` after the subplot has
    been created; the new axes object is returned.
    """
    axes = fig.add_subplot(1, 2, position, projection='3d')
    axes.plot3D(
        *optimizer(x, .1, 30),
        color=color,
        lw=0.7,
        label=r'$f(x)=\frac{1}{2}x^\top Ax+b^\top x$',
        marker='^',
        markersize=6,
    )

    axes.tick_params(direction='in')
    axes.set_xlim(.45, .60)
    axes.set_ylim(-.8, 0)
    axes.set_zlim(-.8, -.3)
    axes.set_xticks([.45, .5, .55, .6])
    axes.set_yticks([-.8, -.6, -.4, -.2, 0])

    # Title and legend go to the current axes, i.e. the panel just added.
    plt.title(title)
    plt.legend(prop={'size': 11})
    return axes


ax1 = _draw_trajectory(1, auto_grad, 'red', 'Gradient descent by AutoGrad')
ax1 = _draw_trajectory(2, manual_grad, 'blue',
                       'Gradient descent by Manual calculation')

plt.savefig("imgs/ad2d.png")
97 | 


--------------------------------------------------------------------------------
/pydynet/nn/modules/pool.py:
--------------------------------------------------------------------------------
 1 | from .module import Module
 2 | from .. import functional as F
 3 | 
 4 | 
 5 | class MaxPool1d(Module):
 6 | 
 7 |     def __init__(self, kernel_size: int, stride: int, padding: int) -> None:
 8 |         super().__init__()
 9 |         self.kernel_size = kernel_size
10 |         self.stride = stride
11 |         self.padding = padding
12 | 
13 |     def forward(self, x):
14 |         return F.max_pool1d(x, self.kernel_size, self.stride, self.padding)
15 | 
16 |     def __repr__(self) -> str:
17 |         return "{}(kernel_size={}, stride={}, padding={})".format(
18 |             self.__class__.__name__,
19 |             self.kernel_size,
20 |             self.stride,
21 |             self.padding,
22 |         )
23 | 
24 | 
class AvgPool1d(Module):
    '''1-D average pooling layer; ``forward`` delegates to ``F.avg_pool1d``.

    Parameters
    ----------
    kernel_size, stride, padding : int
        Stored on the instance and passed to ``F.avg_pool1d`` in this order.
    '''

    def __init__(self, kernel_size: int, stride: int, padding: int) -> None:
        super().__init__()
        self.kernel_size, self.stride, self.padding = (kernel_size, stride,
                                                       padding)

    def forward(self, x):
        pool_args = (self.kernel_size, self.stride, self.padding)
        return F.avg_pool1d(x, *pool_args)

    def __repr__(self) -> str:
        template = "{}(kernel_size={}, stride={}, padding={})"
        fields = (self.__class__.__name__, self.kernel_size, self.stride,
                  self.padding)
        return template.format(*fields)
43 | 
44 | 
class MaxPool2d(Module):
    '''2-D max pooling layer; ``forward`` delegates to ``F.max_pool2d``.

    Parameters
    ----------
    kernel_size, stride, padding : int
        Stored on the instance and passed to ``F.max_pool2d`` in this order.
    '''

    def __init__(self, kernel_size: int, stride: int, padding: int) -> None:
        super().__init__()
        self.kernel_size, self.stride, self.padding = (kernel_size, stride,
                                                       padding)

    def forward(self, x):
        pool_args = (self.kernel_size, self.stride, self.padding)
        return F.max_pool2d(x, *pool_args)

    def __repr__(self) -> str:
        template = "{}(kernel_size={}, stride={}, padding={})"
        fields = (self.__class__.__name__, self.kernel_size, self.stride,
                  self.padding)
        return template.format(*fields)
63 | 
64 | 
class AvgPool2d(Module):
    '''2-D average pooling layer; ``forward`` delegates to ``F.avg_pool2d``.

    Parameters
    ----------
    kernel_size, stride, padding : int
        Stored on the instance and passed to ``F.avg_pool2d`` in this order.
    '''

    def __init__(self, kernel_size: int, stride: int, padding: int) -> None:
        super().__init__()
        self.kernel_size, self.stride, self.padding = (kernel_size, stride,
                                                       padding)

    def forward(self, x):
        pool_args = (self.kernel_size, self.stride, self.padding)
        return F.avg_pool2d(x, *pool_args)

    def __repr__(self) -> str:
        template = "{}(kernel_size={}, stride={}, padding={})"
        fields = (self.__class__.__name__, self.kernel_size, self.stride,
                  self.padding)
        return template.format(*fields)
83 | 


--------------------------------------------------------------------------------
/pydynet/special.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from .core import Tensor
  3 | 
  4 | 
  5 | # 一些包装的特殊矩阵
  6 | def zeros(shape, dtype=None, device=None, requires_grad=False):
  7 |     '''全0张量
  8 |     
  9 |     Parameters
 10 |     ----------
 11 |     shape : 
 12 |         张量形状
 13 |     require_grad : bool, default=False
 14 |         是否需要求导
 15 |     '''
 16 |     return Tensor(np.zeros(shape),
 17 |                   dtype=dtype,
 18 |                   device=device,
 19 |                   requires_grad=requires_grad)
 20 | 
 21 | 
 22 | def ones(shape, dtype=None, device=None, requires_grad=False):
 23 |     '''全1张量
 24 |     
 25 |     Parameters
 26 |     ----------
 27 |     shape : 
 28 |         张量形状
 29 |     require_grad : bool, default=False
 30 |         是否需要求导
 31 |     '''
 32 |     return Tensor(np.ones(shape),
 33 |                   dtype=dtype,
 34 |                   device=device,
 35 |                   requires_grad=requires_grad)
 36 | 
 37 | 
 38 | def randn(*shape, dtype=None, device=None, requires_grad=False):
 39 |     '''0-1正态分布张量
 40 |     
 41 |     Parameters
 42 |     ----------
 43 |     *shape : 
 44 |         张量形状
 45 |     require_grad : bool, default=False
 46 |         是否需要求导
 47 |     '''
 48 |     return Tensor(np.random.randn(*shape),
 49 |                   dtype=dtype,
 50 |                   device=device,
 51 |                   requires_grad=requires_grad)
 52 | 
 53 | 
 54 | def rand(*shape, dtype=None, device=None, requires_grad=False):
 55 |     '''[0, 1)均匀分布张量
 56 |     
 57 |     Parameters
 58 |     ----------
 59 |     *shape : 
 60 |         张量形状
 61 |     require_grad : bool, default=False
 62 |         是否需要求导
 63 |     '''
 64 |     return Tensor(np.random.rand(*shape),
 65 |                   dtype=dtype,
 66 |                   device=device,
 67 |                   requires_grad=requires_grad)
 68 | 
 69 | 
 70 | def uniform(low: float,
 71 |             high: float,
 72 |             shape=None,
 73 |             dtype=None,
 74 |             device=None,
 75 |             requires_grad=False):
 76 |     '''均匀分布张量
 77 |     
 78 |     Parameters
 79 |     ----------
 80 |     low : float
 81 |         均匀分布下界;
 82 |     high : float
 83 |         均匀分布下界;
 84 |     *shape : 
 85 |         张量形状
 86 |     require_grad : bool, default=False
 87 |         是否需要求导
 88 |     '''
 89 |     return Tensor(np.random.uniform(low, high, size=shape),
 90 |                   dtype=dtype,
 91 |                   device=device,
 92 |                   requires_grad=requires_grad)
 93 | 
 94 | 
 95 | def empty(shape, dtype=None, device=None, requires_grad=False):
 96 |     return Tensor(np.empty(shape, dtype=dtype),
 97 |                   dtype=dtype,
 98 |                   device=device,
 99 |                   requires_grad=requires_grad)
100 | 


--------------------------------------------------------------------------------
/pydynet/nn/modules/linear.py:
--------------------------------------------------------------------------------
 1 | from .module import Module
 2 | from ..parameter import Parameter
 3 | from .. import init, functional as F
 4 | from ...core import Tensor
 5 | from ...special import empty
 6 | from ...cuda import Device
 7 | from ...autograd import no_grad
 8 | 
 9 | import math
10 | 
11 | 
class Linear(Module):
    """Fully connected layer that delegates its computation to ``F.linear``.

    The weight is stored with shape ``(in_features, out_features)`` and the
    optional bias with shape ``(out_features,)``.

    Parameters
    ----------
    in_features, out_features : int
        Sizes used to build the weight (and bias) parameters.
    bias : bool, default=True
        When False, ``self.bias`` is ``None``.
    device, dtype :
        Forwarded to the parameter allocation (``device`` through ``Device``).
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        kwargs = {"device": Device(device), "dtype": dtype}
        self.weight = Parameter(
            empty((self.in_features, self.out_features), **kwargs))
        self.bias = Parameter(empty(self.out_features, **
                                    kwargs)) if bias else None
        # Go through the historical name so subclasses overriding either
        # spelling still get their initialiser called.
        self.reset_paramters()

    def reset_parameters(self):
        """Re-initialise weight (Kaiming uniform) and bias (uniform)."""
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = init._calculate_fan(self.weight)
            bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
            init.uniform_(self.bias, -bound, bound)

    def reset_paramters(self):
        """Backward-compatible (misspelled) alias of ``reset_parameters``."""
        self.reset_parameters()

    def forward(self, x: Tensor):
        return F.linear(x, self.weight, self.bias)

    def __repr__(self) -> str:
        return "Linear(in_features={}, out_features={}, bias={})".format(
            self.in_features, self.out_features, self.bias is not None)
45 | 
46 | 
class Embedding(Module):
    """Lookup table of shape ``(num_embeddings, embedding_dim)``.

    Parameters
    ----------
    num_embeddings : int
        Number of rows in the table (stored as ``self.num_embedding``).
    embedding_dim : int
        Size of each embedding vector.
    padding_idx : optional
        Row index passed to ``F.embedding``; that row is zeroed when the
        parameters are (re)initialised.
    device, dtype :
        Forwarded to the parameter allocation (``device`` through ``Device``).
    """

    def __init__(
        self,
        num_embeddings: int,
        embedding_dim: int,
        padding_idx=None,
        device=None,
        dtype=None,
    ) -> None:
        super().__init__()
        self.num_embedding = num_embeddings
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx

        kwargs = {"device": Device(device), "dtype": dtype}
        self.weight = Parameter(
            empty((self.num_embedding, self.embedding_dim), **kwargs))
        # ``empty`` leaves arbitrary memory in the table; give it real values.
        self.reset_parameters()

    def forward(self, x: Tensor):
        return F.embedding(x, self.weight, self.padding_idx)

    def reset_parameters(self) -> None:
        """Fill the table from N(0, 1) and zero the padding row, if any."""
        init.normal_(self.weight)
        self._fill_padding_idx_with_zero()

    def _fill_padding_idx_with_zero(self) -> None:
        if self.padding_idx is not None:
            with no_grad():
                # Write into the underlying array; indexing the tensor first
                # would only modify a temporary object.
                self.weight.data[self.padding_idx] = 0
80 | 


--------------------------------------------------------------------------------
/pydynet/nn/init.py:
--------------------------------------------------------------------------------
 1 | from ..core import Tensor
 2 | from ..autograd import no_grad
 3 | import math
 4 | 
 5 | 
 6 | def calculate_gain(nonlinearity: str, param: float = None) -> float:
 7 |     return {
 8 |         "linear": 1,
 9 |         "conv1d": 1,
10 |         "conv2d": 1,
11 |         "sigmoid": 1,
12 |         "tanh": 5 / 3,
13 |         "relu": math.sqrt(2.),
14 |         "leaky_relu":
15 |         math.sqrt(2. / (1 + (param if param != None else 0.01)**2))
16 |     }[nonlinearity]
17 | 
18 | 
def _calculate_fan(tensor: "Tensor"):
    """Return ``(fan_in, fan_out)`` for a weight tensor.

    2-D weights are read as ``(in_features, out_features)``, the layout
    used by ``Linear``. Higher-rank weights are read as
    ``(out_channels, in_channels, *kernel)``, the layout used by the
    convolution layers, and both fans are scaled by the kernel size.
    """
    assert tensor.ndim >= 2, "fan in/out needs at least a 2-D tensor"
    if tensor.ndim == 2:
        fan_in, fan_out = tensor.shape
        return fan_in, fan_out
    out_channels, in_channels = tensor.shape[:2]
    receptive_field_size = math.prod(tensor.shape[2:])
    return (in_channels * receptive_field_size,
            out_channels * receptive_field_size)
27 | 
28 | 
@no_grad()
def uniform_(tensor: Tensor, a=0., b=1.) -> Tensor:
    """Overwrite ``tensor`` in place with uniform samples between ``a`` and ``b``."""
    samples = tensor.xp.random.uniform(a, b, tensor.shape)
    tensor.data[...] = samples
    return tensor
33 | 
34 | 
@no_grad()
def normal_(tensor: Tensor, mean=0., std=1.) -> Tensor:
    """Overwrite ``tensor`` in place with normal samples (``mean``, ``std``)."""
    samples = tensor.xp.random.normal(mean, std, size=tensor.shape)
    tensor.data[...] = samples
    return tensor
39 | 
40 | 
@no_grad()
def constant_(tensor: Tensor, val: float) -> Tensor:
    """Set every element of ``tensor`` to ``val`` in place and return it."""
    buffer = tensor.data
    buffer[...] = val
    return tensor
45 | 
46 | 
def ones_(tensor: Tensor) -> Tensor:
    """Fill ``tensor`` with ones in place (via ``constant_``)."""
    filled = constant_(tensor, 1.)
    return filled
49 | 
50 | 
def zeros_(tensor: Tensor) -> Tensor:
    """Fill ``tensor`` with zeros in place (via ``constant_``)."""
    filled = constant_(tensor, 0.)
    return filled
53 | 
54 | 
def xavier_uniform_(tensor: Tensor, gain: float = 1.) -> Tensor:
    """Xavier/Glorot uniform init: U(-b, b), b = gain * sqrt(6 / (fan_in + fan_out))."""
    fans = _calculate_fan(tensor)
    fan_total = fans[0] + fans[1]
    bound = gain * math.sqrt(6. / fan_total)
    return uniform_(tensor, -bound, bound)
59 | 
60 | 
def xavier_normal_(tensor: Tensor, gain: float = 1.) -> Tensor:
    """Xavier/Glorot normal init: N(0, std), std = gain * sqrt(2 / (fan_in + fan_out))."""
    fans = _calculate_fan(tensor)
    fan_total = fans[0] + fans[1]
    std = gain * math.sqrt(2 / fan_total)
    return normal_(tensor, std=std)
65 | 
66 | 
def kaiming_uniform_(tensor: Tensor,
                     a: float = 0.,
                     mode='fan_in',
                     nonlinearity='relu') -> Tensor:
    """Kaiming/He uniform init: U(-b, b) with b = gain * sqrt(3 / fan).

    ``mode`` selects which fan is used ("fan_in" or "fan_out"); any other
    value raises ``KeyError``. ``a`` is passed to ``calculate_gain``.
    """
    fan_in, fan_out = _calculate_fan(tensor)
    fan_by_mode = dict(fan_in=fan_in, fan_out=fan_out)
    fan = fan_by_mode[mode]
    bound = calculate_gain(nonlinearity, a) * math.sqrt(3. / fan)
    return uniform_(tensor, -bound, bound)
79 | 
80 | 
def kaiming_normal_(tensor: Tensor,
                    a: float = 0.,
                    mode='fan_in',
                    nonlinearity='relu'):
    """Kaiming/He normal init: N(0, std) with std = gain / sqrt(fan).

    ``mode`` selects which fan is used ("fan_in" or "fan_out"); any other
    value raises ``KeyError``. ``a`` is passed to ``calculate_gain``.
    """
    fan_in, fan_out = _calculate_fan(tensor)
    fan_by_mode = dict(fan_in=fan_in, fan_out=fan_out)
    fan = fan_by_mode[mode]
    std = calculate_gain(nonlinearity, a) / math.sqrt(fan)
    return normal_(tensor, std=std)
93 | 


--------------------------------------------------------------------------------
/pydynet/cuda.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import warnings
  3 | 
  4 | try:
  5 |     import cupy as cp
  6 |     cuda_available: bool = True
  7 | except ModuleNotFoundError:
  8 |     warnings.warn(
  9 |         "Cupy is not installed. You can install it with:\n"
 10 |         "  pip install cupy-cuda12x  # or appropriate version for your CUDA",
 11 |         category=UserWarning)
 12 |     cuda_available: bool = False
 13 |     cp = object()
 14 | 
 15 | 
 16 | def is_available() -> bool:
 17 |     return cuda_available
 18 | 
 19 | 
 20 | def device_count() -> int:
 21 |     if is_available():
 22 |         return cp.cuda.runtime.getDeviceCount()
 23 |     else:
 24 |         return 0
 25 | 
 26 | 
 27 | def current_device() -> int:
 28 |     return cp.cuda.runtime.getDevice()
 29 | 
 30 | 
 31 | def set_device(device: int) -> None:
 32 |     return cp.cuda.runtime.setDevice(device)
 33 | 
 34 | 
 35 | class Device:
 36 | 
 37 |     def __init__(self, device=None) -> None:
 38 |         if isinstance(device, str):
 39 |             if device == "cpu":
 40 |                 self.device = "cpu"
 41 |             elif device[:4] == "cuda":
 42 |                 self.device = "cuda"
 43 |                 if len(device) == 4:
 44 |                     device += ':0'
 45 | 
 46 |                 cuda_id = device.split(':')[-1]
 47 |                 if not cuda_id.isdigit():
 48 |                     raise ValueError(f'Wrong cuda id \"{cuda_id}\"!')
 49 | 
 50 |                 self.device_id = int(cuda_id)
 51 |             else:
 52 |                 raise ValueError(f"Unknown device \"{device}\"!")
 53 | 
 54 |         elif isinstance(device, int):
 55 |             self.device = "cuda"
 56 |             self.device_id = device
 57 | 
 58 |         elif device is None:
 59 |             self.device = "cpu"
 60 | 
 61 |         elif isinstance(device, Device):
 62 |             self.device = device.device
 63 |             if self.device != "cpu":
 64 |                 self.device_id = device.device_id
 65 | 
 66 |         if self.device == "cuda":
 67 |             if not is_available():
 68 |                 raise RuntimeError(
 69 |                     "Cuda device is not supported on this system.")
 70 |             self.device = cp.cuda.Device(self.device_id)
 71 |         assert self.device == "cpu" or is_available()
 72 | 
 73 |     def __repr__(self) -> str:
 74 |         if self.device == "cpu":
 75 |             return "Device(type='cpu')"
 76 |         else:
 77 |             return "Device(type='cuda', index={})".format(self.device_id)
 78 | 
 79 |     def __eq__(self, device) -> bool:
 80 |         if not isinstance(device, Device):
 81 |             device = Device(device)
 82 |         if self.device == "cpu":
 83 |             return device.device == "cpu"
 84 |         else:
 85 |             if device.device == "cpu":
 86 |                 return False
 87 |             return self.device == device.device
 88 | 
 89 |     @property
 90 |     def xp(self):
 91 |         return np if self.device == "cpu" else cp
 92 | 
 93 |     def __enter__(self):
 94 |         if self.device != "cpu" and self.device_id != current_device():
 95 |             return self.device.__enter__()
 96 | 
 97 |     def __exit__(self, type, value, trace):
 98 |         if self.device != "cpu" and self.device_id != current_device():
 99 |             return self.device.__exit__(type, value, trace)
100 | 


--------------------------------------------------------------------------------
/pydynet/data.py:
--------------------------------------------------------------------------------
  1 | from numpy.random import permutation
  2 | 
  3 | 
  4 | class Dataset:
  5 | 
  6 |     def __init__(self) -> None:
  7 |         pass
  8 | 
  9 |     def __getitem__(self, index):
 10 |         raise NotImplementedError
 11 | 
 12 |     def __len__(self):
 13 |         raise NotImplementedError
 14 | 
 15 | 
 16 | class Sampler:
 17 | 
 18 |     def __init__(self, dataset: Dataset) -> None:
 19 |         pass
 20 | 
 21 |     def __iter__(self):
 22 |         raise NotImplementedError
 23 | 
 24 | 
 25 | class SequentialSampler(Sampler):
 26 | 
 27 |     def __init__(self, dataset: Dataset) -> None:
 28 |         self.dataset = dataset
 29 | 
 30 |     def __iter__(self):
 31 |         return iter(range(len(self.dataset)))
 32 | 
 33 |     def __len__(self) -> int:
 34 |         return len(self.dataset)
 35 | 
 36 | 
 37 | class RandomSampler(Sampler):
 38 | 
 39 |     def __init__(self, dataset: Dataset) -> None:
 40 |         self.dataset = dataset
 41 | 
 42 |     def __iter__(self):
 43 |         yield from permutation(len(self.dataset)).tolist()
 44 | 
 45 |     def __len__(self):
 46 |         return len(self.dataset)
 47 | 
 48 | 
 49 | class BatchSampler(Sampler):
 50 | 
 51 |     def __init__(self, sampler: Sampler, batch_size: int,
 52 |                  drop_last: bool) -> None:
 53 |         self.sampler = sampler
 54 |         self.batch_size = batch_size
 55 |         self.drop_last = drop_last
 56 | 
 57 |     def __iter__(self):
 58 |         batch = []
 59 |         for idx in self.sampler:
 60 |             batch.append(idx)
 61 |             if len(batch) == self.batch_size:
 62 |                 yield batch
 63 |                 batch = []
 64 |         if len(batch) > 0 and not self.drop_last:
 65 |             yield batch
 66 | 
 67 |     def __len__(self):
 68 |         if self.drop_last:
 69 |             return len(self.sampler) // self.batch_size
 70 |         return (len(self.sampler) + self.batch_size - 1) // self.batch_size
 71 | 
 72 | 
 73 | class _DataLoaderIter:
 74 | 
 75 |     def __init__(self, loader) -> None:
 76 |         self.loader = loader
 77 |         self.sample_iter = iter(self.loader.batch_sampler)
 78 | 
 79 |     def __next__(self):
 80 |         index = next(self.sample_iter)
 81 |         return self.loader.dataset[index]
 82 | 
 83 | 
 84 | class DataLoader:
 85 | 
 86 |     def __init__(self,
 87 |                  dataset: Dataset,
 88 |                  batch_size: int = 1,
 89 |                  shuffle: bool = False,
 90 |                  drop_last: bool = False) -> None:
 91 |         self.dataset = dataset
 92 |         self.batch_size = batch_size
 93 |         self.shuffle = shuffle
 94 |         self.drop_last = drop_last
 95 | 
 96 |         if shuffle:
 97 |             self.sampler = RandomSampler(dataset)
 98 |         else:
 99 |             self.sampler = SequentialSampler(dataset)
100 | 
101 |         self.batch_sampler = BatchSampler(self.sampler, batch_size, drop_last)
102 |         self.batch_size = batch_size
103 |         self.drop_last = drop_last
104 | 
105 |     def __iter__(self):
106 |         return _DataLoaderIter(self)
107 | 
108 | 
def data_loader(X,
                y,
                batch_size: int,
                shuffle: bool = False,
                drop_last: bool = False) -> DataLoader:
    """Build a ``DataLoader`` over paired inputs ``X`` and targets ``y``.

    Each batch is the tuple ``(X[indices], y[indices])``, so both containers
    must support indexing with a list of indices. The dataset length is
    ``len(X)``. ``batch_size``, ``shuffle`` and ``drop_last`` are forwarded
    to ``DataLoader``.
    """

    class TrainSet(Dataset):

        def __init__(self, X, y) -> None:
            self.data = X
            self.target = y

        def __getitem__(self, index):
            return self.data[index], self.target[index]

        def __len__(self):
            return len(self.data)

    return DataLoader(TrainSet(X, y), batch_size, shuffle, drop_last)
124 | 


--------------------------------------------------------------------------------
/llm/llama/infer.py:
--------------------------------------------------------------------------------
 1 | import sys, time, argparse
 2 | from .tokenizer import Tokenizer
 3 | from .model import Llama
 4 | 
 5 | import pydynet as pdn
 6 | import numpy as np
 7 | 
 8 | 
 9 | @pdn.no_grad()
10 | def load_model(llama: Llama, model_path: str) -> Llama:
11 |     weight = np.load(model_path)
12 | 
13 |     llama.tok_embedding.weight.data[...] = weight['model.embed_tokens.weight']
14 |     llama.lm_head.weight.data[...] = weight['lm_head.weight'].T
15 | 
16 |     for i in range(llama.n_layers):
17 |         (
18 |             llama.layers[i].attention.Q.weight.data[...],
19 |             llama.layers[i].attention.K.weight.data[...],
20 |             llama.layers[i].attention.V.weight.data[...],
21 |             llama.layers[i].attention.O.weight.data[...],
22 |             llama.layers[i].ffn.up.weight.data[...],
23 |             llama.layers[i].ffn.gate.weight.data[...],
24 |             llama.layers[i].ffn.down.weight[...],
25 |             llama.layers[i].input_norm.weight.data[...],
26 |             llama.layers[i].post_attn_norm.weight.data[...],
27 |         ) = (
28 |             weight[f'model.layers.{i}.self_attn.q_proj.weight'].T,
29 |             weight[f'model.layers.{i}.self_attn.k_proj.weight'].T,
30 |             weight[f'model.layers.{i}.self_attn.v_proj.weight'].T,
31 |             weight[f'model.layers.{i}.self_attn.o_proj.weight'].T,
32 |             weight[f'model.layers.{i}.mlp.up_proj.weight'].T,
33 |             weight[f'model.layers.{i}.mlp.gate_proj.weight'].T,
34 |             weight[f'model.layers.{i}.mlp.down_proj.weight'].T,
35 |             weight[f'model.layers.{i}.input_layernorm.weight'],
36 |             weight[f'model.layers.{i}.post_attention_layernorm.weight'],
37 |         )
38 | 
39 |         llama.norm.weight.data[...] = weight['model.norm.weight']
40 | 
41 |     return llama
42 | 
43 | 
if __name__ == '__main__':
    # Command-line demo: load the model, then stream a completion of the
    # prompt to stdout and report throughput.
    parser = argparse.ArgumentParser(
        description="Prompt input, e.g. There was a boy")
    parser.add_argument("--prompt", type=str, default='There was a boy')
    parser.add_argument("--cuda", action='store_true')
    args = parser.parse_args()

    dim: int = 288  # D
    n_layers: int = 6
    n_heads: int = 6
    vocab_size: int = 32000  # VS
    max_seq_len: int = 1024  # M
    max_new_tokens: int = 1024
    max_batch_size: int = 1
    datatype = np.float32

    tokenizer = Tokenizer("llm/llama/data/tokenizer.model.np")
    model = load_model(
        Llama(vocab_size,
              dim,
              n_heads,
              768,
              max_seq_len,
              max_batch_size,
              n_layers,
              dtype=datatype), "llm/llama/data/stories15M.model.npz")

    # If cuda is available
    # NOTE(review): the device index 2 is hard-coded; confirm it is intended
    # for machines with fewer than three GPUs.
    if args.cuda and pdn.cuda.is_available():
        model: Llama = model.to('cuda:2')

    model.eval()
    with pdn.no_grad():
        print(f"\n{args.prompt}", end="")
        input_ids = np.array([tokenizer.encode(args.prompt)])

        # L starts at the prompt length and grows with each generated step.
        _, L = input_ids.shape
        start = time.time()
        for token_ids in model.generate(input_ids, max_new_tokens):
            L += 1
            output_id = token_ids[0].numpy().tolist()

            # Stop at an end-of-sequence or begin-of-sequence token.
            if output_id[-1] in [tokenizer.eos_id, tokenizer.bos_id]:
                break
            print(tokenizer.decode(output_id), end="")
            sys.stdout.flush()
        elapsed = time.time() - start
        print(
            f"\n\nToken count: {L}, elapsed: {elapsed:.2f}s, {round(L / elapsed)} tokens/s"
        )
94 | 


--------------------------------------------------------------------------------
/examples/pytorch/ts_prediction.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import matplotlib.pyplot as plt
  3 | from sklearn.model_selection import train_test_split
  4 | from tqdm import tqdm
  5 | 
  6 | import torch
  7 | import pydynet.nn as nn
  8 | from pydynet.optim import Adam
  9 | 
 10 | 
 11 | def windowize(y, input_len, horizon=1, stride=1, step=1):
 12 | 
 13 |     y = np.asarray(y)
 14 |     max_i = len(y) - (input_len + horizon) * step + step
 15 |     idx_inputs = []
 16 |     idx_targets = []
 17 |     for i in range(0, max_i, stride):
 18 |         inp_idx = i + np.arange(0, input_len * step, step)
 19 |         tgt_idx = i + input_len * step + np.arange(0, horizon * step, step)
 20 |         idx_inputs.append(inp_idx)
 21 |         idx_targets.append(tgt_idx)
 22 |     X = y[np.array(idx_inputs)]
 23 |     Y = y[np.array(idx_targets)]
 24 |     return (
 25 |         torch.tensor(X[..., np.newaxis]),
 26 |         torch.tensor(Y),
 27 |     )
 28 | 
 29 | 
TIME_STEP = 40  # number of time steps in each RNN input window
INPUT_SIZE = 1  # RNN input feature dimension
H_SIZE = 32  # number of RNN hidden units
EPOCHS = 50  # total number of training iterations
h_state = None  # initial hidden state passed to the model
 35 | 
 36 | 
 37 | def f(t):
 38 |     return np.sin(np.pi * t) + 0.5 * np.cos(2 * np.pi * t)
 39 | 
 40 | 
 41 | steps = np.arange(0, 100, .05)
 42 | X, Y = windowize(f(steps), input_len=TIME_STEP, horizon=1, stride=1, step=1)
 43 | 
 44 | X_train, X_test, Y_train, Y_test = train_test_split(
 45 |     X,
 46 |     Y,
 47 |     test_size=0.2,
 48 |     random_state=42,
 49 | )
 50 | 
 51 | 
 52 | class RNN(nn.Module):
 53 | 
 54 |     def __init__(self):
 55 |         super(RNN, self).__init__()
 56 |         self.rnn = nn.GRU(
 57 |             input_size=INPUT_SIZE,
 58 |             hidden_size=H_SIZE,
 59 |             num_layers=1,
 60 |             batch_first=True,
 61 |             dtype=np.float32,
 62 |         )
 63 |         self.out = nn.Linear(H_SIZE, 1)
 64 | 
 65 |     def forward(self, x, h_state):
 66 |         _, h_state = self.rnn(x, h_state)
 67 |         out = self.out(h_state[:, self.rnn.num_layers - 1, :])
 68 |         return out
 69 | 
 70 | 
# Model, optimiser and loss used for the whole run.
rnn = RNN()
optimizer = Adam(rnn.parameters(), lr=0.01)
criterion = nn.MSELoss()

# NOTE(review): nothing is appended to this list in the code shown here.
loss_list = []

# Matplotlib styling for the saved figure.
plt.rcParams['font.sans-serif'] = ['Times New Roman']
plt.rcParams['mathtext.fontset'] = 'stix'
plt.rcParams['xtick.direction'] = 'in'
plt.rcParams['ytick.direction'] = 'in'
plt.rcParams['axes.linewidth'] = 0.5

bar = tqdm(range(EPOCHS))
# A shorter stretch of the same function, used only for plotting.
visual_steps = np.arange(0, 10, .05)
visual_X, visual_Y = windowize(f(visual_steps),
                               TIME_STEP,
                               horizon=1,
                               stride=1,
                               step=1)

for step in bar:

    # One optimisation step on the full training set.
    rnn.train()
    prediction = rnn(X_train, h_state)
    train_loss = criterion(prediction, Y_train)

    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    # A new figure is drawn and saved on every iteration.
    plt.figure(figsize=(5, 3))
    plt.grid()

    # Evaluation and plotting run without gradient tracking.
    rnn.eval()
    with torch.no_grad():
        test_loss = criterion(rnn(X_test, h_state), Y_test)

        plt.plot(visual_steps[TIME_STEP:],
                 visual_Y.numpy(),
                 'r-',
                 lw=0.7,
                 label=r'$f(x)=\sin(\pi x)+\cos(2\pi x)/2$')
        plt.plot(
            visual_steps[TIME_STEP:],
            rnn(visual_X, h_state).numpy(),
            'b-.',
            lw=0.7,
            label='Prediction',
        )

    plt.xticks([4, 6, 8, 10])
    plt.yticks([-1.6, -.8, 0, .8])

    plt.legend(loc=1)
    plt.ylim(-1.6, 0.8)
    plt.xlim(visual_steps[TIME_STEP], 10)
    plt.title('Prediction with GRU')
    plt.tight_layout()
    # The same file is overwritten each iteration.
    plt.savefig("imgs/rnn.png")
    plt.close()

    # Show the latest losses next to the progress bar.
    bar.set_postfix(
        train_loss="{:.5f}".format(train_loss.item()),
        test_loss="{:.5f}".format(test_loss.item()),
    )
136 | 


--------------------------------------------------------------------------------
/pydynet/nn/modules/conv.py:
--------------------------------------------------------------------------------
  1 | from .module import Module
  2 | from ..parameter import Parameter
  3 | from .. import init
  4 | from .. import functional as F
  5 | from ...special import empty
  6 | from ...cuda import Device
  7 | 
  8 | import math
  9 | 
 10 | 
 11 | class Conv1d(Module):
 12 | 
 13 |     def __init__(
 14 |         self,
 15 |         in_channels: int,
 16 |         out_channels: int,
 17 |         kernel_size: int,
 18 |         stride: int = 1,
 19 |         padding: int = 0,
 20 |         bias: bool = True,
 21 |         device=None,
 22 |         dtype=None,
 23 |     ) -> None:
 24 |         super().__init__()
 25 |         kwargs = {"device": Device(device), "dtype": dtype}
 26 |         self.in_channels = in_channels
 27 |         self.out_channels = out_channels
 28 |         self.kernel_size = kernel_size
 29 |         self.padding = padding
 30 |         self.stride = stride
 31 |         self.weight = Parameter(
 32 |             empty((self.out_channels, self.in_channels, self.kernel_size),
 33 |                   **kwargs))
 34 |         self.bias = Parameter(empty(
 35 |             (1, self.out_channels, 1), **kwargs)) if bias else None
 36 |         self.reset_parameters()
 37 | 
 38 |     def reset_parameters(self):
 39 |         init.kaiming_uniform_(self.weight, a=math.sqrt(5))
 40 |         if self.bias is not None:
 41 |             fan_in, _ = init._calculate_fan(self.weight)
 42 |             if fan_in != 0:
 43 |                 bound = 1 / math.sqrt(fan_in)
 44 |                 init.uniform_(self.bias, -bound, bound)
 45 | 
 46 |     def forward(self, x):
 47 |         conv1d = F.conv1d(x, self.weight, self.padding, self.stride)
 48 |         if self.bias is not None:
 49 |             return conv1d + self.bias
 50 |         return conv1d
 51 | 
 52 |     def __repr__(self) -> str:
 53 |         return "{}(in_channels={}, out_channels={}, kernel_size={}, padding={}, stride={}, bias={})".format(
 54 |             self.__class__.__name__,
 55 |             self.in_channels,
 56 |             self.out_channels,
 57 |             self.kernel_size,
 58 |             self.padding,
 59 |             self.stride,
 60 |             self.bias is not None,
 61 |         )
 62 | 
 63 | 
 64 | class Conv2d(Module):
 65 | 
 66 |     def __init__(
 67 |         self,
 68 |         in_channels: int,
 69 |         out_channels: int,
 70 |         kernel_size: int,
 71 |         stride: int = 1,
 72 |         padding: int = 0,
 73 |         bias: bool = True,
 74 |         device=None,
 75 |         dtype=None,
 76 |     ) -> None:
 77 |         super().__init__()
 78 |         kwargs = {"device": Device(device), "dtype": dtype}
 79 |         self.in_channels = in_channels
 80 |         self.out_channels = out_channels
 81 |         self.kernel_size = kernel_size
 82 |         self.padding = padding
 83 |         self.stride = stride
 84 |         self.weight = Parameter(
 85 |             empty((self.out_channels, self.in_channels, self.kernel_size,
 86 |                    self.kernel_size), **kwargs))
 87 |         self.bias = Parameter(empty(
 88 |             (1, self.out_channels, 1, 1), **kwargs)) if bias else None
 89 |         self.reset_parameters()
 90 | 
 91 |     def reset_parameters(self):
 92 |         init.kaiming_uniform_(self.weight, a=math.sqrt(5))
 93 |         if self.bias is not None:
 94 |             fan_in, _ = init._calculate_fan(self.weight)
 95 |             if fan_in != 0:
 96 |                 bound = 1 / math.sqrt(fan_in)
 97 |                 init.uniform_(self.bias, -bound, bound)
 98 | 
 99 |     def forward(self, x):
100 |         conv2d = F.conv2d(x, self.weight, self.padding, self.stride)
101 |         if self.bias is not None:
102 |             return conv2d + self.bias
103 |         return conv2d
104 | 
105 |     def __repr__(self) -> str:
106 |         return "{}(in_channels={}, out_channels={}, kernel_size={}, padding={}, stride={}, bias={})".format(
107 |             self.__class__.__name__,
108 |             self.in_channels,
109 |             self.out_channels,
110 |             self.kernel_size,
111 |             self.padding,
112 |             self.stride,
113 |             self.bias is not None,
114 |         )
115 | 


--------------------------------------------------------------------------------
/examples/pydynet/ts_prediction.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import matplotlib.pyplot as plt
  3 | from sklearn.model_selection import train_test_split
  4 | from tqdm import tqdm
  5 | 
  6 | import pydynet as pdn
  7 | from pydynet import Tensor
  8 | import pydynet.nn as nn
  9 | from pydynet.optim import Adam
 10 | 
 11 | 
 12 | def windowize(y, input_len, horizon=1, stride=1, step=1):
 13 | 
 14 |     y = np.asarray(y)
 15 |     max_i = len(y) - (input_len + horizon) * step + step
 16 |     idx_inputs = []
 17 |     idx_targets = []
 18 |     for i in range(0, max_i, stride):
 19 |         inp_idx = i + np.arange(0, input_len * step, step)
 20 |         tgt_idx = i + input_len * step + np.arange(0, horizon * step, step)
 21 |         idx_inputs.append(inp_idx)
 22 |         idx_targets.append(tgt_idx)
 23 |     X = y[np.array(idx_inputs)]
 24 |     Y = y[np.array(idx_targets)]
 25 |     return (
 26 |         Tensor(X[..., np.newaxis], dtype=np.float32),
 27 |         Tensor(Y, dtype=np.float32),
 28 |     )
 29 | 
 30 | 
TIME_STEP = 40  # number of time steps in each RNN input window
INPUT_SIZE = 1  # RNN input feature dimension
H_SIZE = 32  # number of RNN hidden units
EPOCHS = 50  # total number of training iterations
h_state = None  # initial hidden state passed to the model
 36 | 
 37 | 
 38 | def f(t):
 39 |     return np.sin(np.pi * t) + 0.5 * np.cos(2 * np.pi * t)
 40 | 
 41 | 
 42 | steps = np.arange(0, 100, .05)
 43 | X, Y = windowize(f(steps), input_len=TIME_STEP, horizon=1, stride=1, step=1)
 44 | 
 45 | X_train, X_test, Y_train, Y_test = train_test_split(
 46 |     X,
 47 |     Y,
 48 |     test_size=0.2,
 49 |     random_state=42,
 50 | )
 51 | 
 52 | 
 53 | class RNN(nn.Module):
 54 | 
 55 |     def __init__(self):
 56 |         super(RNN, self).__init__()
 57 |         self.rnn = nn.GRU(
 58 |             input_size=INPUT_SIZE,
 59 |             hidden_size=H_SIZE,
 60 |             num_layers=1,
 61 |             batch_first=True,
 62 |             dtype=np.float32,
 63 |         )
 64 |         self.out = nn.Linear(H_SIZE, 1, dtype=np.float32)
 65 | 
 66 |     def forward(self, x, h_state):
 67 |         _, h_state = self.rnn(x, h_state)
 68 |         out = self.out(h_state[:, self.rnn.num_layers - 1, :])
 69 |         return out
 70 | 
 71 | 
# Model, optimiser and loss used for the whole run.
rnn = RNN()
optimizer = Adam(rnn.parameters(), lr=0.01)
criterion = nn.MSELoss()

# NOTE(review): nothing is appended to this list in the code shown here.
loss_list = []

# Matplotlib styling for the saved figure.
plt.rcParams['font.sans-serif'] = ['Times New Roman']
plt.rcParams['mathtext.fontset'] = 'stix'
plt.rcParams['xtick.direction'] = 'in'
plt.rcParams['ytick.direction'] = 'in'
plt.rcParams['axes.linewidth'] = 0.5

bar = tqdm(range(EPOCHS))
# A shorter stretch of the same function, used only for plotting.
visual_steps = np.arange(0, 10, .05)
visual_X, visual_Y = windowize(f(visual_steps),
                               TIME_STEP,
                               horizon=1,
                               stride=1,
                               step=1)

for step in bar:

    # One optimisation step on the full training set.
    rnn.train()
    prediction = rnn(X_train, h_state)
    train_loss = criterion(prediction, Y_train)

    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    # A new figure is drawn and saved on every iteration.
    plt.figure(figsize=(5, 3))
    plt.grid()

    # Evaluation and plotting run without gradient tracking.
    rnn.eval()
    with pdn.no_grad():
        test_loss = criterion(rnn(X_test, h_state), Y_test)

        plt.plot(visual_steps[TIME_STEP:],
                 visual_Y.numpy(),
                 'r-',
                 lw=0.7,
                 label=r'$f(x)=\sin(\pi x)+\cos(2\pi x)/2$')
        plt.plot(
            visual_steps[TIME_STEP:],
            rnn(visual_X, h_state).numpy(),
            'b-.',
            lw=0.7,
            label='Prediction',
        )

    plt.xticks([4, 6, 8, 10])
    plt.yticks([-1.6, -.8, 0, .8])

    plt.legend(loc=1)
    plt.ylim(-1.6, 0.8)
    plt.xlim(visual_steps[TIME_STEP], 10)
    plt.title('Prediction with GRU')
    plt.tight_layout()
    # The same file is overwritten each iteration.
    plt.savefig("imgs/rnn.png")
    plt.close()

    # Show the latest losses next to the progress bar.
    bar.set_postfix(
        train_loss="{:.5f}".format(train_loss.item()),
        test_loss="{:.5f}".format(test_loss.item()),
    )
137 | 


--------------------------------------------------------------------------------
/pydynet/nn/modules/module.py:
--------------------------------------------------------------------------------
  1 | from collections import OrderedDict
  2 | 
  3 | from ..parameter import Parameter
  4 | from ...core import Tensor
  5 | from ...autograd import set_grad_enabled
  6 | from ...cuda import Device, current_device
  7 | 
  8 | 
  9 | class Module:
 10 | 
 11 |     def __init__(self) -> None:
 12 |         self._train = True
 13 |         self.device = Device("cpu")
 14 |         self._parameters = OrderedDict()
 15 | 
 16 |     def __call__(self, *x) -> Tensor:
 17 |         return self.forward(*x)
 18 | 
 19 |     def __setattr__(self, __name: str, __value) -> None:
 20 |         self.__dict__[__name] = __value
 21 |         if isinstance(__value, Parameter):
 22 |             self._parameters[__name] = __value
 23 |         if isinstance(__value, Module):
 24 |             for key in __value._parameters:
 25 |                 self._parameters[__name + "." + key] = __value._parameters[key]
 26 | 
 27 |     def __repr__(self) -> str:
 28 |         module_list = [
 29 |             module for module in self.__dict__.items()
 30 |             if isinstance(module[1], Module)
 31 |         ]
 32 |         return "{}(\n{}\n)".format(
 33 |             self.__class__.__name__,
 34 |             "\n".join([
 35 |                 "{:>10} : {}".format(module_name, module)
 36 |                 for module_name, module in module_list
 37 |             ]),
 38 |         )
 39 | 
 40 |     def parameters(self):
 41 |         for param in self._parameters.values():
 42 |             if param.requires_grad:
 43 |                 yield param
 44 | 
 45 |     def train(self, mode: bool = True):
 46 |         set_grad_enabled(mode)
 47 |         self.set_module_state(mode)
 48 | 
 49 |     def set_module_state(self, mode: bool):
 50 |         self._train = mode
 51 |         for module in self.__dict__.values():
 52 |             if isinstance(module, Module):
 53 |                 module.set_module_state(mode)
 54 | 
 55 |     def forward(self, x: Tensor) -> Tensor:
 56 |         raise NotImplementedError
 57 | 
 58 |     def eval(self):
 59 |         return self.train(False)
 60 | 
 61 |     def to(self, device):
 62 |         if not isinstance(device, Device):
 63 |             device = Device(device)
 64 |         if self.device != device:
 65 |             self.move(device)
 66 |         return self
 67 | 
 68 |     def move(self, device):
 69 |         self.device = device
 70 |         for module in self.__dict__.values():
 71 |             if isinstance(module, Module):
 72 |                 module.move(device)
 73 |             if isinstance(module, Parameter):
 74 |                 module.to(device)
 75 | 
 76 |     def cuda(self):
 77 |         return self.to(current_device())
 78 | 
 79 |     def cpu(self):
 80 |         return self.to('cpu')
 81 | 
 82 | 
 83 | class Sequential(Module):
 84 | 
 85 |     def __init__(self, *args) -> None:
 86 |         super().__init__()
 87 |         self.module_list = []
 88 |         if len(args) == 1 and isinstance(args[0], OrderedDict):
 89 |             for name, module in args[0].items():
 90 |                 self.__setattr__(name, module)
 91 |                 self.module_list.append(module)
 92 |         else:
 93 |             for idx, module in enumerate(args):
 94 |                 self.__setattr__(str(idx), module)
 95 |                 self.module_list.append(module)
 96 | 
 97 |     def forward(self, x: Tensor) -> Tensor:
 98 |         for module in self.module_list:
 99 |             x = module(x)
100 |         return x
101 | 
102 |     def __len__(self):
103 |         return len(self.module_list)
104 | 
105 | 
class ModuleList(Module):
    """List-like container whose modules are registered as sub-modules.

    Parameters
    ----------
    module_list : iterable of Module
        Initial modules. The iterable is copied, so later ``append`` calls
        never mutate the caller's object and tuples/generators are accepted.
    """

    def __init__(self, module_list: list) -> None:
        super().__init__()
        # Copy instead of aliasing the argument: `append` mutates this list.
        self.module_list = list(module_list)

        for idx, module in enumerate(self.module_list):
            # Attribute name is the position, e.g. "0", "1", ...
            self.__setattr__(str(idx), module)

    def __getitem__(self, index):
        """Return the module (or, for a slice, the list of modules) at ``index``."""
        return self.module_list[index]

    def __len__(self):
        """Number of stored modules."""
        return len(self.module_list)

    def append(self, module):
        """Add ``module`` at the end and register it under its position."""
        self.module_list.append(module)
        self.__setattr__(str(len(self.module_list) - 1), module)

    def index(self, module):
        """Return the position of ``module``; raises ``ValueError`` if absent."""
        return self.module_list.index(module)
127 | 


--------------------------------------------------------------------------------
/tests/test_tensor_basic.py:
--------------------------------------------------------------------------------
  1 | import sys, pytest, random
  2 | import numpy as np
  3 | from itertools import product
  4 | 
  5 | sys.path.append('../pydynet')
  6 | 
  7 | import pydynet as pdn
  8 | 
  9 | np.random.seed(0)
 10 | random.seed(0)
 11 | 
 12 | type_list = [np.float16, np.float32, np.float64]
 13 | 
 14 | 
 15 | def matmul_shape_pair(max_dim=4, max_size=5):
 16 |     ndim = random.randint(0, max_dim)
 17 | 
 18 |     shape1 = []
 19 |     shape2 = []
 20 |     for _ in range(ndim):
 21 |         if random.random() < 0.5:
 22 |             # 50% 概率设置为 1, 确保广播可能
 23 |             s1, s2 = random.choice([(1, random.randint(1, max_size)),
 24 |                                     (random.randint(1, max_size), 1)])
 25 |         else:
 26 |             # 否则两边相同
 27 |             val = random.randint(1, max_size)
 28 |             s1, s2 = val, val
 29 |         shape1.append(s1)
 30 |         shape2.append(s2)
 31 |     shape1, shape2 = tuple(shape1), tuple(shape2)
 32 | 
 33 |     m = random.randint(1, max_size)
 34 |     n = random.randint(1, max_size)
 35 |     p = random.randint(1, max_size)
 36 | 
 37 |     shape1 = shape1 + (m, n)
 38 |     shape2 = shape2 + (n, p)
 39 | 
 40 |     shape1 = shape1[random.randint(0, len(shape1) - 2):]
 41 | 
 42 |     return shape1, shape2
 43 | 
 44 | 
 45 | def broadcastable_shape_pair(max_dim=4, max_size=5):
 46 |     ndim = random.randint(0, max_dim)  # 随机维数
 47 |     shape1 = []
 48 |     shape2 = []
 49 |     for _ in range(ndim):
 50 |         if random.random() < 0.5:
 51 |             # 50% 概率设置为 1, 确保广播可能
 52 |             s1, s2 = random.choice([(1, random.randint(1, max_size)),
 53 |                                     (random.randint(1, max_size), 1)])
 54 |         else:
 55 |             # 否则两边相同
 56 |             val = random.randint(1, max_size)
 57 |             s1, s2 = val, val
 58 |         shape1.append(s1)
 59 |         shape2.append(s2)
 60 |     shape1, shape2 = tuple(shape1), tuple(shape2)
 61 | 
 62 |     # 随机缺失维度
 63 |     shape1 = shape1[random.randint(0, len(shape1)):]
 64 |     return shape1, shape2
 65 | 
 66 | 
 67 | def array_pair_generator(pair_gen_func,
 68 |                          max_dim=4,
 69 |                          max_size=5,
 70 |                          n_iter=4,
 71 |                          seed=None):
 72 |     rng = np.random.default_rng(seed)
 73 |     count = 0
 74 |     while n_iter is None or count < n_iter:
 75 |         shape1, shape2 = pair_gen_func(max_dim, max_size)
 76 |         a = rng.standard_normal(size=shape1).astype(rng.choice(type_list))
 77 |         b = rng.standard_normal(size=shape2).astype(rng.choice(type_list))
 78 |         yield a, b
 79 |         count += 1
 80 | 
 81 | 
 82 | test_list = array_pair_generator(broadcastable_shape_pair, 4, 5, 8, seed=42)
 83 | func_list = [(pdn.add, np.add), (pdn.sub, np.subtract), (pdn.mul, np.multiply),
 84 |              (pdn.div, np.divide), (pdn.pow, np.power),
 85 |              (pdn.maximum, np.maximum), (pdn.minimum, np.minimum)]
 86 | test_list = [(*array, *funcs)
 87 |              for (array, funcs) in product(test_list, func_list)]
 88 | 
 89 | 
 90 | @pytest.mark.parametrize("operand1, operand2, pdn_func, np_func", test_list)
 91 | @pytest.mark.filterwarnings("ignore:invalid value")
 92 | @pytest.mark.filterwarnings("ignore:divide by zero")
 93 | def test_binary_operator(operand1: np.ndarray, operand2: np.ndarray,
 94 |                          pdn_func: callable, np_func: callable):
 95 |     pdn_operand1, pdn_operand2 = pdn.Tensor(operand1), pdn.Tensor(operand2)
 96 |     pdn_output: pdn.Tensor = pdn_func(pdn_operand1, pdn_operand2)
 97 |     np_output: np.ndarray = np_func(operand1, operand2)
 98 |     assert pdn_output.shape == np_output.shape
 99 |     assert pdn_output.dtype == np_output.dtype
100 |     assert np.allclose(pdn_output.data, np_output, equal_nan=True)
101 | 
102 | 
# Eight random operand pairs with matmul-compatible shapes.
test_list = array_pair_generator(matmul_shape_pair, 4, 5, 8, seed=42)


@pytest.mark.parametrize("operand1, operand2", test_list)
def test_matmul(operand1: np.ndarray, operand2: np.ndarray):
    """``pdn.matmul`` must match ``np.matmul`` in shape, dtype and values."""
    lhs_tensor = pdn.Tensor(operand1)
    rhs_tensor = pdn.Tensor(operand2)
    actual: pdn.Tensor = pdn.matmul(lhs_tensor, rhs_tensor)
    expected: np.ndarray = np.matmul(operand1, operand2)
    assert actual.shape == expected.shape
    assert actual.dtype == expected.dtype
    assert np.allclose(actual.data, expected, equal_nan=True)
114 | 
115 | 


--------------------------------------------------------------------------------
/cnREADME.md:
--------------------------------------------------------------------------------
  1 | # PyDyNet：NumPy-based Dynamic Deep Learning Framework
  2 | 
  3 | **PyDyNet已被多个技术公众号和社区分享**：[居然用Numpy实现了一个深度学习框架](https://segmentfault.com/a/1190000042108301).
  4 | 
  5 | [![Downloads](https://pepy.tech/badge/pydynet)](https://pepy.tech/project/pydynet)
  6 | [![Downloads](https://static.pepy.tech/personalized-badge/pydynet?period=month&units=international_system&left_color=grey&right_color=orange&left_text=downloads/month)](https://pepy.tech/project/pydynet)
  7 | ![x](https://img.shields.io/pypi/l/pydynet)
  8 | ![x](https://img.shields.io/pypi/implementation/numpy)
  9 | ![x](https://img.shields.io/github/stars/Kaslanarian/PyDyNet?style=social)
 10 | ![x](https://img.shields.io/github/forks/Kaslanarian/PyDyNet?style=social)
 11 | 
 12 | ## Towards Large Language Model
 13 | 
 14 | **2025.8.12**: 实现了纯推理的llama3 (6-layer Transformer, vocab-size=32000). 参考了[这里](https://github.com/likejazz/llama3.np)的NumPy实现和数据集. 将数据集下载到`llm/llama`文件夹即可运行:
 15 | 
 16 | ```bash
 17 | >>> python -m llm.llama.infer
 18 | There was a boy named Timmy. He loved to play with hi toy and run around outside. One day, Timmy' mom asked him to help her with the laundry. Timmy didn't want to help because he wanted to play. But hi mom said, "Timmy, you need to help me. It' important to help out."
 19 | Timmy didn't want to help, but he knew he had to. So, he put on hi shoe and went outside to help hi mom. A they were folding the clothe, Timmy saw a big pile of laundry on the floor. He wanted to help, so he started to pick it up. But then, he accidentally knocked over a pile of clothe and they fell on him. Timmy wa okay, but he felt bad.
 20 | Hi mom saw what happened and said, "Timmy, you need to be more careful. You could have hurt yourself." Timmy felt bad and said sorry. Hi mom hugged him and said, "It' okay, accident happen. Let' clean up the laundry together." Timmy learned that it' important to be careful and help out when you need it.
 21 | 
 22 | Token count: 262, elapsed: 0.87s, 300 tokens/s
 23 | ```
 24 | 
 25 | ## Overview
 26 | 
 27 | PyDyNet也是纯NumPy(0.0.7版本后加入CuPy，其用法和NumPy一致)实现的神经网络，语法受PyTorch的启发，大致结构如下：
 28 | 
 29 | ```mermaid
 30 | graph LR
 31 |    N(numpy/cupy.ndarray)--Backend--> A(Tensor) --> ds(Dataset) ---> Data(DataLoader)---> Mission
 32 |    A  --Eager execution--> B(Basic operators:<br> add, exp, etc)
 33 |    B -.Autograd-.-> A
 34 | 
 35 |    B --> CO(Complex<br>operators)
 36 |    --> f(Function:<br>img2col, etc) 
 37 |    --> M(Basic Module:<br>Linear, etc)
 38 |    --> CM(Advanced Module: CNN, RNN, Transformer, etc)
 39 |    --> Mission(Learning task)
 40 |    A --> GD(Optimizer:<br> SGD, Adam, etc) ---> LS(lr_scheduler: <br>StepLR, etc)---> Mission
 41 | ```
 42 | 
 43 | 虚线表示用户可以通过`no_grad`来关闭自动微分功能.
 44 | 
 45 | ## Install
 46 | 
 47 | ```bash
 48 | git clone https://github.com/Kaslanarian/PyDyNet
 49 | cd PyDyNet
 50 | python setup.py install
 51 | ```
 52 | 
 53 | ## Example
 54 | 
 55 | [examples/pydynet](./examples/pydynet)中是一些例子，[examples/pytorch](./examples/pytorch)给出等价的pytorch实现. 运行`python -m examples.pydynet.xxx`即可:
 56 | 
 57 | ### AutoDiff
 58 | 
 59 | [autograd1d.py](examples/pydynet/autograd1d.py)利用自动微分，对一个一维凸函数进行梯度下降：
 60 | 
 61 | <img src="imgs/ad1d.png" alt="ad1" style="zoom:67%;" />
 62 | 
 63 | 以及一个多元凸函数的例子: [autograd2d.py](examples/pydynet/autograd2d.py)
 64 | 
 65 | <img src="imgs/ad2d.png" alt="ad2" style="zoom:67%;" />
 66 | 
 67 | ### MLP & LeNet
 68 | 
 69 | [mlp_cnn.py](examples/pydynet/mnist.py)使用MLP和LeNet对MNIST进行分类. 训练准确率和测试准确率：
 70 | 
 71 | <img src="imgs/mlp_cnn.png" alt="dnn" style="zoom:67%;" />
 72 | 
 73 | ### Dropout & BN
 74 | 
 75 | [mlp_dropout_bn.py](examples/pydynet/dropout_bn.py)使用三种网络对`fetch_olivetti_faces`人脸(64×64)数据集进行分类并进行性能对比：
 76 | 
 77 | 1. 三层MLP;
 78 | 2. 三层MLP + Dropout;
 79 | 3. 三层MLP + BatchNormalization.
 80 | 
 81 | 学习效果对比：
 82 | 
 83 | <img src="imgs/dropout_bn.png" alt="cnn" style="zoom:67%;" />
 84 | 
 85 | ### RNN
 86 | 
 87 | [ts_prediction](examples/pydynet/ts_prediction.py)中是一个用GRU做时序预测例子:
 88 | 
 89 | <img src="imgs/rnn.png" alt="RNN" style="zoom:67%;" />
 90 | 
 91 | ### Transformer
 92 | 
 93 | [transformer.py](examples/pydynet/transformer.py)中是一个用Transformer训练文本分类模型的例子. 训练结果:
 94 | 
 95 | <img src="imgs/transformer.png" alt="transformer" style="zoom:67%;" />
 96 | 
 97 | > 数据集 (CoLA) 链接: <https://nyu-mll.github.io/CoLA/cola_public_1.1.zip>
 98 | 
 99 | ## cuda加速
100 | 
101 | 在训练batch size为256, 测试batch size为1024情况下，模型在CPU和GPU上的训练速度比较:
102 | 
103 | |      Network structure         |      Dataset      | CPU time (s) per epoch | GPU time (s) per epoch |
104 | | :-----------------: | :---------------: | :--------------------: | :--------------------: |
105 | |    3-layer MLP     | MNIST (80000×574) |      7.256±0.138      |       1.203±.0181       |
106 | |        LeNet        | MNIST (80000×574) |     239.664±2.108      |      2.841±0.026      |
107 | | 1-layer Transformer (dim=512, head=4) | CoLA (8551×45×64) |      17.503±0.251      |      1.075±0.002       |
108 | 
109 | 设备: Nvidia GeForce RTX 4090.
110 | 


--------------------------------------------------------------------------------
/pydynet/optim/lr_scheduler.py:
--------------------------------------------------------------------------------
  1 | '''学习率调节器类，我们目前实现了\n
  2 | - ExponentialLR;\n
  3 | - StepLR;\n
  4 | - MultiStepLR;\n
  5 | - CosineAnnealingLR.\n
  6 | '''
  7 | 
  8 | from typing import List
  9 | from .optimizer import Optimizer
 10 | import weakref
 11 | from functools import wraps
 12 | from collections import Counter
 13 | from math import cos, pi
 14 | 
 15 | 
class _LRScheduler:
    """Base class of the learning-rate schedulers.

    On construction it wraps ``optimizer.step`` with a call counter and
    performs one ``step()`` so the optimizer's ``lr`` is set for epoch
    ``last_epoch + 1``. Subclasses implement ``get_lr``.
    """

    def __init__(self, optimizer: Optimizer, last_epoch: int = -1) -> None:
        self.optimizer = optimizer
        self.last_epoch = last_epoch

        if self.last_epoch == -1:
            # Fresh start: remember the optimizer's current lr as the base lr.
            self.optimizer.initial_lr = self.optimizer.lr
        else:
            # Resuming: the optimizer must already carry its base lr.
            assert hasattr(
                self.optimizer, "initial_lr"
            ), "last_epoch=1 but no 'initial_lr' attribute in optimizer!"

        def with_counter(method):
            if getattr(method, '_with_counter', False):
                # `optimizer.step()` has already been replaced, return.
                return method

            # Keep only a weak reference to the instance the method is bound
            # to. A weak reference does not increase the reference count, so
            # an object that is only weakly referenced can still be garbage
            # collected; this avoids creating a reference cycle.
            instance_ref = weakref.ref(method.__self__)
            # Get the unbound method for the same purpose.
            func = method.__func__  # underlying function, not bound to any instance
            cls = instance_ref().__class__  # class the method belongs to
            del method

            @wraps(func)
            def wrapper(*args, **kwargs):
                instance = instance_ref()
                instance._step_count += 1
                wrapped = func.__get__(instance, cls)
                return wrapped(*args, **kwargs)

            # Note that the returned function here is no longer a bound method,
            # so attributes like `__func__` and `__self__` no longer exist.
            wrapper._with_counter = True
            return wrapper

        # Decorate optimizer.step so that its calls are counted, and
        # initialise both counters.
        self.optimizer.step = with_counter(self.optimizer.step)
        self.optimizer._step_count = 0
        self._step_count = 0

        self.step()

    def step(self):
        """Advance one epoch and write the new learning rate to the optimizer."""
        self._step_count += 1  # number of scheduler steps taken

        # Small context manager that flags "get_lr is being called from step".
        class _enable_get_lr_call:
            def __init__(self, o):
                self.o = o

            def __enter__(self):
                self.o._get_lr_called_within_step = True
                return self

            def __exit__(self, type, value, traceback):
                self.o._get_lr_called_within_step = False

        with _enable_get_lr_call(self):
            self.last_epoch += 1  # advance the epoch counter
            lr = self.get_lr()  # new lr; depends on the concrete scheduler

        # NOTE(review): `_last_lr` is captured *before* the optimizer's lr is
        # overwritten, so get_last_lr() returns the rate that was in effect
        # before this step, not the one just computed.
        self._last_lr = self.optimizer.lr
        self.optimizer.lr = lr

    def get_lr(self):
        """Return the learning rate for ``self.last_epoch``; subclass hook."""
        raise NotImplementedError

    def get_last_lr(self):
        """Return the value stored in ``_last_lr`` by the latest ``step()``."""
        return self._last_lr
 88 | 
 89 | 
 90 | class ExponentialLR(_LRScheduler):
 91 |     def __init__(
 92 |         self,
 93 |         optimizer: Optimizer,
 94 |         gamma: float = 0.1,
 95 |         last_epoch: int = -1,
 96 |     ) -> None:
 97 |         self.gamma = gamma
 98 |         super().__init__(optimizer, last_epoch)
 99 | 
100 |     def get_lr(self):
101 |         return self.optimizer.lr * self.gamma**self.last_epoch
102 | 
103 | 
class StepLR(_LRScheduler):
    """Decay the learning rate by ``gamma`` once every ``step_size`` epochs.

    The rate at epoch ``t`` is ``initial_lr * gamma**(t // step_size)``.

    Parameters
    ----------
    optimizer : Optimizer
        Optimizer whose ``lr`` is scheduled.
    step_size : int
        Number of epochs between two decays.
    gamma : float, default=0.1
        Multiplicative decay factor.
    last_epoch : int, default=-1
        Index of the last finished epoch; -1 starts from scratch.
    """

    def __init__(
        self,
        optimizer: Optimizer,
        step_size: int,
        gamma=0.1,
        last_epoch: int = -1,
    ) -> None:
        self.step_size = step_size
        self.gamma = gamma
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        # Closed form relative to the *initial* lr. Multiplying the current
        # lr by this factor on every epoch would re-apply earlier decays.
        return self.optimizer.initial_lr * self.gamma**(self.last_epoch //
                                                        self.step_size)
119 | 
120 | 
class MultiStepLR(_LRScheduler):
    """Decay the learning rate by ``gamma`` at each listed milestone epoch.

    A milestone listed ``k`` times applies the factor ``gamma**k`` when that
    epoch is reached; on all other epochs the rate is left unchanged.
    """

    def __init__(
        self,
        optimizer: Optimizer,
        milestones: List[int],
        gamma=0.1,
        last_epoch: int = -1,
    ) -> None:
        # Counter maps each milestone epoch to how often it was listed.
        self.milestones = Counter(milestones)
        self.gamma = gamma
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        current = self.optimizer.lr
        if self.last_epoch in self.milestones:
            repeats = self.milestones[self.last_epoch]
            return current * self.gamma**repeats
        return current
137 | 
138 | 
class CosineAnnealingLR(_LRScheduler):
    """Anneal the learning rate along a cosine curve with half-period ``T_max``.

    The rate follows
    ``eta_min + (initial_lr - eta_min) * (1 + cos(pi * t / T_max)) / 2``,
    computed recursively from the optimizer's current rate.

    Parameters
    ----------
    optimizer : Optimizer
        Optimizer whose ``lr`` is scheduled.
    T_max : int
        Number of epochs from the maximum to the minimum rate.
    eta_min : float, default=0
        Minimum learning rate.
    last_epoch : int, default=-1
        Index of the last finished epoch; -1 starts from scratch.
    """

    def __init__(
        self,
        optimizer: Optimizer,
        T_max: int,
        eta_min: float = 0,
        last_epoch: int = -1,
    ) -> None:
        self.T_max = T_max
        self.eta_min = eta_min
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        base_lr = self.optimizer.initial_lr
        if self.last_epoch == 0:
            return base_lr

        # The recursion needs the rate of the previous epoch, which is what
        # the optimizer holds while get_lr runs. `get_last_lr()` is one step
        # older than that (and unset when resuming), so it must not be used.
        current_lr = self.optimizer.lr

        if (self.last_epoch - 1 - self.T_max) % (2 * self.T_max) == 0:
            # Previous epoch sat exactly at the minimum: the ratio form below
            # would divide by zero, so step up from it additively.
            return current_lr + (base_lr - self.eta_min) * (
                1 - cos(pi / self.T_max)) / 2
        return (1 + cos(pi * self.last_epoch / self.T_max)) / (
            1 + cos(pi * (self.last_epoch - 1) / self.T_max)) * (
                current_lr - self.eta_min) + self.eta_min
161 | 


--------------------------------------------------------------------------------
/pydynet/optim/optimizer.py:
--------------------------------------------------------------------------------
  1 | '''优化器类，我们目前实现了\n
  2 | - SGD with momentum and Nesterov;\n
  3 | - Adagrad;\n
  4 | - Adadelta;\n
  5 | - Adam.\n
  6 | 
  7 | Reference
  8 | ---------
  9 | 论文: https://arxiv.org/abs/1609.04747;\n
 10 | 博客: https://xingcy.net/2021/08/20/gd/.
 11 | '''
 12 | 
 13 | from math import sqrt
 14 | from ..core import Tensor
 15 | 
 16 | 
 17 | class Optimizer:
 18 |     '''优化器基类'''
 19 | 
 20 |     def __init__(self, params: list[Tensor]) -> None:
 21 |         self.params: list[Tensor] = list(params)
 22 | 
 23 |     def step(self):
 24 |         raise NotImplementedError
 25 | 
 26 |     def zero_grad(self):
 27 |         '''针对self.params梯度清零.'''
 28 |         for param in self.params:
 29 |             param.zero_grad()
 30 | 
 31 | 
 32 | class SGD(Optimizer):
 33 |     '''带动量的梯度下降
 34 | 
 35 |     Parameters
 36 |     ----------
 37 |     params : list[Parameter]
 38 |         待优化参数;
 39 |     lr : float
 40 |         学习率;
 41 |     momentum : float
 42 |         动量系数;
 43 |     weight_decay : float, default=0.
 44 |         权重衰减系数.
 45 |     nesterov : bool, defallt=True.
 46 |         是否采用Nesterov加速.
 47 |     '''
 48 | 
 49 |     def __init__(
 50 |         self,
 51 |         params: list[Tensor],
 52 |         lr: float,
 53 |         momentum: float = .5,
 54 |         weight_decay: float = 0.,
 55 |         nesterov=True,
 56 |     ) -> None:
 57 |         super().__init__(params)
 58 |         self.lr = lr
 59 |         self.momentum = momentum
 60 |         self.weight_decay = weight_decay
 61 |         self.nesterov = nesterov
 62 |         self.v = [
 63 |             param.xp.zeros(param.shape, dtype=param.dtype)
 64 |             for param in self.params
 65 |         ]
 66 | 
 67 |     def step(self):
 68 |         for i in range(len(self.params)):
 69 |             with self.params[i].device:
 70 |                 grad = self.params[i].grad + self.weight_decay * self.params[i].data
 71 |                 self.v[i] *= self.momentum
 72 |                 self.v[i] += self.lr * grad
 73 |                 self.params[i].data -= self.v[i]
 74 |                 if self.nesterov:
 75 |                     self.params[i].data -= self.lr * grad
 76 | 
 77 | 
 78 | class Adagrad(Optimizer):
 79 |     '''Adaptive Gradient Descent
 80 |     
 81 |     Parameters
 82 |     ----------
 83 |     params : list[Parameter]
 84 |         待优化参数;
 85 |     lr : float, default=1e-2.
 86 |         学习率;
 87 |     weight_decay : float, default=0.
 88 |         权重衰减系数.
 89 |     eps : float, default=1e-10
 90 |         epsilon.
 91 |     '''
 92 | 
 93 |     def __init__(
 94 |         self,
 95 |         params: list[Tensor],
 96 |         lr: float = 1e-2,
 97 |         weight_decay: float = 0,
 98 |         eps: float = 1e-10,
 99 |     ) -> None:
100 |         super().__init__(params)
101 |         self.lr = lr
102 |         self.weight_decay = weight_decay
103 |         self.eps = eps
104 |         self.G = [
105 |             param.xp.zeros(param.shape, dtype=param.dtype)
106 |             for param in self.params
107 |         ]
108 | 
109 |     def step(self):
110 |         for i in range(len(self.params)):
111 |             with self.params[i].device:
112 |                 grad = self.params[i].grad + self.weight_decay * self.params[i].data
113 |                 self.G[i] += grad**2
114 |                 self.params[i].data -= self.lr * grad / (self.eps + self.G[i])**0.5
115 | 
116 | 
class Adadelta(Optimizer):
    '''Adadelta optimizer.

    Keeps two running averages per parameter: the squared gradients
    (``self.G``) and the squared updates (``self.delta``). Each step is the
    gradient rescaled by ``sqrt(delta + eps) / sqrt(G + eps)``.

    Parameters
    ----------
    params : list[Parameter]
        Parameters to optimise;
    lr : float, default=1.0
        Factor applied to the computed update;
    rho : float, default=0.9
        Decay rate of both running averages;
    weight_decay : float, default=0.
        Weight-decay coefficient, added to the gradient as
        ``weight_decay * param``;
    eps : float, default=1e-6
        Added inside both square roots for numerical stability.
    '''

    def __init__(
        self,
        params: list[Tensor],
        lr: float = 1.0,
        rho: float = 0.9,
        weight_decay: float = 0,
        eps: float = 1e-6,
    ) -> None:
        super().__init__(params)
        self.lr = lr
        self.rho = rho
        self.eps = eps
        self.weight_decay = weight_decay
        # Running average of squared gradients.
        self.G = [
            param.xp.zeros(param.shape, dtype=param.dtype)
            for param in self.params
        ]
        # Running average of squared updates; without it the rule reduces
        # to an RMSprop-style step rather than Adadelta.
        self.delta = [
            param.xp.zeros(param.shape, dtype=param.dtype)
            for param in self.params
        ]

    def step(self):
        '''Update every parameter in place from its current gradient.'''
        for i, param in enumerate(self.params):
            with param.device:
                grad = param.grad + self.weight_decay * param.data

                self.G[i] = self.rho * self.G[i] + (1 - self.rho) * grad**2
                # The update is computed from the *previous* delta average,
                # which is refreshed only afterwards.
                update = (self.delta[i] + self.eps)**0.5 / (
                    self.G[i] + self.eps)**0.5 * grad
                self.delta[i] = self.rho * self.delta[i] + (
                    1 - self.rho) * update**2
                param.data -= self.lr * update
158 | 
159 | 
class Adam(Optimizer):
    '''Adam optimizer.

    Parameters
    ----------
    params : list[Parameter]
        Parameters to optimise;
    lr : float, default=1e-3
        Learning rate;
    betas : tuple, default=(0.9, 0.999)
        Decay rates of the first- and second-moment running averages;
    eps : float, default=1e-8
        Added to the square root of the second moment;
    weight_decay : float, default=0.
        Weight-decay coefficient, added to the gradient as
        ``weight_decay * param``.
    '''

    def __init__(
        self,
        params: list[Tensor],
        lr: float = 1e-3,
        betas: tuple[float] = (0.9, 0.999),
        eps: float = 1e-8,
        weight_decay: float = 0,
    ) -> None:
        super().__init__(params)
        self.lr = lr
        self.beta1, self.beta2 = betas
        self.eps = eps
        self.weight_decay = weight_decay
        # First-moment buffers (running mean of gradients).
        self.m = [
            tensor.xp.zeros(tensor.shape, dtype=tensor.dtype)
            for tensor in self.params
        ]
        # Second-moment buffers (running mean of squared gradients).
        self.v = [
            tensor.xp.zeros(tensor.shape, dtype=tensor.dtype)
            for tensor in self.params
        ]
        # 1-based step counter used by the bias correction.
        self.t = 1

    def step(self):
        '''Update every parameter in place, then advance the step counter.'''
        for idx, tensor in enumerate(self.params):
            with tensor.device:
                grad = tensor.grad + self.weight_decay * tensor.data
                self.m[idx] *= self.beta1
                self.m[idx] += (1 - self.beta1) * grad
                self.v[idx] *= self.beta2
                self.v[idx] += (1 - self.beta2) * grad**2
                # Bias correction of both moments folded into one scalar.
                correction = sqrt(1 - self.beta2**self.t) / (
                    1 - self.beta1**self.t)
                denom = self.v[idx]**0.5 + self.eps
                tensor.data -= self.lr * correction * self.m[idx] / denom
        self.t += 1
197 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # PyDyNet：NumPy-based Dynamic Deep Learning Framework
  2 | 
  3 | Chinese README: [cnREADME.md](./cnREADME.md)
  4 | 
  5 | [![Downloads](https://pepy.tech/badge/pydynet)](https://pepy.tech/project/pydynet)
  6 | [![Downloads](https://static.pepy.tech/personalized-badge/pydynet?period=month&units=international_system&left_color=grey&right_color=orange&left_text=downloads/month)](https://pepy.tech/project/pydynet)
  7 | ![x](https://img.shields.io/pypi/l/pydynet)
  8 | ![x](https://img.shields.io/pypi/implementation/numpy)
  9 | ![x](https://img.shields.io/github/stars/Kaslanarian/PyDyNet?style=social)
 10 | ![x](https://img.shields.io/github/forks/Kaslanarian/PyDyNet?style=social)
 11 | 
 12 | ## Towards Large Language Model
 13 | 
 14 | **In the summer of 2025, I restarted the development of PyDyNet after two years.** PyDyNet now implements a pure inference version of Llama3 (6-layer Transformer, vocab-size=32000). The implementation is inspired by the NumPy version and dataset available [here](https://github.com/likejazz/llama3.np). To run it, download the dataset into the `llm/llama` folder and execute:
 15 | 
 16 | ```bash
 17 | >>> python -m llm.llama.infer
 18 | 
 19 | There was a boy named Timmy. He loved to play with hi toy and run around outside. One day, Timmy' mom asked him to help her with the laundry. Timmy didn't want to help because he wanted to play. But hi mom said, "Timmy, you need to help me. It' important to help out."
 20 | Timmy didn't want to help, but he knew he had to. So, he put on hi shoe and went outside to help hi mom. A they were folding the clothe, Timmy saw a big pile of laundry on the floor. He wanted to help, so he started to pick it up. But then, he accidentally knocked over a pile of clothe and they fell on him. Timmy wa okay, but he felt bad.
 21 | Hi mom saw what happened and said, "Timmy, you need to be more careful. You could have hurt yourself." Timmy felt bad and said sorry. Hi mom hugged him and said, "It' okay, accident happen. Let' clean up the laundry together." Timmy learned that it' important to be careful and help out when you need it.
 22 | 
 23 | Token count: 262, elapsed: 0.87s, 300 tokens/s
 24 | ```
 25 | 
 26 | We also implemented a pure inference version of CLIP, inspired by the NumPy version and dataset available in [NPCLIP](https://github.com/99991/NPCLIP). To run it, copy the `data` folder of `NPCLIP` into the `llm/clip` folder and execute:
 27 | 
 28 | ```bash
 29 | >>> python -m llm.clip.infer
 30 | Label probs: [0.000953   0.48176003 0.51728696]
 31 | ```
 32 | 
 33 | for the following image and query ["a fish", "a dog", "a cat"]
 34 | 
 35 | <img src="llm/clip/picture.png" alt="cat_dog" width="400px" />
 36 | 
 37 | ## Overview
 38 | 
 39 | PyDyNet is a neural network framework implemented entirely in NumPy (with CuPy support since version 0.0.7, using the same API). Its syntax is inspired by PyTorch, and its structure is as follows:
 40 | 
 41 | ```mermaid
 42 | graph LR
 43 |    N(numpy/cupy.ndarray)--Backend--> A(Tensor) --> ds(Dataset) ---> Data(DataLoader)---> Mission
 44 |    A  --Eager execution--> B(Basic operators:<br> add, exp, etc)
 45 |    B -.Autograd-.-> A
 46 | 
 47 |    B --> CO(Complex<br>operators)
 48 |    --> f(Function:<br>img2col, etc) 
 49 |    --> M(Basic Module:<br>Linear, etc)
 50 |    --> CM(Advanced Module: CNN, RNN, Transformer, etc)
 51 |    --> Mission(Learning task)
 52 |    A --> GD(Optimizer:<br> SGD, Adam, etc) ---> LS(lr_scheduler: <br>StepLR, etc)---> Mission
 53 | ```
 54 | 
 55 | Dashed lines indicate that users can disable automatic differentiation using `no_grad`.
 56 | 
 57 | ## Install
 58 | 
 59 | Just
 60 | 
 61 | ```bash
 62 | pip install pydynet
 63 | ```
 64 | 
 65 | or
 66 | 
 67 | ```bash
 68 | git clone https://github.com/Kaslanarian/PyDyNet
 69 | cd PyDyNet
 70 | python setup.py install
 71 | ```
 72 | 
 73 | ## Example
 74 | 
 75 | Examples can be found in the [examples/pydynet](./examples/pydynet) directory, with equivalent PyTorch implementations in [examples/pytorch](./examples/pytorch). To run an example, use:
 76 | 
 77 | ```bash
 78 | python -m examples.pydynet.xxx
 79 | ```
 80 | 
 81 | ### Automatic Differentiation
 82 | 
 83 | The example [autograd1d.py](examples/pydynet/autograd1d.py) demonstrates automatic differentiation by performing gradient descent on a one-dimensional convex function:
 84 | 
 85 | <img src="imgs/ad1d.png" alt="ad1" style="zoom:67%;" />
 86 | 
 87 | A multi-variable convex function example is provided in [autograd2d.py](examples/pydynet/autograd2d.py):
 88 | 
 89 | <img src="imgs/ad2d.png" alt="ad2" style="zoom:67%;" />
 90 | 
 91 | ### MLP & LeNet
 92 | 
 93 | The example [mnist.py](examples/pydynet/mnist.py) uses MLP and LeNet to classify MNIST digits. The training and testing accuracies are shown below:
 94 | 
 95 | <img src="imgs/mlp_cnn.png" alt="dnn" style="zoom:67%;" />
 96 | 
 97 | ### Dropout & Batch Normalization
 98 | 
 99 | The example [dropout_bn.py](examples/pydynet/dropout_bn.py) compares the performance of three networks on the `fetch_olivetti_faces` dataset (64×64 pixel images):
100 | 
101 | 1. Three-layer MLP;
102 | 2. Three-layer MLP with Dropout;
103 | 3. Three-layer MLP with Batch Normalization.
104 | 
105 | <img src="imgs/dropout_bn.png" alt="cnn" style="zoom:67%;" />
106 | 
107 | ### Recurrent Neural Network (RNN)
108 | 
109 | The example [ts_prediction.py](examples/pydynet/ts_prediction.py) demonstrates time series prediction using a GRU:
110 | 
111 | <img src="imgs/rnn.png" alt="RNN" style="zoom:67%;" />
112 | 
113 | ### Transformer
114 | 
115 | The example [transformer.py](examples/pydynet/transformer.py) shows how to train a text classification model using a Transformer. The training results are as follows:
116 | 
117 | <img src="imgs/transformer.png" alt="transformer" style="zoom:67%;" />
118 | 
119 | > Dataset (CoLA) link: <https://nyu-mll.github.io/CoLA/cola_public_1.1.zip>
120 | 
121 | ## Cuda Acceleration
122 | 
123 | PyDyNet supports CUDA acceleration through CuPy. To use it, simply install CuPy and use the same API as NumPy. We compare the performance of PyDyNet with CuPy and NumPy as follows on **Nvidia GeForce RTX 4090**:
124 | 
125 | |      Network structure         |      Dataset      | CPU time (s) per epoch | GPU time (s) per epoch |
126 | | :-----------------: | :---------------: | :--------------------: | :--------------------: |
127 | |    3-layer MLP     | MNIST (80000×574) |      7.256±0.138      |       1.203±.0181       |
128 | |        LeNet        | MNIST (80000×574) |     239.664±2.108      |      2.841±0.026      |
129 | | 1-layer Transformer (dim=512, head=4) | CoLA (8551×45×64) |      17.503±0.251      |      1.075±0.002       |
130 | 


--------------------------------------------------------------------------------
/llm/clip/tokenizer.py:
--------------------------------------------------------------------------------
  1 | import gzip
  2 | import html
  3 | import os
  4 | import re
  5 | import typing
  6 | from functools import lru_cache
  7 | 
  8 | 
  9 | @lru_cache()
 10 | def default_bpe() -> str:
 11 |     return os.path.join(os.path.dirname(os.path.abspath(__file__)),
 12 |                         "data/bpe_simple_vocab_16e6.txt.gz")
 13 | 
 14 | 
 15 | @lru_cache()
 16 | def bytes_to_unicode() -> typing.Dict[int, str]:
 17 |     """
 18 |     Returns list of utf-8 byte and a corresponding list of unicode strings.
 19 |     The reversible bpe codes work on unicode strings.
 20 |     This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
 21 |     When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
 22 |     This is a signficant percentage of your normal, say, 32K bpe vocab.
 23 |     To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
 24 |     And avoids mapping to whitespace/control characters the bpe code barfs on.
 25 |     """
 26 |     byte_ints = list(range(ord("!"),
 27 |                            ord("~") + 1)) + list(range(ord("¡"),
 28 |                                                        ord("¬") + 1)) + list(
 29 |                                                            range(
 30 |                                                                ord("®"),
 31 |                                                                ord("ÿ") + 1))
 32 |     char_ints = byte_ints[:]
 33 |     n = 0
 34 |     for b in range(2**8):
 35 |         if b not in byte_ints:
 36 |             byte_ints.append(b)
 37 |             char_ints.append(2**8 + n)
 38 |             n += 1
 39 |     chars = [chr(n) for n in char_ints]
 40 |     return dict(zip(byte_ints, chars))
 41 | 
 42 | 
 43 | def get_pairs(
 44 |         word: typing.Tuple[str, ...]) -> typing.Set[typing.Tuple[str, str]]:
 45 |     """Return set of symbol pairs in a word.
 46 |     Word is represented as tuple of symbols (symbols being variable-length strings).
 47 |     """
 48 |     pairs = set()
 49 |     prev_char = word[0]
 50 |     for char in word[1:]:
 51 |         pairs.add((prev_char, char))
 52 |         prev_char = char
 53 |     return pairs
 54 | 
 55 | 
 56 | def basic_clean(text: str) -> str:
 57 |     import ftfy
 58 | 
 59 |     text = ftfy.fix_text(text)
 60 |     text = html.unescape(html.unescape(text))
 61 |     return text.strip()
 62 | 
 63 | 
 64 | def whitespace_clean(text: str) -> str:
 65 |     text = re.sub(r"\s+", " ", text)
 66 |     text = text.strip()
 67 |     return text
 68 | 
 69 | 
 70 | def read_text(path: str) -> str:
 71 |     with open(path, "r", encoding="utf-8") as f:
 72 |         return f.read()
 73 | 
 74 | 
 75 | class SimpleTokenizer(object):
 76 | 
 77 |     def __init__(self, bpe_path: str = default_bpe()) -> None:
 78 |         self.byte_encoder = bytes_to_unicode()
 79 |         self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
 80 |         with gzip.open(bpe_path) as f:
 81 |             lines = f.read().decode("utf-8").split("\n")
 82 |             lines = lines[1:49152 - 256 - 2 + 1]
 83 |         merges = [tuple(line.split()) for line in lines]
 84 |         vocab = list(bytes_to_unicode().values())
 85 |         vocab = vocab + [v + "</w>" for v in vocab]
 86 |         for merge in merges:
 87 |             vocab.append("".join(merge))
 88 |         vocab.extend(["<|startoftext|>", "<|endoftext|>"])
 89 |         self.encoder: typing.Dict[str,
 90 |                                   int] = dict(zip(vocab, range(len(vocab))))
 91 |         self.decoder = {v: k for k, v in self.encoder.items()}
 92 |         self.bpe_ranks = dict(zip(merges, range(len(merges))))
 93 |         self.cache = {
 94 |             "<|startoftext|>": "<|startoftext|>",
 95 |             "<|endoftext|>": "<|endoftext|>"
 96 |         }
 97 |         pattern = r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+"""
 98 |         pattern = pattern.replace(r"\p{N}", read_text("llm/clip/data/pN.txt"))
 99 |         pattern = pattern.replace(r"\p{L}", read_text("llm/clip/data/pL.txt"))
100 |         self.pat = re.compile(pattern, re.IGNORECASE)
101 | 
102 |     def bpe(self, token: str) -> str:
103 |         if token in self.cache:
104 |             return self.cache[token]
105 |         word = tuple(token[:-1]) + (token[-1] + "</w>", )
106 |         pairs = get_pairs(word)
107 | 
108 |         if not pairs:
109 |             return token + "</w>"
110 | 
111 |         while True:
112 |             bigram = min(
113 |                 pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
114 |             if bigram not in self.bpe_ranks:
115 |                 break
116 |             first, second = bigram
117 |             new_word: typing.List[str] = []
118 |             i = 0
119 |             while i < len(word):
120 |                 try:
121 |                     j = word.index(first, i)
122 |                     new_word.extend(word[i:j])
123 |                     i = j
124 |                 except:
125 |                     new_word.extend(word[i:])
126 |                     break
127 | 
128 |                 if word[i] == first and i < len(word) - 1 and word[
129 |                         i + 1] == second:
130 |                     new_word.append(first + second)
131 |                     i += 2
132 |                 else:
133 |                     new_word.append(word[i])
134 |                     i += 1
135 |             word = tuple(new_word)
136 |             if len(word) == 1:
137 |                 break
138 |             else:
139 |                 pairs = get_pairs(word)
140 |         joined_word = " ".join(word)
141 |         self.cache[token] = joined_word
142 |         return joined_word
143 | 
144 |     def encode(self,
145 |                text: str,
146 |                basic_cleaning: bool = False) -> typing.List[int]:
147 |         bpe_tokens: typing.List[int] = []
148 |         if basic_cleaning:
149 |             text = basic_clean(text)
150 |         text = whitespace_clean(text).lower()
151 |         for token in re.findall(self.pat, text):
152 |             token = "".join(self.byte_encoder[b]
153 |                             for b in token.encode("utf-8"))
154 |             bpe_tokens.extend(self.encoder[bpe_token]
155 |                               for bpe_token in self.bpe(token).split(" "))
156 |         return bpe_tokens
157 | 
158 |     def decode(self, tokens: typing.Iterable[int]) -> str:
159 |         text = "".join([self.decoder[token] for token in tokens])
160 |         text = bytearray([self.byte_decoder[c] for c in text
161 |                           ]).decode("utf-8",
162 |                                     errors="replace").replace("</w>", " ")
163 |         return text
164 | 


--------------------------------------------------------------------------------
/examples/pytorch/mnist.py:
--------------------------------------------------------------------------------
  1 | import gzip, argparse
  2 | from os.path import join
  3 | from tqdm import tqdm
  4 | 
  5 | import numpy as np
  6 | import torch
  7 | from torch import nn
  8 | import torch.nn.functional as F
  9 | from torch.optim import Adam
 10 | from pydynet.data import data_loader
 11 | 
 12 | 
 13 | class MNISTDataset:
 14 | 
 15 |     def __init__(self, root) -> None:
 16 |         self.root = root
 17 |         self.train_images_path = join(root, 'train-images-idx3-ubyte.gz')
 18 |         self.train_labels_path = join(root, 'train-labels-idx1-ubyte.gz')
 19 |         self.test_images_path = join(root, 't10k-images-idx3-ubyte.gz')
 20 |         self.test_labels_path = join(root, 't10k-labels-idx1-ubyte.gz')
 21 | 
 22 |     def load_train(self):
 23 |         return (
 24 |             MNISTDataset.load_mnist_images(self.train_images_path),
 25 |             MNISTDataset.load_mnist_labels(self.train_labels_path),
 26 |         )
 27 | 
 28 |     def load_test(self):
 29 |         return (
 30 |             MNISTDataset.load_mnist_images(self.test_images_path),
 31 |             MNISTDataset.load_mnist_labels(self.test_labels_path),
 32 |         )
 33 | 
 34 |     @staticmethod
 35 |     def load_mnist_images(file_path):
 36 |         with gzip.open(file_path, 'r') as f:
 37 |             # Skip the magic number and dimensions (4 bytes magic number + 4 bytes each for dimensions)
 38 |             f.read(16)
 39 |             # Read the rest of the file
 40 |             buffer = f.read()
 41 |             data = np.frombuffer(buffer, dtype=np.uint8).astype(np.float32)
 42 |             # Normalize the data to be in the range [0, 1]
 43 |             data = data / 255.0
 44 |             # Reshape the data to be in the shape (number_of_images, 28, 28)
 45 |             data = data.reshape(-1, 1, 28, 28)
 46 |             return torch.tensor(data)
 47 | 
 48 |     @staticmethod
 49 |     def load_mnist_labels(file_path):
 50 |         with gzip.open(file_path, 'r') as f:
 51 |             # Skip the magic number and number of items (4 bytes magic number + 4 bytes number of items)
 52 |             f.read(8)
 53 |             # Read the rest of the file
 54 |             buffer = f.read()
 55 |             labels = np.frombuffer(buffer, dtype=np.uint8)
 56 |             return torch.tensor(labels, dtype=int)
 57 | 
 58 | 
 59 | class Flatten(nn.Module):
 60 | 
 61 |     def forward(self, x):  # for batch only
 62 |         return x.reshape(x.shape[0], -1)
 63 | 
 64 | 
 65 | class MLP(nn.Module):
 66 | 
 67 |     def __init__(self) -> None:
 68 |         super().__init__()
 69 |         self.layer1 = nn.Sequential(
 70 |             Flatten(),
 71 |             nn.Linear(28 * 28, 1024),
 72 |         )
 73 |         self.layer2 = nn.Linear(1024, 1024)
 74 |         self.layer3 = nn.Linear(1024, 10)
 75 | 
 76 |     def forward(self, x):
 77 |         z1 = F.relu(self.layer1(x))
 78 |         z2 = F.relu(self.layer2(z1))
 79 |         return self.layer3(z2)
 80 | 
 81 | 
 82 | class ConvNet(nn.Module):
 83 | 
 84 |     def __init__(self):
 85 |         super().__init__()
 86 |         self.conv1 = nn.Conv2d(1, 20, 3, 1, 1)
 87 |         self.conv2 = nn.Conv2d(20, 50, 3, 1, 1)
 88 |         self.fc1 = nn.Linear(7 * 7 * 50, 500)
 89 |         self.fc2 = nn.Linear(500, 10)
 90 | 
 91 |     def forward(self, x):
 92 |         x = F.relu(self.conv1(x))
 93 |         x = F.max_pool2d(x, 2, 2)
 94 |         x = F.relu(self.conv2(x))
 95 |         x = F.max_pool2d(x, 2, 2)
 96 |         x = x.reshape(-1, 7 * 7 * 50)
 97 |         x = F.relu(self.fc1(x))
 98 |         return self.fc2(x)
 99 | 
100 | 
# ---- Command-line options ---------------------------------------------------
parser = argparse.ArgumentParser()
parser.add_argument("--network",
                    help="Network structure",
                    choices=['mlp', 'conv'],
                    default='conv')
parser.add_argument('--batch-size',
                    type=int,
                    default=256,
                    help='input batch size for training (default: 256)')
parser.add_argument('--test-batch-size',
                    type=int,
                    default=1024,
                    metavar='N',
                    help='input batch size for testing (default: 1024)')
parser.add_argument('--epochs',
                    type=int,
                    default=20,
                    help='number of epochs to train (default: 20)')
parser.add_argument('--lr',
                    type=float,
                    default=1e-4,
                    help='learning rate (default: 1e-4)')
parser.add_argument('--no-cuda',
                    action='store_true',
                    default=False,
                    help='disables CUDA training')
parser.add_argument('--seed',
                    type=int,
                    default=42,
                    help='random seed (default: 42)')
args = parser.parse_args()

# Honour --seed instead of a hard-coded value; the default is still 42.
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)
# Also seed NumPy, as examples/pydynet/mnist.py does; data_loader's shuffling
# presumably draws from NumPy's global RNG -- TODO confirm.
np.random.seed(args.seed)

# Use the highest-numbered CUDA device unless CUDA is missing or disabled.
device = f'cuda:{torch.cuda.device_count() - 1}' if torch.cuda.is_available(
) and not args.no_cuda else 'cpu'

net = {'mlp': MLP(), 'conv': ConvNet()}.get(args.network).to(device)
print(net)

optimizer = Adam(net.parameters(), lr=args.lr)

dataset = MNISTDataset(r'./examples/data/MNIST/raw')
train_loader = data_loader(
    *dataset.load_train(),
    shuffle=True,
    batch_size=args.batch_size,
)
test_loader = data_loader(
    *dataset.load_test(),
    shuffle=False,
    batch_size=args.test_batch_size,
)

bar = tqdm(range(args.epochs))
info_list = []
for epoch in bar:

    # ---- One optimization pass over the training set ----
    net.train()

    for batch_X, batch_y in train_loader:
        input_, label = batch_X.to(device), batch_y.to(device)
        loss = F.cross_entropy(net(input_), label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # ---- Accuracy on both splits, without tracking gradients ----
    net.eval()

    train_right, train_size = 0, 0
    test_right, test_size = 0, 0
    with torch.no_grad():
        for batch_X, batch_y in train_loader:
            input_, label = batch_X.to(device), batch_y.to(device)
            pred: torch.Tensor = net(input_).argmax(-1)
            train_right += pred.eq(label).sum().item()
            train_size += batch_X.shape[0]

        for batch_X, batch_y in test_loader:
            input_, label = batch_X.to(device), batch_y.to(device)
            pred = net(input_).argmax(-1)
            test_right += pred.eq(label).sum().item()
            test_size += batch_X.shape[0]

    train_acc, test_acc = train_right / train_size, test_right / test_size
    # LOSS is the loss of the last training batch of this epoch.
    bar.set_postfix(TEST_ACC="{:.4f}".format(test_acc),
                    TRAIN_ACC="{:.4f}".format(train_acc),
                    LOSS="{:.6f}".format(loss.item()))
190 | 


--------------------------------------------------------------------------------
/examples/pydynet/mnist.py:
--------------------------------------------------------------------------------
  1 | import gzip, argparse
  2 | from os.path import join
  3 | from tqdm import tqdm
  4 | 
  5 | import numpy as np
  6 | import pydynet as pdn
  7 | from pydynet import nn
  8 | import pydynet.nn.functional as F
  9 | from pydynet.optim import Adam
 10 | from pydynet.data import data_loader
 11 | 
 12 | 
 13 | class MNISTDataset:
 14 | 
 15 |     def __init__(self, root) -> None:
 16 |         self.root = root
 17 |         self.train_images_path = join(root, 'train-images-idx3-ubyte.gz')
 18 |         self.train_labels_path = join(root, 'train-labels-idx1-ubyte.gz')
 19 |         self.test_images_path = join(root, 't10k-images-idx3-ubyte.gz')
 20 |         self.test_labels_path = join(root, 't10k-labels-idx1-ubyte.gz')
 21 | 
 22 |     def load_train(self):
 23 |         return (
 24 |             MNISTDataset.load_mnist_images(self.train_images_path),
 25 |             MNISTDataset.load_mnist_labels(self.train_labels_path),
 26 |         )
 27 | 
 28 |     def load_test(self):
 29 |         return (
 30 |             MNISTDataset.load_mnist_images(self.test_images_path),
 31 |             MNISTDataset.load_mnist_labels(self.test_labels_path),
 32 |         )
 33 | 
 34 |     @staticmethod
 35 |     def load_mnist_images(file_path):
 36 |         with gzip.open(file_path, 'r') as f:
 37 |             # Skip the magic number and dimensions (4 bytes magic number + 4 bytes each for dimensions)
 38 |             f.read(16)
 39 |             # Read the rest of the file
 40 |             buffer = f.read()
 41 |             data = np.frombuffer(buffer, dtype=np.uint8)
 42 |             # Normalize the data to be in the range [0, 1]
 43 |             data = data / 255.0
 44 |             # Reshape the data to be in the shape (number_of_images, 28, 28)
 45 |             data = data.reshape(-1, 1, 28, 28)
 46 |             return pdn.Tensor(data).astype(DTYPE)
 47 | 
 48 |     @staticmethod
 49 |     def load_mnist_labels(file_path):
 50 |         with gzip.open(file_path, 'r') as f:
 51 |             # Skip the magic number and number of items (4 bytes magic number + 4 bytes number of items)
 52 |             f.read(8)
 53 |             # Read the rest of the file
 54 |             buffer = f.read()
 55 |             labels = np.frombuffer(buffer, dtype=np.uint8)
 56 |             return pdn.Tensor(labels, dtype=int)
 57 | 
 58 | 
 59 | class Flatten(nn.Module):
 60 | 
 61 |     def forward(self, x):  # for batch only
 62 |         return x.reshape(x.shape[0], -1)
 63 | 
 64 | 
 65 | class MLP(nn.Module):
 66 | 
 67 |     def __init__(self) -> None:
 68 |         super().__init__()
 69 |         self.layer1 = nn.Sequential(
 70 |             Flatten(),
 71 |             nn.Linear(28 * 28, 1024, dtype=DTYPE),
 72 |         )
 73 |         self.layer2 = nn.Linear(1024, 1024, dtype=DTYPE)
 74 |         self.layer3 = nn.Linear(1024, 10, dtype=DTYPE)
 75 | 
 76 |     def forward(self, x):
 77 |         z1 = F.relu(self.layer1(x))
 78 |         z2 = F.relu(self.layer2(z1))
 79 |         return self.layer3(z2)
 80 | 
 81 | 
 82 | class ConvNet(nn.Module):
 83 | 
 84 |     def __init__(self):
 85 |         super().__init__()
 86 |         self.conv1 = nn.Conv2d(1, 20, 3, 1, 1, dtype=DTYPE)
 87 |         self.conv2 = nn.Conv2d(20, 50, 3, 1, 1, dtype=DTYPE)
 88 |         self.fc1 = nn.Linear(7 * 7 * 50, 500, dtype=DTYPE)
 89 |         self.fc2 = nn.Linear(500, 10, dtype=DTYPE)
 90 | 
 91 |     def forward(self, x):
 92 |         x = F.relu(self.conv1(x))
 93 |         x = F.max_pool2d(x, 2, 2)
 94 |         x = F.relu(self.conv2(x))
 95 |         x = F.max_pool2d(x, 2, 2)
 96 |         x = x.reshape(-1, 7 * 7 * 50)
 97 |         x = F.relu(self.fc1(x))
 98 |         return self.fc2(x)
 99 | 
100 | 
# ---- Command-line options ---------------------------------------------------
parser = argparse.ArgumentParser()
parser.add_argument("--network",
                    help="Network structure",
                    choices=['mlp', 'conv'],
                    default='conv')
parser.add_argument('--batch-size',
                    type=int,
                    default=256,
                    help='input batch size for training (default: 256)')
parser.add_argument('--test-batch-size',
                    type=int,
                    default=1024,
                    metavar='N',
                    help='input batch size for testing (default: 1024)')
parser.add_argument('--epochs',
                    type=int,
                    default=20,
                    help='number of epochs to train (default: 20)')
parser.add_argument('--lr',
                    type=float,
                    default=1e-4,
                    help='learning rate (default: 1e-4)')
parser.add_argument('--no-cuda',
                    action='store_true',
                    default=False,
                    help='disables CUDA training')
parser.add_argument('--seed',
                    type=int,
                    default=42,
                    help='random seed (default: 42)')
args = parser.parse_args()

# DTYPE is read by MNISTDataset and the network classes when they are called
# below, so it must be bound before any of them is instantiated.
DTYPE = np.float32
np.random.seed(args.seed)
# Use the highest-numbered CUDA device unless CUDA is missing or disabled.
device = f'cuda:{pdn.cuda.device_count() - 1}' if pdn.cuda.is_available(
) and not args.no_cuda else 'cpu'

net = {'mlp': MLP(), 'conv': ConvNet()}.get(args.network).to(device)
print(net)

optimizer = Adam(net.parameters(), lr=args.lr)

dataset = MNISTDataset(r'./examples/data/MNIST/raw')
train_loader = data_loader(
    *dataset.load_train(),
    shuffle=True,
    batch_size=args.batch_size,
)
test_loader = data_loader(
    *dataset.load_test(),
    shuffle=False,
    batch_size=args.test_batch_size,
)

bar = tqdm(range(args.epochs))
info_list = []
for epoch in bar:

    # ---- One optimization pass over the training set ----
    net.train()

    for batch_X, batch_y in train_loader:
        input_, label = batch_X.to(device), batch_y.to(device)
        loss = F.cross_entropy_loss(net(input_), label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # ---- Accuracy on both splits, without tracking gradients ----
    net.eval()

    train_right, train_size = 0, 0
    test_right, test_size = 0, 0
    with pdn.no_grad():
        for batch_X, batch_y in train_loader:
            input_, label = batch_X.to(device), batch_y.to(device)
            pred: pdn.Tensor = net(input_).argmax(-1)
            train_right += pred.eq(label).sum().item()
            train_size += batch_X.shape[0]

        for batch_X, batch_y in test_loader:
            input_, label = batch_X.to(device), batch_y.to(device)
            pred = net(input_).argmax(-1)
            test_right += pred.eq(label).sum().item()
            test_size += batch_X.shape[0]

    train_acc, test_acc = train_right / train_size, test_right / test_size
    # LOSS is the loss of the last training batch of this epoch.
    bar.set_postfix(TEST_ACC="{:.4f}".format(test_acc),
                    TRAIN_ACC="{:.4f}".format(train_acc),
                    LOSS="{:.6f}".format(loss.item()))


--------------------------------------------------------------------------------
/llm/clip/model.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | 
  3 | import numpy as np
  4 | import pydynet as pdn
  5 | from pydynet import nn
  6 | import pydynet.nn.functional as F
  7 | 
  8 | 
  9 | def build_attention_mask(context_length: int):
 10 |     mask = np.full((context_length, context_length),
 11 |                    fill_value=-np.inf,
 12 |                    dtype=np.float32)
 13 |     mask = np.triu(mask, 1)
 14 |     return pdn.Tensor(mask, dtype=np.float32)
 15 | 
 16 | 
 17 | def patch_project(x: pdn.Tensor, kernel: pdn.Tensor):
 18 |     # Decompose images into 32x32 patches and multiply all patches by matrix.
 19 | 
 20 |     n, c, h, w = x.shape
 21 |     d, pc, ph, pw = kernel.shape
 22 |     p = pc * ph * pw
 23 |     gh = h // ph
 24 |     gw = w // pw
 25 | 
 26 |     assert c == pc and h % ph == 0 and w % pw == 0
 27 | 
 28 |     W = kernel.transpose(1, 2, 3, 0).reshape(p, d)
 29 |     x = x.reshape(n, c, gh, ph, gw, pw).transpose(0, 2, 4, 1, 3,
 30 |                                                   5).reshape(n, gh, gw, p)
 31 |     x = x @ W
 32 |     return x.reshape(n, gh * gw, d)
 33 | 
 34 | 
 35 | class MultiHeadAttention(nn.Module):
 36 | 
 37 |     def __init__(self, n_dim: int, n_heads: int):
 38 |         super().__init__()
 39 |         self.n_dim = n_dim
 40 |         self.n_heads = n_heads
 41 |         self.head_dim = n_dim // n_heads
 42 | 
 43 |         self.QKV = nn.Linear(self.n_dim, self.n_dim * 3, dtype=np.float32)
 44 |         self.O = nn.Linear(self.n_dim, self.n_dim, dtype=np.float32)
 45 | 
 46 |     def forward(self, x, mask):
 47 |         B, L, _ = x.shape
 48 |         xq, xk, xv = pdn.split(self.QKV(x), 3, -1)
 49 |         xq = xq.reshape(B, L, self.n_heads, self.head_dim)
 50 |         xk = xk.reshape(B, L, self.n_heads, self.head_dim)
 51 |         xv = xv.reshape(B, L, self.n_heads, self.head_dim)
 52 | 
 53 |         xq, xkT = xq.transpose(0, 2, 1, 3), xk.transpose(0, 2, 3, 1)
 54 |         attention = xq @ xkT / math.sqrt(self.head_dim)
 55 | 
 56 |         if mask is not None:
 57 |             attention = attention + mask
 58 | 
 59 |         attention = F.softmax(attention, axis=-1)
 60 |         output = attention @ xv.transpose(0, 2, 1, 3)
 61 | 
 62 |         output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
 63 |         return self.O(output)
 64 | 
 65 | 
 66 | class CLIPLayerNorm(nn.LayerNorm):
 67 | 
 68 |     def __init__(self,
 69 |                  normalized_shape,
 70 |                  eps=0.000001,
 71 |                  momentum=0.1,
 72 |                  device=None,
 73 |                  dtype=None):
 74 |         super().__init__(normalized_shape, eps, momentum, device, dtype)
 75 | 
 76 |     def forward(self, x):
 77 |         mean = x.mean(axis=-1, keepdims=True)
 78 |         var = pdn.square(x - mean).mean(axis=-1, keepdims=True)
 79 |         x = (x - mean) / pdn.sqrt(var + self.eps) * self.scale + self.shift
 80 |         return x
 81 | 
 82 | 
 83 | class MLP(nn.Module):
 84 | 
 85 |     def __init__(self, d_in: int, d_proj: int):
 86 |         super().__init__()
 87 |         self.d_in = d_in
 88 |         self.d_proj = d_proj
 89 |         self.fc1 = nn.Linear(d_in, d_proj, dtype=np.float32)
 90 |         self.fc2 = nn.Linear(d_proj, d_in, dtype=np.float32)
 91 | 
 92 |     def forward(self, x):
 93 |         x = self.fc1(x)
 94 |         x = x * pdn.sigmoid(1.702 * x)
 95 |         return self.fc2(x)
 96 | 
 97 | 
 98 | class Transformer(nn.Module):
 99 | 
100 |     def __init__(self, n_dim: int, n_head: int, mlp_dim: int):
101 |         super().__init__()
102 |         self.mha = MultiHeadAttention(n_dim, n_head)
103 |         self.mlp = MLP(n_dim, mlp_dim)
104 |         self.layer_norm1 = CLIPLayerNorm((n_dim, ), eps=1e-5, dtype=np.float32)
105 |         self.layer_norm2 = CLIPLayerNorm((n_dim, ), eps=1e-5, dtype=np.float32)
106 | 
107 |     def forward(self, x, mask):
108 |         x = x + self.mha(self.layer_norm1(x), mask)
109 |         x = x + self.mlp(self.layer_norm2(x))
110 |         return x
111 | 
112 | 
class ImageEncoder(nn.Module):
    """Vision transformer that maps images to ``final_dim`` feature vectors."""

    def __init__(self, n_dim, n_head, mlp_dim, kernel_size, n_layer,
                 final_dim):
        """Create the patch kernel, ``n_layer`` blocks, norms and projection.

        The patch kernel has 3 input channels and square patches of side
        ``kernel_size``.
        """
        super().__init__()
        self.kernel = nn.Parameter(
            pdn.randn(n_dim, 3, kernel_size, kernel_size, dtype=np.float32))

        self.pre_norm = CLIPLayerNorm((n_dim, ), 1e-5, dtype=np.float32)
        self.transformers: list[Transformer] = nn.ModuleList(
            [Transformer(n_dim, n_head, mlp_dim) for _ in range(n_layer)])

        self.post_norm = CLIPLayerNorm((n_dim, ), 1e-5, dtype=np.float32)
        self.proj = nn.Linear(n_dim, final_dim, bias=False, dtype=np.float32)

    def forward(self, x, class_emb, position_emb):
        """Encode images ``x`` into one feature vector each.

        Args:
            x: Images accepted by ``patch_project`` with ``self.kernel``.
            class_emb: Embedding placed in front of the patch sequence.
                NOTE(review): it is concatenated without explicit
                broadcasting, so batch sizes above 1 may not work -- confirm.
            position_emb: Added to the sequence after concatenation.
        """
        patches = patch_project(x, self.kernel)
        tokens = pdn.concat([class_emb, patches], axis=-2) + position_emb

        tokens = self.pre_norm(tokens)
        for block in self.transformers:
            # No mask: every position may attend to every other.
            tokens = block(tokens, None)

        # Only the first position (the class embedding slot) is projected.
        class_token = self.post_norm(tokens[:, 0])
        return self.proj(class_token)
138 | 
139 | 
class TextEncoder(nn.Module):
    """Causal transformer that maps token ids to ``final_dim`` features."""

    def __init__(self, n_dim, n_head, mlp_dim, n_layer, final_dim, vocab_size):
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, n_dim, dtype=np.float32)
        self.transformers: list[Transformer] = nn.ModuleList(
            [Transformer(n_dim, n_head, mlp_dim) for _ in range(n_layer)])

        self.post_norm = CLIPLayerNorm((n_dim, ), 1e-5, dtype=np.float32)
        self.proj = nn.Linear(n_dim, final_dim, bias=False, dtype=np.float32)

    def forward(self, idx, position_emb):
        """Encode token id sequences into one feature vector each.

        Args:
            idx: Token ids; the last axis is the sequence axis.
            position_emb: Added to the token embeddings.
        """
        hidden = self.token_embed(idx) + position_emb
        causal_mask = build_attention_mask(hidden.shape[1])

        for block in self.transformers:
            hidden = block(hidden, causal_mask)

        hidden = self.post_norm(hidden)

        # For each sequence take the position holding the largest token id
        # (presumably the end-of-text token -- confirm against the tokenizer).
        rows = np.arange(hidden.shape[0])
        cols = hidden.xp.argmax(idx, axis=-1)
        return self.proj(hidden[rows, cols])
161 | 
162 | 
class CLIP(nn.Module):
    """Image/text model that returns scaled cosine similarities."""

    def __init__(self):
        """Create both encoders and their embeddings with fixed sizes."""
        super().__init__()
        self.class_embed = nn.Parameter(pdn.randn(1, 1, 768, dtype=np.float32))
        self.v_pos_emb = nn.Parameter(pdn.randn(50, 768, dtype=np.float32))
        self.t_pos_emb = nn.Parameter(pdn.randn(77, 512, dtype=np.float32))
        self.image_encoder = ImageEncoder(768, 12, 3072, 32, 12, 512)
        self.text_encoder = TextEncoder(512, 8, 2048, 12, 512, 49408)
        # Multiplier for the similarities; plain attribute, initially 1.
        self.scale = 1

    def forward(self, img, idx):
        """Return the image-by-text matrix of scaled cosine similarities.

        Args:
            img: Images for ``self.image_encoder``.
            idx: Token ids for ``self.text_encoder``.
        """
        img_feature = self.image_encoder(img, self.class_embed, self.v_pos_emb)
        txt_feature = self.text_encoder(idx, self.t_pos_emb)

        # Divide each feature row by its Euclidean norm.
        img_feature = img_feature / pdn.sqrt(
            pdn.square(img_feature).sum(1, keepdims=True))
        txt_feature = txt_feature / pdn.sqrt(
            pdn.square(txt_feature).sum(1, keepdims=True))

        return self.scale * img_feature @ txt_feature.T
185 | 


--------------------------------------------------------------------------------
/examples/pytorch/dropout_bn.py:
--------------------------------------------------------------------------------
  1 | import matplotlib.pyplot as plt
  2 | from sklearn.datasets import fetch_olivetti_faces
  3 | from sklearn.preprocessing import MinMaxScaler
  4 | from sklearn.model_selection import train_test_split
  5 | from tqdm import tqdm
  6 | 
  7 | import numpy as np
  8 | import torch
  9 | import torch.nn.functional as F
 10 | import torch.nn as nn
 11 | from torch.optim import Adam
 12 | from pydynet.data import data_loader
 13 | 
 14 | data_X, data_y = fetch_olivetti_faces(return_X_y=True)
 15 | print(data_X.shape)
 16 | train_X, test_X, train_y, test_y = train_test_split(
 17 |     data_X,
 18 |     data_y,
 19 |     train_size=0.8,
 20 |     stratify=data_y,
 21 |     random_state=42,
 22 | )
 23 | scaler = MinMaxScaler()
 24 | train_X = scaler.fit_transform(train_X)
 25 | test_X = scaler.transform(test_X)
 26 | 
 27 | 
 28 | class DNN(nn.Module):
 29 | 
 30 |     def __init__(self) -> None:
 31 |         super().__init__()
 32 |         self.fc1 = nn.Linear(4096, 512)
 33 |         self.fc2 = nn.Linear(512, 128)
 34 |         self.fc3 = nn.Linear(128, 40)
 35 | 
 36 |     def forward(self, x):
 37 |         x = F.relu(self.fc1(x))
 38 |         x = F.relu(self.fc2(x))
 39 |         return self.fc3(x)
 40 | 
 41 | 
 42 | class DNN_dropout(DNN):
 43 | 
 44 |     def __init__(self) -> None:
 45 |         super().__init__()
 46 |         self.dropout = nn.Dropout(p=0.05)
 47 | 
 48 |     def forward(self, x):
 49 |         x = F.relu(self.dropout(self.fc1(x)))
 50 |         x = F.relu(self.dropout(self.fc2(x)))
 51 |         return self.fc3(x)
 52 | 
 53 | 
 54 | class DNN_BN(DNN):
 55 | 
 56 |     def __init__(self) -> None:
 57 |         super().__init__()
 58 |         self.bn1 = nn.BatchNorm1d(512)
 59 |         self.bn2 = nn.BatchNorm1d(128)
 60 | 
 61 |     def forward(self, x):
 62 |         x = F.relu(self.bn1(self.fc1(x)))
 63 |         x = F.relu(self.bn2(self.fc2(x)))
 64 |         return self.fc3(x)
 65 | 
 66 | 
 67 | np.random.seed(42)
 68 | use_cuda = True
 69 | device = f'cuda:{torch.cuda.device_count() - 1}' if torch.cuda.is_available(
 70 | ) else 'cpu'
 71 | 
 72 | net1 = DNN().to(device)
 73 | net2 = DNN_dropout().to(device)
 74 | net3 = DNN_BN().to(device)
 75 | print(net1)
 76 | print(net2)
 77 | print(net3)
 78 | optim1 = Adam(net1.parameters(), lr=5e-5)
 79 | optim2 = Adam(net2.parameters(), lr=5e-5)
 80 | optim3 = Adam(net3.parameters(), lr=5e-5)
 81 | loss = nn.CrossEntropyLoss()
 82 | EPOCHES = 50
 83 | BATCH_SIZE = 40
 84 | 
 85 | train_loader = data_loader(torch.tensor(train_X), torch.tensor(train_y),
 86 |                            BATCH_SIZE, True)
 87 | 
 88 | train_accs, test_accs = [], []
 89 | test_X_cuda = torch.tensor(test_X, device=device)
 90 | test_y_cuda = torch.tensor(test_y, device=device)
 91 | 
 92 | bar = tqdm(range(EPOCHES))
 93 | 
 94 | for epoch in bar:
 95 |     # 相同数据训练3个网络
 96 |     net1.train()
 97 |     net2.train()
 98 |     net3.train()
 99 | 
100 |     for batch_X, batch_y in train_loader:
101 |         input_, label = batch_X.to(device), batch_y.to(device)
102 | 
103 |         output1 = net1(input_)
104 |         l1 = loss(output1, label)
105 |         output2 = net2(input_)
106 |         l2 = loss(output2, label)
107 |         output3 = net3(input_)
108 |         l3 = loss(output3, label)
109 | 
110 |         optim1.zero_grad()
111 |         optim2.zero_grad()
112 |         optim3.zero_grad()
113 |         (l1 + l2 + l3).backward()
114 |         optim1.step()
115 |         optim2.step()
116 |         optim3.step()
117 | 
118 |     net1.eval()
119 |     net2.eval()
120 |     net3.eval()
121 | 
122 |     # train
123 |     train_right = [0, 0, 0]
124 |     with torch.no_grad():
125 |         for batch_X, batch_y in train_loader:
126 |             input_, label = batch_X.to(device), batch_y.to(device)
127 |             pred1 = net1(input_).argmax(-1)
128 |             pred2 = net2(input_).argmax(-1)
129 |             pred3 = net3(input_).argmax(-1)
130 | 
131 |             train_right[0] += pred1.eq(label).sum().item()
132 |             train_right[1] += pred2.eq(label).sum().item()
133 |             train_right[2] += pred3.eq(label).sum().item()
134 | 
135 |         train_acc = np.array(train_right) / len(train_X)
136 | 
137 |         pred1, pred2, pred3 = (
138 |             net1(test_X_cuda).argmax(-1),
139 |             net2(test_X_cuda).argmax(-1),
140 |             net3(test_X_cuda).argmax(-1),
141 |         )
142 |         test_acc = np.array([
143 |             pred1.eq(test_y_cuda).float().mean().item(),
144 |             pred2.eq(test_y_cuda).float().mean().item(),
145 |             pred3.eq(test_y_cuda).float().mean().item(),
146 |         ])
147 | 
148 |         bar.set_postfix(
149 |             TRAIN_ACC="{:.3f}, {:.3f}, {:.3f}".format(*train_acc),
150 |             TEST_ACC="{:.3f}, {:.3f}, {:.3f}".format(*test_acc),
151 |         )
152 |         train_accs.append(train_acc)
153 |         test_accs.append(test_acc)
154 | 
# Shape (EPOCHES, 3): row k holds the accuracies measured after epoch k + 1,
# columns are MLP / MLP with Dropout / MLP with BN.
train_accs = np.array(train_accs)
test_accs = np.array(test_accs)

plt.figure(figsize=(9, 3))

plt.rcParams.update({
    'font.sans-serif': ['Times New Roman'],
    'mathtext.fontset': 'stix',
    'xtick.direction': 'in',
    'ytick.direction': 'in',
    'axes.linewidth': 0.5,
})

plot_kwargs = {'linewidth': 0.7, 'zorder': 10}

# Every second epoch is plotted: epochs 2, 4, ..., which are rows 1, 3, ...
# of the accuracy arrays. (The previous code paired these x positions with
# rows 0, 2, ..., so every point was drawn one epoch too late.)
x = np.arange(2, EPOCHES + 1, 2)
series = (
    ("MLP", 'blue', '^'),
    ("MLP with Dropout", 'green', 's'),
    ("MLP with BN", 'red', '*'),
)


def _plot_accuracy(position, accs, title, y_max):
    """Draw the three accuracy curves of ``accs`` into subplot ``position``."""
    plt.subplot(1, 2, position)
    plt.grid(zorder=-10)

    plt.xlim(2, EPOCHES)
    plt.ylim(0, y_max)

    for column, (label, color, marker) in enumerate(series):
        plt.plot(x,
                 accs[1::2, column],
                 label=label,
                 color=color,
                 marker=marker,
                 **plot_kwargs)

    plt.yticks([0, .2, .4, .6, .8, 1], size=13)
    plt.xticks(list(range(10, EPOCHES + 1, 10)), size=13)
    plt.xlabel("Epochs", size=13)
    plt.title(title)
    plt.legend()
    plt.tight_layout()


_plot_accuracy(1, train_accs, "Training Accuracy on Olivetti Faces Dataset",
               1.05)
_plot_accuracy(2, test_accs, "Test Accuracy on Olivetti Faces Dataset", 1.)

# The path is relative to the current working directory.
plt.savefig("imgs/dropout_bn.png")
234 | 


--------------------------------------------------------------------------------
/examples/pydynet/dropout_bn.py:
--------------------------------------------------------------------------------
  1 | import matplotlib.pyplot as plt
  2 | from sklearn.datasets import fetch_olivetti_faces
  3 | from sklearn.preprocessing import MinMaxScaler
  4 | from sklearn.model_selection import train_test_split
  5 | from tqdm import tqdm
  6 | 
  7 | import numpy as np
  8 | import pydynet as pdn
  9 | import pydynet.nn.functional as F
 10 | import pydynet.nn as nn
 11 | from pydynet.optim import Adam
 12 | from pydynet.data import data_loader
 13 | 
 14 | data_X, data_y = fetch_olivetti_faces(return_X_y=True)
 15 | print(data_X.shape)
 16 | train_X, test_X, train_y, test_y = train_test_split(
 17 |     data_X,
 18 |     data_y,
 19 |     train_size=0.8,
 20 |     stratify=data_y,
 21 |     random_state=42,
 22 | )
 23 | scaler = MinMaxScaler()
 24 | train_X = scaler.fit_transform(train_X)
 25 | test_X = scaler.transform(test_X)
 26 | 
 27 | 
 28 | class DNN(nn.Module):
 29 | 
 30 |     def __init__(self) -> None:
 31 |         super().__init__()
 32 |         self.fc1 = nn.Linear(4096, 512, dtype=np.float32)
 33 |         self.fc2 = nn.Linear(512, 128, dtype=np.float32)
 34 |         self.fc3 = nn.Linear(128, 40, dtype=np.float32)
 35 | 
 36 |     def forward(self, x):
 37 |         x = F.relu(self.fc1(x))
 38 |         x = F.relu(self.fc2(x))
 39 |         return self.fc3(x)
 40 | 
 41 | 
 42 | class DNN_dropout(DNN):
 43 | 
 44 |     def __init__(self) -> None:
 45 |         super().__init__()
 46 |         self.dropout = nn.Dropout(p=0.05)
 47 | 
 48 |     def forward(self, x):
 49 |         x = F.relu(self.dropout(self.fc1(x)))
 50 |         x = F.relu(self.dropout(self.fc2(x)))
 51 |         return self.fc3(x)
 52 | 
 53 | 
 54 | class DNN_BN(DNN):
 55 | 
 56 |     def __init__(self) -> None:
 57 |         super().__init__()
 58 |         self.bn1 = nn.BatchNorm1d(512, dtype=np.float32)
 59 |         self.bn2 = nn.BatchNorm1d(128, dtype=np.float32)
 60 | 
 61 |     def forward(self, x):
 62 |         x = F.relu(self.bn1(self.fc1(x)))
 63 |         x = F.relu(self.bn2(self.fc2(x)))
 64 |         return self.fc3(x)
 65 | 
 66 | 
 67 | np.random.seed(42)
 68 | use_cuda = True
 69 | device = f'cuda:{pdn.cuda.device_count() - 1}' if pdn.cuda.is_available(
 70 | ) else 'cpu'
 71 | 
 72 | net1 = DNN().to(device)
 73 | net2 = DNN_dropout().to(device)
 74 | net3 = DNN_BN().to(device)
 75 | print(net1)
 76 | print(net2)
 77 | print(net3)
 78 | optim1 = Adam(net1.parameters(), lr=5e-5)
 79 | optim2 = Adam(net2.parameters(), lr=5e-5)
 80 | optim3 = Adam(net3.parameters(), lr=5e-5)
 81 | loss = nn.CrossEntropyLoss()
 82 | EPOCHES = 50
 83 | BATCH_SIZE = 40
 84 | 
 85 | train_loader = data_loader(pdn.Tensor(train_X), pdn.Tensor(train_y),
 86 |                            BATCH_SIZE, True)
 87 | 
 88 | train_accs, test_accs = [], []
 89 | test_X_cuda = pdn.Tensor(test_X, device=device)
 90 | test_y_cuda = pdn.Tensor(test_y, device=device)
 91 | 
 92 | bar = tqdm(range(EPOCHES))
 93 | 
 94 | for epoch in bar:
 95 |     # 相同数据训练3个网络
 96 |     net1.train()
 97 |     net2.train()
 98 |     net3.train()
 99 | 
100 |     for batch_X, batch_y in train_loader:
101 |         input_, label = batch_X.to(device), batch_y.to(device)
102 | 
103 |         output1 = net1(input_)
104 |         l1 = loss(output1, label)
105 |         output2 = net2(input_)
106 |         l2 = loss(output2, label)
107 |         output3 = net3(input_)
108 |         l3 = loss(output3, label)
109 | 
110 |         optim1.zero_grad()
111 |         optim2.zero_grad()
112 |         optim3.zero_grad()
113 |         (l1 + l2 + l3).backward()
114 |         optim1.step()
115 |         optim2.step()
116 |         optim3.step()
117 | 
118 |     net1.eval()
119 |     net2.eval()
120 |     net3.eval()
121 | 
122 |     # train
123 |     train_right = [0, 0, 0]
124 |     with pdn.no_grad():
125 |         for batch_X, batch_y in train_loader:
126 |             input_, label = batch_X.to(device), batch_y.to(device)
127 |             pred1 = net1(input_).argmax(-1)
128 |             pred2 = net2(input_).argmax(-1)
129 |             pred3 = net3(input_).argmax(-1)
130 | 
131 |             train_right[0] += pred1.eq(label).sum().item()
132 |             train_right[1] += pred2.eq(label).sum().item()
133 |             train_right[2] += pred3.eq(label).sum().item()
134 | 
135 |         train_acc = np.array(train_right) / len(train_X)
136 | 
137 |         pred1, pred2, pred3 = (
138 |             net1(test_X_cuda).argmax(-1),
139 |             net2(test_X_cuda).argmax(-1),
140 |             net3(test_X_cuda).argmax(-1),
141 |         )
142 |         test_acc = np.array([
143 |             pred1.eq(test_y_cuda.data).mean().item(),
144 |             pred2.eq(test_y_cuda.data).mean().item(),
145 |             pred3.eq(test_y_cuda.data).mean().item(),
146 |         ])
147 | 
148 |         bar.set_postfix(
149 |             TRAIN_ACC="{:.3f}, {:.3f}, {:.3f}".format(*train_acc),
150 |             TEST_ACC="{:.3f}, {:.3f}, {:.3f}".format(*test_acc),
151 |         )
152 |         train_accs.append(train_acc)
153 |         test_accs.append(test_acc)
154 | 
# Shape (EPOCHES, 3): row k holds the accuracies measured after epoch k + 1,
# columns are MLP / MLP with Dropout / MLP with BN.
train_accs = np.array(train_accs)
test_accs = np.array(test_accs)

plt.figure(figsize=(9, 3))

plt.rcParams.update({
    'font.sans-serif': ['Times New Roman'],
    'mathtext.fontset': 'stix',
    'xtick.direction': 'in',
    'ytick.direction': 'in',
    'axes.linewidth': 0.5,
})

plot_kwargs = {'linewidth': 0.7, 'zorder': 10}

# Every second epoch is plotted: epochs 2, 4, ..., which are rows 1, 3, ...
# of the accuracy arrays. (The previous code paired these x positions with
# rows 0, 2, ..., so every point was drawn one epoch too late.)
x = np.arange(2, EPOCHES + 1, 2)
series = (
    ("MLP", 'blue', '^'),
    ("MLP with Dropout", 'green', 's'),
    ("MLP with BN", 'red', '*'),
)


def _plot_accuracy(position, accs, title, y_max):
    """Draw the three accuracy curves of ``accs`` into subplot ``position``."""
    plt.subplot(1, 2, position)
    plt.grid(zorder=-10)

    plt.xlim(2, EPOCHES)
    plt.ylim(0, y_max)

    for column, (label, color, marker) in enumerate(series):
        plt.plot(x,
                 accs[1::2, column],
                 label=label,
                 color=color,
                 marker=marker,
                 **plot_kwargs)

    plt.yticks([0, .2, .4, .6, .8, 1], size=13)
    plt.xticks(list(range(10, EPOCHES + 1, 10)), size=13)
    plt.xlabel("Epochs", size=13)
    plt.title(title)
    plt.legend()
    plt.tight_layout()


_plot_accuracy(1, train_accs, "Training Accuracy on Olivetti Faces Dataset",
               1.05)
_plot_accuracy(2, test_accs, "Test Accuracy on Olivetti Faces Dataset", 1.)

# The path is relative to the current working directory.
plt.savefig("imgs/dropout_bn.png")
234 | 


--------------------------------------------------------------------------------
/pydynet/core/function.py:
--------------------------------------------------------------------------------
  1 | from .tensor import Tensor, swapaxes
  2 | 
  3 | 
def sqrt(x: Tensor):
    '''Square-root function: returns ``x`` raised to the power 0.5.'''
    return x**0.5
  7 | 
  8 | 
def square(x: Tensor):
    '''Square function: returns ``x`` multiplied by itself.'''
    return x * x
 12 | 
 13 | 
 14 | def vsplit(x: Tensor, indices_or_sections: int | tuple) -> list[Tensor]:
 15 |     if not isinstance(x, Tensor):
 16 |         x = Tensor(x)
 17 | 
 18 |     try:
 19 |         len(indices_or_sections)
 20 |     except TypeError:
 21 |         sections = indices_or_sections
 22 |         N = x.shape[0]
 23 |         assert N % sections == 0, 'array split does not result in an equal division'
 24 | 
 25 |     Ntotal = x.shape[0]
 26 |     try:
 27 |         # handle array case.
 28 |         Nsections = len(indices_or_sections) + 1
 29 |         div_points = [0] + list(indices_or_sections) + [Ntotal]
 30 |     except TypeError:
 31 |         # indices_or_sections is a scalar, not an array.
 32 |         Nsections = int(indices_or_sections)
 33 |         if Nsections <= 0:
 34 |             raise ValueError(
 35 |                 'number sections must be larger than 0.') from None
 36 |         Neach_section, extras = divmod(Ntotal, Nsections)
 37 |         section_sizes = ([0] + extras * [Neach_section + 1] +
 38 |                          (Nsections - extras) * [Neach_section])
 39 |         div_points = x.xp.array(section_sizes, dtype=x.xp.intp).cumsum()
 40 | 
 41 |     sub_tensors = []
 42 |     for i in range(Nsections):
 43 |         st = div_points[i]
 44 |         end = div_points[i + 1]
 45 |         sub_tensors.append(x[st:end])
 46 | 
 47 |     return sub_tensors
 48 | 
 49 | 
 50 | def hsplit(x: Tensor, indices_or_sections: int | tuple) -> list[Tensor]:
 51 |     if not isinstance(x, Tensor):
 52 |         x = Tensor(x)
 53 | 
 54 |     try:
 55 |         len(indices_or_sections)
 56 |     except TypeError:
 57 |         sections = indices_or_sections
 58 |         N = x.shape[1]
 59 |         assert N % sections == 0, 'array split does not result in an equal division'
 60 | 
 61 |     Ntotal = x.shape[1]
 62 |     try:
 63 |         # handle array case.
 64 |         Nsections = len(indices_or_sections) + 1
 65 |         div_points = [0] + list(indices_or_sections) + [Ntotal]
 66 |     except TypeError:
 67 |         # indices_or_sections is a scalar, not an array.
 68 |         Nsections = int(indices_or_sections)
 69 |         if Nsections <= 0:
 70 |             raise ValueError(
 71 |                 'number sections must be larger than 0.') from None
 72 |         Neach_section, extras = divmod(Ntotal, Nsections)
 73 |         section_sizes = ([0] + extras * [Neach_section + 1] +
 74 |                          (Nsections - extras) * [Neach_section])
 75 |         div_points = x.xp.array(section_sizes, dtype=x.xp.intp).cumsum()
 76 | 
 77 |     sub_tensors = []
 78 |     for i in range(Nsections):
 79 |         st = div_points[i]
 80 |         end = div_points[i + 1]
 81 |         sub_tensors.append(x[:, st:end])
 82 | 
 83 |     return sub_tensors
 84 | 
 85 | 
 86 | def dsplit(x: Tensor, indices_or_sections: int | tuple) -> list[Tensor]:
 87 |     if not isinstance(x, Tensor):
 88 |         x = Tensor(x)
 89 | 
 90 |     try:
 91 |         len(indices_or_sections)
 92 |     except TypeError:
 93 |         sections = indices_or_sections
 94 |         N = x.shape[2]
 95 |         assert N % sections == 0, 'array split does not result in an equal division'
 96 | 
 97 |     Ntotal = x.shape[2]
 98 |     try:
 99 |         # handle array case.
100 |         Nsections = len(indices_or_sections) + 1
101 |         div_points = [0] + list(indices_or_sections) + [Ntotal]
102 |     except TypeError:
103 |         # indices_or_sections is a scalar, not an array.
104 |         Nsections = int(indices_or_sections)
105 |         if Nsections <= 0:
106 |             raise ValueError(
107 |                 'number sections must be larger than 0.') from None
108 |         Neach_section, extras = divmod(Ntotal, Nsections)
109 |         section_sizes = ([0] + extras * [Neach_section + 1] +
110 |                          (Nsections - extras) * [Neach_section])
111 |         div_points = x.xp.array(section_sizes, dtype=x.xp.intp).cumsum()
112 | 
113 |     sub_tensors = []
114 |     for i in range(Nsections):
115 |         st = div_points[i]
116 |         end = div_points[i + 1]
117 |         sub_tensors.append(x[:, :, st:end])
118 | 
119 |     return sub_tensors
120 | 
121 | 
122 | def split(
123 |     x: Tensor,
124 |     indices_or_sections: int | tuple,
125 |     axis: int = 0,
126 | ) -> list[Tensor]:
127 |     if not isinstance(x, Tensor):
128 |         x = Tensor(x)
129 | 
130 |     if axis == 0 or axis == -x.ndim:
131 |         return vsplit(x, indices_or_sections)
132 |     elif axis == 1 or axis == -x.ndim + 1:
133 |         return hsplit(x, indices_or_sections)
134 |     elif axis == 2 or axis == -x.ndim + 2:
135 |         return dsplit(x, indices_or_sections)
136 | 
137 |     try:
138 |         len(indices_or_sections)
139 |     except TypeError:
140 |         sections = indices_or_sections
141 |         N = x.shape[axis]
142 |         assert N % sections == 0, 'array split does not result in an equal division'
143 | 
144 |     Ntotal = x.shape[axis]
145 |     try:
146 |         # handle array case.
147 |         Nsections = len(indices_or_sections) + 1
148 |         div_points = [0] + list(indices_or_sections) + [Ntotal]
149 |     except TypeError:
150 |         # indices_or_sections is a scalar, not an array.
151 |         Nsections = int(indices_or_sections)
152 |         if Nsections <= 0:
153 |             raise ValueError(
154 |                 'number sections must be larger than 0.') from None
155 |         Neach_section, extras = divmod(Ntotal, Nsections)
156 |         section_sizes = ([0] + extras * [Neach_section + 1] +
157 |                          (Nsections - extras) * [Neach_section])
158 |         div_points = x.xp.array(section_sizes, dtype=x.xp.intp).cumsum()
159 | 
160 |     sub_tensors = []
161 |     stensor = swapaxes(x, 0, axis)
162 |     for i in range(Nsections):
163 |         st = div_points[i]
164 |         end = div_points[i + 1]
165 |         sub_tensors.append(swapaxes(stensor[st:end], axis, 0))
166 |     return sub_tensors
167 | 
168 | 
def unsqueeze(x: Tensor, axis):
    '''
    Insert size-1 axes into ``x``; equivalent to numpy's ``expand_dims``,
    whose implementation this follows.

    Parameters
    ----------
    x : Tensor
        Input; only its ``shape``, ``ndim`` and ``reshape`` are used.
    axis : int or tuple/list of int
        Position(s) of the new axes in the *output* shape; negative values
        count from the end of the output shape.

    Returns
    -------
    The result of ``x.reshape`` with the expanded shape.
    '''
    try:
        # Public home of the helper since NumPy 2.0.
        from numpy.lib.array_utils import normalize_axis_tuple
    except ImportError:
        # NumPy 1.x only exposes it here; this path is deprecated in 2.x.
        from numpy.core.numeric import normalize_axis_tuple

    if not isinstance(axis, (tuple, list)):
        axis = (axis, )

    out_ndim = len(axis) + x.ndim
    axis = normalize_axis_tuple(axis, out_ndim)

    # Walk the output positions: new axes get size 1, all other positions
    # consume the original sizes in order.
    shape_it = iter(x.shape)
    shape = [1 if ax in axis else next(shape_it) for ax in range(out_ndim)]
    return x.reshape(*shape)
182 | 
183 | 
def squeeze(x: Tensor, axis=None):
    '''
    Remove size-1 axes from ``x``.

    Parameters
    ----------
    x : Tensor
        Input; only its ``shape`` and ``reshape`` are used.
    axis : None, int or iterable of int, default=None
        ``None`` removes every size-1 axis. Otherwise only the listed axes
        are removed; negative values count from the end.

    Returns
    -------
    The result of ``x.reshape`` with the reduced shape.

    Raises
    ------
    ValueError
        If an axis is out of range or does not have size 1.
    '''
    shape = x.shape
    if axis is None:
        new_shape = tuple(dim for dim in shape if dim != 1)
    else:
        try:
            axes = tuple(axis)
        except TypeError:
            # A single axis (Python or NumPy integer).
            axes = (axis, )

        ndim = len(shape)
        dropped = set()
        for ax in axes:
            if ax >= ndim or ax < -ndim:
                raise ValueError("Axis out of range")
            if shape[ax] != 1:
                raise ValueError(
                    f"Cannot squeeze axis {ax} with size {shape[ax]}")
            # Store the non-negative position; comparing enumerate() indices
            # with a raw negative axis would never match.
            dropped.add(ax % ndim)

        # Build the new shape without the requested axes.
        new_shape = tuple(dim for i, dim in enumerate(shape)
                          if i not in dropped)

    # Return the reshaped tensor.
    return x.reshape(*new_shape)
205 | 


--------------------------------------------------------------------------------
/llm/llama/model.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import numpy as np
  3 | 
  4 | import pydynet as pdn
  5 | from pydynet.core.tensor import Tensor
  6 | import pydynet.nn as nn
  7 | import pydynet.nn.functional as F
  8 | 
  9 | 
 10 | def compute_cos_sin_cache(head_dim: int,
 11 |                           max_seq_len: int,
 12 |                           base: int = 10000,
 13 |                           dtype=None):
 14 |     inv_freq = 1.0 / (base**(np.arange(0, head_dim, 2)[:(head_dim // 2)] /
 15 |                              head_dim))
 16 |     t = np.arange(max_seq_len)
 17 |     freqs = np.outer(t, inv_freq).astype(dtype)
 18 | 
 19 |     return Tensor(np.cos(freqs)), Tensor(np.sin(freqs))
 20 | 
 21 | 
 22 | def apply_rotary_emb(xq: Tensor, xk: Tensor, freqs_cos, freqs_sin):
 23 |     xqri = xq.reshape(*(xq.shape[:-1] + (-1, 2)))
 24 |     xkri = xk.reshape(*(xk.shape[:-1] + (-1, 2)))
 25 | 
 26 |     xq_r, xq_i = xqri[..., 0], xqri[..., 1]
 27 |     xk_r, xk_i = xkri[..., 0], xkri[..., 1]
 28 | 
 29 |     freqs_cos = pdn.unsqueeze(freqs_cos, axis=-2)
 30 |     freqs_sin = pdn.unsqueeze(freqs_sin, axis=-2)
 31 | 
 32 |     # Apply rotation using real numbers.
 33 |     xq_out_r = pdn.unsqueeze(xq_r * freqs_cos - xq_i * freqs_sin, -1)
 34 |     xq_out_i = pdn.unsqueeze(xq_r * freqs_sin + xq_i * freqs_cos, -1)
 35 |     xk_out_r = pdn.unsqueeze(xk_r * freqs_cos - xk_i * freqs_sin, -1)
 36 |     xk_out_i = pdn.unsqueeze(xk_r * freqs_sin + xk_i * freqs_cos, -1)
 37 | 
 38 |     # Flatten last two dimensions.
 39 |     xq_out = pdn.concat([xq_out_r, xq_out_i], axis=-1)
 40 |     xk_out = pdn.concat([xk_out_r, xk_out_i], axis=-1)
 41 |     xq_out = xq_out.reshape(*(xq_out.shape[:-2] + (-1, )))
 42 |     xk_out = xk_out.reshape(*(xk_out.shape[:-2] + (-1, )))
 43 |     return xq_out, xk_out
 44 | 
 45 | 
 46 | class FeedForward(nn.Module):
 47 | 
 48 |     def __init__(self, dim, up_dim, dtype=None):
 49 |         super().__init__()
 50 |         self.dim, self.up_dim = dim, up_dim
 51 |         self.up = nn.Linear(dim, up_dim, bias=False, dtype=dtype)
 52 |         self.gate = nn.Linear(dim, up_dim, bias=False, dtype=dtype)
 53 |         self.down = nn.Linear(up_dim, dim, bias=False, dtype=dtype)
 54 | 
 55 |     def forward(self, x):
 56 |         swish, x_V = F.silu(self.gate(x)), self.up(x)
 57 |         return self.down(swish * x_V)
 58 | 
 59 | 
 60 | class Attention(nn.Module):
 61 | 
 62 |     def __init__(
 63 |         self,
 64 |         dim: int,
 65 |         n_heads: int,
 66 |         max_seq_len: int,
 67 |         max_batch_size: int = None,
 68 |         dtype=None,
 69 |     ):
 70 |         super().__init__()
 71 |         self.dim = dim
 72 |         self.n_heads = n_heads
 73 | 
 74 |         assert dim % n_heads == 0
 75 |         self.head_dim = dim // n_heads
 76 | 
 77 |         self.Q = nn.Linear(self.dim, self.dim, bias=False, dtype=dtype)
 78 |         self.K = nn.Linear(self.dim, self.dim, bias=False, dtype=dtype)
 79 |         self.V = nn.Linear(self.dim, self.dim, bias=False, dtype=dtype)
 80 |         self.O = nn.Linear(self.dim, self.dim, bias=False, dtype=dtype)
 81 | 
 82 |         self.max_seq_len = max_seq_len
 83 |         self.max_batch_size = max_batch_size if max_batch_size is not None else 1
 84 | 
 85 |         self.cache_k = nn.Parameter(pdn.special.zeros(
 86 |             (self.max_batch_size, max_seq_len, self.n_heads, self.head_dim),
 87 |             dtype=dtype),
 88 |                                     requires_grad=False)
 89 |         self.cache_v = nn.Parameter(pdn.special.zeros(
 90 |             (self.max_batch_size, max_seq_len, self.n_heads, self.head_dim),
 91 |             dtype=dtype),
 92 |                                     requires_grad=False)
 93 | 
 94 |     def __call__(self, x, start_pos: int, mask, freqs_cos, freqs_sin):
 95 |         B, L, _ = x.shape
 96 |         xq, xk, xv = (
 97 |             self.Q(x).reshape(B, L, self.n_heads, self.head_dim),
 98 |             self.K(x).reshape(B, L, self.n_heads, self.head_dim),
 99 |             self.V(x).reshape(B, L, self.n_heads, self.head_dim),
100 |         )
101 | 
102 |         xq, xk = apply_rotary_emb(xq, xk, freqs_cos, freqs_sin)
103 | 
104 |         if not self._train:
105 |             self.cache_k[:B, start_pos:start_pos + L] = xk
106 |             self.cache_v[:B, start_pos:start_pos + L] = xv
107 | 
108 |             xk = self.cache_k[:B, :start_pos + L]
109 |             xv = self.cache_v[:B, :start_pos + L]
110 | 
111 |         xq, xkT = xq.transpose(0, 2, 1, 3), xk.transpose(0, 2, 3, 1)
112 |         attention = xq @ xkT / math.sqrt(self.head_dim)
113 | 
114 |         if mask is not None:
115 |             attention = attention + mask
116 |         attention = F.softmax(attention, axis=-1)
117 |         output = attention @ xv.transpose(0, 2, 1, 3)
118 | 
119 |         output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
120 |         return self.O(output)
121 | 
122 | 
class TransformerBlock(nn.Module):
    """Pre-norm block: attention and feed-forward, each with a residual connection."""

    def __init__(
        self,
        dim: int,
        n_heads: int,
        ffn_dim: int,
        max_seq_len: int,
        max_batch_size: int = None,
        dtype=None,
    ):
        super().__init__()
        self.attention = Attention(dim,
                                   n_heads,
                                   max_seq_len,
                                   max_batch_size,
                                   dtype)
        self.ffn = FeedForward(dim, ffn_dim, dtype)
        self.input_norm = nn.RMSNorm(dim, dtype=dtype)
        self.post_attn_norm = nn.RMSNorm(dim, dtype=dtype)

    def forward(self, x, start_pos: int, mask, freqs_cos, freqs_sin):
        """Return ``z + ffn(norm(z))`` where ``z = x + attention(norm(x))``."""
        attn_out = self.attention(self.input_norm(x), start_pos, mask,
                                  freqs_cos, freqs_sin)
        residual = x + attn_out

        ffn_out = self.ffn(self.post_attn_norm(residual))
        return residual + ffn_out
150 | 
151 | 
class Llama(nn.Module):
    """Decoder-only transformer with rotary embeddings and greedy generation.

    The model is a token embedding, ``n_layers`` ``TransformerBlock`` layers,
    a final ``RMSNorm`` and a linear head that maps to ``vocab_size`` logits.
    """

    def __init__(
        self,
        vocab_size,
        embed_dim,
        n_heads,
        ffn_dim: int,
        max_seq_len: int,
        max_batch_size: int = None,
        n_layers: int = 6,
        dtype=None,
    ):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.n_heads = n_heads
        self.ffn_dim = ffn_dim
        self.max_seq_len = max_seq_len
        self.max_batch_size = max_batch_size
        self.n_layers = n_layers

        self.tok_embedding = nn.Embedding(vocab_size, embed_dim, dtype=dtype)

        # Rotary tables for every position up to max_seq_len, stored as
        # parameters that do not require gradients.
        freqs_cos, freqs_sin = compute_cos_sin_cache(embed_dim // n_heads,
                                                     max_seq_len,
                                                     dtype=dtype)

        self.freqs_cos = nn.parameter.Parameter(freqs_cos, False)
        self.freqs_sin = nn.parameter.Parameter(freqs_sin, False)

        self.layers = nn.ModuleList([
            TransformerBlock(embed_dim, n_heads, ffn_dim, max_seq_len,
                             max_batch_size, dtype)
            for _ in range(self.n_layers)
        ])

        self.norm = nn.RMSNorm(embed_dim, dtype=dtype)
        self.lm_head = nn.Linear(embed_dim, vocab_size, dtype=dtype)

    def forward(self, input_ids, start_pos: int):
        """Return the logits for the last position of ``input_ids``.

        ``start_pos`` is the sequence position of the first token in
        ``input_ids``; it selects the rotary table rows and is passed on to
        every layer. The result keeps a length-1 sequence axis.
        """
        L = input_ids.shape[-1]
        h = self.tok_embedding(input_ids)

        freqs_cos = self.freqs_cos[start_pos:start_pos + L]
        freqs_sin = self.freqs_sin[start_pos:start_pos + L]

        mask = None
        if L > 1:
            # -inf strictly above the diagonal blocks attention to later
            # positions; the zero columns on the left leave the ``start_pos``
            # earlier positions fully visible.
            mask = np.triu(np.full((L, L), float("-inf")), k=1)
            mask = np.concatenate([np.zeros((L, start_pos)), mask], axis=1)
            mask = pdn.Tensor(mask, device=h.device, dtype=h.dtype)

        for layer in self.layers:
            h = layer(h, start_pos, mask, freqs_cos, freqs_sin)

        # Only the last position is needed to predict the next token.
        logit = self.lm_head(self.norm(h)[:, [-1], :])
        return logit

    def generate(self, input_ids, max_new_tokens: int):
        """Greedily decode and yield one token id tensor per step.

        ``input_ids`` has shape (batch, L). One token is yielded for each
        position in ``range(L, max_new_tokens)``, so ``max_new_tokens`` acts
        as the total sequence length, not as a count of additional tokens.

        The first step (prefill) feeds the whole prompt at position 0. Every
        later step feeds only the token produced by the previous step, at the
        position that token occupies. Previously that token was fed one
        position too far, which left one cache slot unwritten and shifted the
        rotary positions by one.
        """
        _, L = input_ids.shape
        inputs, pos = input_ids, 0  # Prefill phase
        for curr_pos in range(L, max_new_tokens):
            logits = self(inputs, pos)
            next_id = logits[:, -1, :].argmax(-1, True)
            yield next_id
            # Decode phase: the token just produced sits at ``curr_pos``.
            inputs, pos = next_id, curr_pos
222 | 


--------------------------------------------------------------------------------
/pydynet/nn/modules/norm.py:
--------------------------------------------------------------------------------
  1 | from .module import Module
  2 | from ..parameter import Parameter
  3 | from .. import init
  4 | from ...special import empty
  5 | from ... import core
  6 | from ...cuda import Device
  7 | 
  8 | 
  9 | class BatchNorm1d(Module):
 10 |     '''
 11 |     一维Batch Normalization层
 12 | 
 13 |     Parameters
 14 |     ----------
 15 |     num_features : int
 16 |         输入特征数.
 17 |     eps : float, default=1e-5
 18 |         防止除数为0的极小项.
 19 |     momentum : float, default=0.5
 20 |         计算累积均值和方差的动量项.
 21 |     device : Optional[Device], default=None
 22 |         层数据所在的设备.
 23 |     dtype : default=Nonr
 24 |         层数据的类型.
 25 |     '''
 26 | 
 27 |     def __init__(
 28 |         self,
 29 |         num_features: int,
 30 |         eps: float = 1e-6,
 31 |         momentum: float = 0.1,
 32 |         device=None,
 33 |         dtype=None,
 34 |     ) -> None:
 35 |         super().__init__()
 36 |         kwargs = {"device": Device(device), "dtype": dtype}
 37 |         self.num_features = num_features
 38 |         self.eps = eps
 39 |         self.momentum = momentum
 40 |         self.running_mean = Parameter(
 41 |             empty(self.num_features, **kwargs),
 42 |             requires_grad=False,
 43 |         )
 44 |         self.running_var = Parameter(
 45 |             empty(self.num_features, **kwargs),
 46 |             requires_grad=False,
 47 |         )
 48 |         self.scale = Parameter(empty(self.num_features, **kwargs))
 49 |         self.shift = Parameter(empty(self.num_features, **kwargs))
 50 |         self.reset_parameters()
 51 | 
 52 |     def reset_parameters(self):
 53 |         init.zeros_(self.running_mean)
 54 |         init.ones_(self.running_var)
 55 |         init.zeros_(self.shift)
 56 |         init.ones_(self.scale)
 57 | 
 58 |     def forward(self, x):
 59 |         if self._train:
 60 |             mean = x.mean(0)
 61 |             center_data = x - mean
 62 |             var = core.mean(core.square(center_data), 0)
 63 |             std_data = center_data / core.sqrt(var + self.eps)
 64 | 
 65 |             self.running_mean *= (1 - self.momentum)
 66 |             self.running_mean += self.momentum * mean
 67 |             self.running_var *= (1 - self.momentum)
 68 |             self.running_var += self.momentum * var
 69 | 
 70 |             return std_data * self.scale + self.shift
 71 |         else:
 72 |             return (x - self.running_mean) * self.scale / core.sqrt(
 73 |                 self.running_var + self.eps) + self.shift
 74 | 
 75 |     def __repr__(self) -> str:
 76 |         return "{}(num_features={}, momentum={})".format(
 77 |             self.__class__.__name__,
 78 |             self.num_features,
 79 |             self.momentum,
 80 |         )
 81 | 
 82 | 
 83 | class BatchNorm2d(Module):
 84 |     '''
 85 |     二维Batch Normalization层
 86 | 
 87 |     Parameters
 88 |     ----------
 89 |     num_features : int
 90 |         输入特征数(通道数).
 91 |     eps : float, default=1e-5
 92 |         防止除数为0的极小项.
 93 |     momentum : float, default=0.5
 94 |         计算累积均值和方差的动量项.
 95 |     device : Optional[Device], default=None
 96 |         层数据所在的设备.
 97 |     dtype : default=None
 98 |         层数据的类型.
 99 |     '''
100 | 
101 |     def __init__(
102 |         self,
103 |         num_features: int,
104 |         eps: float = 1e-6,
105 |         momentum: float = 0.1,
106 |         device=None,
107 |         dtype=None,
108 |     ) -> None:
109 |         super().__init__()
110 |         kwargs = {"device": Device(device), "dtype": dtype}
111 |         self.num_features = num_features
112 |         self.eps = eps
113 |         self.momentum = momentum
114 |         self.running_mean = Parameter(
115 |             empty((1, self.num_features, 1, 1), **kwargs),
116 |             requires_grad=False,
117 |         )
118 |         self.running_var = Parameter(
119 |             empty((1, self.num_features, 1, 1), **kwargs),
120 |             requires_grad=False,
121 |         )
122 |         self.scale = Parameter(empty(1, self.num_features, 1, 1, **kwargs))
123 |         self.shift = Parameter(empty((1, self.num_features, 1, 1), **kwargs))
124 |         self.reset_parameters()
125 | 
126 |     def reset_parameters(self):
127 |         init.zeros_(self.running_mean)
128 |         init.ones_(self.running_var)
129 |         init.zeros_(self.shift)
130 |         init.ones_(self.scale)
131 | 
132 |     def forward(self, x):
133 |         if self._train:
134 |             mean = x.mean((0, 2, 3), keepdims=True)
135 |             center_data = x - mean
136 |             var = core.mean(core.square(center_data), (0, 2, 3), keepdims=True)
137 |             std_data = center_data / core.sqrt(var + self.eps)
138 | 
139 |             self.running_mean *= (1 - self.momentum)
140 |             self.running_mean += self.momentum * mean
141 |             self.running_var *= (1 - self.momentum)
142 |             self.running_var += self.momentum * var
143 | 
144 |             return std_data * self.scale + self.shift
145 |         else:
146 |             return (x - self.running_mean) * self.scale / core.sqrt(
147 |                 self.running_var + self.eps) + self.shift
148 | 
149 |     def __repr__(self) -> str:
150 |         return "{}(num_features={}, momentum={})".format(
151 |             self.__class__.__name__,
152 |             self.num_features,
153 |             self.momentum,
154 |         )
155 | 
156 | 
class LayerNorm(Module):
    '''
    Layer Normalization.

    Every sample is normalized with the mean and variance taken over its
    trailing ``len(normalized_shape)`` axes, then scaled and shifted by the
    learnable ``scale`` and ``shift``. The computation is the same in
    training and evaluation mode.

    Parameters
    ----------
    normalized_shape : int or sequence of int
        Shape of the trailing axes to normalize over; an int is treated as
        a one-element shape.
    eps : float, default=1e-6
        Small term added to the variance to avoid division by zero.
    momentum : float, default=0.1
        Accepted and stored for backward compatibility; layer normalization
        keeps no running statistics, so it does not affect the output.
    device : Optional[Device], default=None
        Device holding the layer's data.
    dtype : default=None
        Data type of the layer's data.
    '''

    def __init__(
        self,
        normalized_shape: int,
        eps: float = 1e-6,
        momentum: float = 0.1,
        device=None,
        dtype=None,
    ) -> None:
        super().__init__()
        kwargs = {"device": Device(device), "dtype": dtype}
        if isinstance(normalized_shape, int):
            normalized_shape = (normalized_shape, )
        self.normalized_shape = tuple(normalized_shape)
        # -1, -2, ...: one negative index per normalized (trailing) axis.
        self.norm_axis = tuple(
            -(i + 1) for i in range(len(self.normalized_shape)))
        self.eps = eps
        self.momentum = momentum
        # Kept (non-trainable, untouched by forward) only so that code and
        # saved states referring to these attributes keep working.
        self.running_mean = Parameter(
            empty(self.normalized_shape, **kwargs),
            requires_grad=False,
        )
        self.running_var = Parameter(
            empty(self.normalized_shape, **kwargs),
            requires_grad=False,
        )
        self.scale = Parameter(empty(self.normalized_shape, **kwargs))
        self.shift = Parameter(empty(self.normalized_shape, **kwargs))
        self.reset_parameters()

    def reset_parameters(self):
        '''Reset to the identity transform: zero mean/shift, unit var/scale.'''
        init.zeros_(self.running_mean)
        init.ones_(self.running_var)
        init.zeros_(self.shift)
        init.ones_(self.scale)

    def forward(self, x):
        '''Normalize ``x`` over its trailing ``normalized_shape`` axes.'''
        # Statistics are per sample: reducing over the leading (batch) axes
        # instead would couple samples and make this a batch normalization.
        mean = x.mean(self.norm_axis, keepdims=True)
        center_data = x - mean
        var = core.square(center_data).mean(self.norm_axis, keepdims=True)
        std_data = center_data / core.sqrt(var + self.eps)
        return std_data * self.scale + self.shift
219 | 
220 | 
class RMSNorm(Module):
    '''
    Root-mean-square normalization with a learnable gain.

    The input is divided by the square root of the mean of its squares,
    taken over the trailing ``len(normalized_shape)`` axes, and multiplied
    by ``weight``.

    Parameters
    ----------
    normalized_shape : int or tuple
        Shape of the trailing axes to normalize over; an int is treated as
        a one-element shape.
    eps : float, default=1e-6
        Small term added to the mean square before the square root.
    device : default=None
        Device holding the layer's data.
    dtype : default=None
        Data type of the layer's data.
    '''

    def __init__(
        self,
        normalized_shape: tuple,
        eps: float = 1e-6,
        device=None,
        dtype=None,
    ):
        super().__init__()
        factory = dict(device=Device(device), dtype=dtype)
        if isinstance(normalized_shape, int):
            normalized_shape = (normalized_shape, )
        self.normalized_shape = tuple(normalized_shape)
        # -1, -2, ...: one negative index per normalized (trailing) axis.
        self.sum_axis = tuple(range(-1, -len(self.normalized_shape) - 1, -1))
        self.eps = eps

        self.weight = Parameter(empty(self.normalized_shape, **factory))
        self.reset_parameters()

    def reset_parameters(self):
        '''Set the gain to one.'''
        init.ones_(self.weight)

    def forward(self, x):
        '''Scale ``x`` by the inverse RMS of its trailing axes, then by ``weight``.'''
        mean_square = core.square(x).mean(self.sum_axis, keepdims=True)
        normalized = x / core.sqrt(mean_square + self.eps)
        return normalized * self.weight
249 | 


--------------------------------------------------------------------------------
/examples/pytorch/transformer.py:
--------------------------------------------------------------------------------
  1 | from os.path import join
  2 | from tqdm import tqdm
  3 | 
  4 | import torch
  5 | import torch.nn as nn
  6 | import torch.nn.functional as F
  7 | from torch.optim import Adam
  8 | from pydynet.data import data_loader
  9 | 
 10 | import numpy as np
 11 | 
 12 | import numpy as np
 13 | import matplotlib.pyplot as plt
 14 | from sklearn.model_selection import train_test_split
 15 | 
# Seed NumPy's global random generator.
np.random.seed(42)

# Directory that `load_data` joins with 'in_domain_train.tsv'.
path = r'examples/data/CoLA/tokenized'
 19 | 
 20 | 
 21 | def extract(line: str):
 22 |     lines = line.split('\t')
 23 |     y = int(lines[1])
 24 |     sentence = lines[-1][:-1]
 25 |     return sentence.split(), y
 26 | 
 27 | 
 28 | def load_data():
 29 | 
 30 |     with open(join(path, 'in_domain_train.tsv'), 'r', encoding='utf-8') as f:
 31 |         lines = f.readlines()
 32 | 
 33 |     sens, ys = [], []
 34 |     max_len = -1
 35 |     word_dict = set()
 36 |     for line in tqdm(lines):
 37 |         x, y = extract(line)
 38 |         word_dict = word_dict.union(set(x))
 39 |         max_len = max(max_len, len(x))
 40 |         sens.append(x)
 41 |         ys.append(y)
 42 |     word_dict = list(word_dict)
 43 | 
 44 |     X = np.zeros((len(lines), max_len), dtype=int)
 45 |     for i in tqdm(range(len(lines))):
 46 |         for j, word in enumerate(sens[i]):
 47 |             X[i, j] = word_dict.index(word) + 1
 48 |     y = np.array(ys)
 49 | 
 50 |     return X, y
 51 | 
 52 | 
 53 | class SelfAttention(nn.Module):
 54 | 
 55 |     def __init__(self, embed_size, heads):
 56 |         super(SelfAttention, self).__init__()
 57 |         self.embed_size = embed_size
 58 |         self.heads = heads
 59 |         self.head_dim = embed_size // heads
 60 | 
 61 |         assert (self.head_dim * heads == embed_size
 62 |                 ), "Embedding size needs to be divisible by heads"
 63 | 
 64 |         self.Q = nn.Linear(self.embed_size, self.embed_size, bias=False)
 65 |         self.K = nn.Linear(self.embed_size, self.embed_size, bias=False)
 66 |         self.V = nn.Linear(self.embed_size, self.embed_size, bias=False)
 67 |         self.O = nn.Linear(self.embed_size, self.embed_size, bias=False)
 68 | 
 69 |     def forward(self, values, keys, query, mask):
 70 |         N = query.shape[0]
 71 |         value_len, key_len, query_len = values.shape[1], keys.shape[
 72 |             1], query.shape[1]
 73 | 
 74 |         xq, xk, xv = (
 75 |             self.Q(values).reshape(N, value_len, self.heads, self.head_dim),
 76 |             self.K(values).reshape(N, key_len, self.heads, self.head_dim),
 77 |             self.V(values).reshape(N, query_len, self.heads, self.head_dim),
 78 |         )
 79 | 
 80 |         # Split the embedding into self.heads different pieces
 81 |         xq, xkT = xq.permute(0, 2, 1, 3), xk.permute(0, 2, 3, 1)
 82 |         attention = xq @ xkT / self.head_dim**.5
 83 | 
 84 |         if mask is not None:
 85 |             mask[mask.eq(1)] = -torch.inf
 86 |             attention = attention + mask
 87 | 
 88 |         attention = F.softmax(attention, dim=-1)
 89 |         output = attention @ xv.permute(0, 2, 1, 3)
 90 | 
 91 |         output = output.permute(0, 2, 1, 3).reshape(N, value_len, -1)
 92 |         return self.O(output)
 93 | 
 94 | 
 95 | class TransformerBlock(nn.Module):
 96 | 
 97 |     def __init__(self, embed_size, heads, dropout, forward_expansion):
 98 |         super(TransformerBlock, self).__init__()
 99 |         self.attention = SelfAttention(embed_size, heads)
100 |         self.norm1 = nn.LayerNorm(embed_size)
101 |         self.norm2 = nn.LayerNorm(embed_size)
102 | 
103 |         self.feed_forward = nn.Sequential(
104 |             nn.Linear(
105 |                 embed_size,
106 |                 forward_expansion * embed_size,
107 |             ),
108 |             nn.ReLU(),
109 |             nn.Linear(
110 |                 forward_expansion * embed_size,
111 |                 embed_size,
112 |             ),
113 |         )
114 | 
115 |     def forward(self, value, key, query, mask):
116 |         attention = self.attention(value, key, query, mask)
117 |         x = (self.norm1(attention + query))
118 |         forward = self.feed_forward(x)
119 |         out = (self.norm2(forward + x))
120 |         return out
121 | 
122 | 
def sinusoidal_positional_encoding(max_len: int, d_model: int):
    """Build the fixed sine/cosine positional-encoding table.

    Even columns hold ``sin(pos * w_k)`` and odd columns ``cos(pos * w_k)``
    with ``w_k = 10000 ** (-2k / d_model)``.

    Parameters
    ----------
    max_len : int
        Number of positions (rows).
    d_model : int
        Encoding width (columns); may be even or odd.

    Returns
    -------
    torch.Tensor of dtype float32 and shape ``(max_len, d_model)``.
    """
    position = np.arange(max_len)[:, np.newaxis]
    div_term = np.exp(np.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
    angles = position * div_term

    pe = np.zeros((max_len, d_model))
    pe[:, 0::2] = np.sin(angles)
    # For odd `d_model` there is one fewer odd column than even columns, so
    # the cosine part uses only the first `d_model // 2` frequencies.
    pe[:, 1::2] = np.cos(angles[:, :d_model // 2])

    return torch.tensor(pe.astype(np.float32))
131 | 
132 | 
@torch.no_grad()
def construct_mask(x: torch.Tensor, padding_idx=0):
    """Return a boolean mask that is True where ``x`` equals ``padding_idx``.

    Two size-1 axes are inserted after the first axis.
    """
    is_padding = x.eq(padding_idx)  # [batch_size, seq_len]
    return is_padding[:, None, None]  # [batch_size, 1, 1, seq_len]
137 | 
138 | 
class Transformer(nn.Module):
    """Encoder-style classifier producing one score per sequence.

    Word embeddings plus fixed sinusoidal position encodings pass through
    ``num_layers`` ``TransformerBlock`` layers; the representation of the
    first position is mapped to a single output value.

    Parameters
    ----------
    embed_size : int
        Size of the embedding axis.
    num_layers : int
        Number of ``TransformerBlock`` layers.
    heads : int
        Number of attention heads per block.
    forward_expansion : int
        Feed-forward expansion factor passed to every block.
    dropout :
        Passed to every block.
    vocab_size : int
        Number of rows of the word-embedding table (index 0 is padding).
    max_length : int
        Number of positions in the positional-encoding table, i.e. the
        longest sequence the model accepts.
    """

    def __init__(
        self,
        embed_size,
        num_layers,
        heads,
        forward_expansion,
        dropout,
        vocab_size,
        max_length,
    ):
        super(Transformer, self).__init__()
        self.embed_size = embed_size
        self.word_embedding = nn.Embedding(
            vocab_size,
            embed_size,
            padding_idx=0,
        )
        # Fixed (non-trainable) table; stored as a parameter so that it
        # moves with the module across devices.
        self.position_embedding = nn.Parameter(
            sinusoidal_positional_encoding(max_length, embed_size), False)

        self.layers = nn.ModuleList([
            TransformerBlock(
                embed_size,
                heads,
                dropout=dropout,
                forward_expansion=forward_expansion,
            ) for _ in range(num_layers)
        ])

        self.fc_out = nn.Linear(embed_size, 1)

    def forward(self, x, mask):
        """Score every sequence of ``x``.

        Parameters
        ----------
        x : torch.Tensor
            Word ids of shape ``(batch, seq_len)`` with
            ``seq_len <= max_length``.
        mask :
            Attention mask forwarded to every layer.

        Returns
        -------
        torch.Tensor of shape ``(batch, 1)``.
        """
        seq_len = x.shape[1]
        # Use only the first `seq_len` positions so that sequences shorter
        # than `max_length` are accepted as well.
        out = self.word_embedding(x) + self.position_embedding[:seq_len]

        for layer in self.layers:
            out = layer(out, out, out, mask)

        out = out[:, 0, :]
        return self.fc_out(out)
181 | 
182 | 
if __name__ == "__main__":
    # Train the classifier on the data returned by `load_data`, report the
    # accuracy after every epoch and save the accuracy curves as an image.
    LR = 5e-4
    EPOCHES = 100
    TRAIN_BATCH_SIZE = 128
    TEST_BATCH_SIZE = 512
    use_cuda = True

    device = 'cuda' if torch.cuda.is_available() and use_cuda else 'cpu'

    X, y = load_data()
    y[y == 0] = -1  # labels become -1 / +1, as the logistic loss expects

    train_X, test_X, train_y, test_y = train_test_split(
        torch.tensor(X),
        torch.tensor(y),
        train_size=0.8,
        stratify=y,
        shuffle=True,
    )

    # Fraction of +1 labels in the training split. Kept on `device` so the
    # class weights derived from it can be written into device tensors.
    ratio_pos = ((train_y.float().mean() + 1) / 2).to(device)

    train_loader = data_loader(
        train_X,
        train_y,
        shuffle=False,
        batch_size=TRAIN_BATCH_SIZE,
    )
    test_loader = data_loader(
        test_X,
        test_y,
        shuffle=False,
        batch_size=TEST_BATCH_SIZE,
    )

    # The positional table must cover the padded sentence length, so take
    # it from the data instead of hard-coding it.
    net = Transformer(512, 1, 4, 3, 0.05, X.max() + 1, X.shape[1]).to(device)
    optimizer = Adam(net.parameters(), lr=LR)
    bar = tqdm(range(EPOCHES))
    info_list = []
    for epoch in bar:

        net.train()

        for batch_X, batch_y in train_loader:
            input_, label = batch_X.to(device), batch_y.to(device)
            output = net(input_, construct_mask(input_))
            # Inverse-frequency class weights, created on the same device as
            # `label` because `label` is used to index them.
            weight = torch.ones(label.shape, device=device)
            weight[label == -1] = 1 / (1 - ratio_pos)
            weight[label == 1] = 1 / ratio_pos
            # softplus(-z) == log(1 + exp(-z)) but does not overflow for
            # large |z|.
            loss = (weight *
                    F.softplus(-label * torch.squeeze(output))).mean()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        net.eval()
        train_right, train_size = 0, 0
        test_right, test_size = 0, 0

        with torch.no_grad():
            for batch_X, batch_y in train_loader:
                input_, label = batch_X.to(device), batch_y.to(device)
                pred = torch.sign(
                    torch.squeeze(net(input_, construct_mask(input_))))
                train_right += (pred.data == label.data).sum()
                train_size += batch_X.shape[0]

            for batch_X, batch_y in test_loader:
                input_, label = batch_X.to(device), batch_y.to(device)
                pred = torch.sign(
                    torch.squeeze(net(input_, construct_mask(input_))))
                test_right += (pred.data == label.data).sum()
                test_size += batch_X.shape[0]

        train_acc, test_acc = train_right / train_size, test_right / test_size
        # `loss` is the loss of the last training batch of this epoch.
        bar.set_postfix(
            Loss="{:.6f}".format(loss.item()),
            TEST_ACC="{:.4f}".format(test_acc),
            TRAIN_ACC="{:.4f}".format(train_acc),
        )
        info_list.append([train_acc.item(), test_acc.item()])

    info_list = np.array(info_list)

    plt.figure(figsize=(5, 3))

    plt.rcParams['font.sans-serif'] = ['Times New Roman']
    plt.rcParams['mathtext.fontset'] = 'stix'
    plt.rcParams['xtick.direction'] = 'in'
    plt.rcParams['ytick.direction'] = 'in'
    plt.rcParams['axes.linewidth'] = 0.5

    plt.grid(zorder=-10)
    plot_kwargs = {'linewidth': 0.7, 'zorder': 10}

    # One marker every 4 epochs; derived from EPOCHES so that the x values
    # always match the number of plotted accuracy samples.
    x = np.arange(0, EPOCHES, 4) + 2
    plt.plot(x,
             info_list[::4, 0],
             label="Training accuracy",
             color='blue',
             marker='^',
             **plot_kwargs,
             linestyle='-')
    plt.plot(x,
             info_list[::4, 1],
             label="Test accuracy",
             color='red',
             marker='*',
             **plot_kwargs,
             linestyle='--')

    plt.xlim(0, EPOCHES)
    plt.ylim(.4, 1)

    plt.yticks([.4, .6, .8, 1], size=13)
    plt.xticks([20, 40, 60, 80, 100], size=13)
    plt.xlabel("Epochs", size=13)
    plt.legend()
    plt.tight_layout()
    plt.savefig("imgs/transformer.png")
304 | 


--------------------------------------------------------------------------------
/examples/pydynet/transformer.py:
--------------------------------------------------------------------------------
  1 | from os.path import join
  2 | from tqdm import tqdm
  3 | 
  4 | import pydynet as pdn
  5 | import pydynet.nn as nn
  6 | import pydynet.nn.functional as F
  7 | from pydynet.optim import Adam
  8 | from pydynet.data import data_loader
  9 | 
 10 | import numpy as np
 11 | 
 12 | import numpy as np
 13 | import matplotlib.pyplot as plt
 14 | from sklearn.model_selection import train_test_split
 15 | 
# Seed NumPy's global random generator.
np.random.seed(42)

# Directory that `load_data` joins with 'in_domain_train.tsv'.
path = r'examples/data/CoLA/tokenized'
 19 | 
 20 | 
 21 | def extract(line: str):
 22 |     lines = line.split('\t')
 23 |     y = int(lines[1])
 24 |     sentence = lines[-1][:-1]
 25 |     return sentence.split(), y
 26 | 
 27 | 
 28 | def load_data():
 29 | 
 30 |     with open(join(path, 'in_domain_train.tsv'), 'r', encoding='utf-8') as f:
 31 |         lines = f.readlines()
 32 | 
 33 |     sens, ys = [], []
 34 |     max_len = -1
 35 |     word_dict = set()
 36 |     for line in tqdm(lines):
 37 |         x, y = extract(line)
 38 |         word_dict = word_dict.union(set(x))
 39 |         max_len = max(max_len, len(x))
 40 |         sens.append(x)
 41 |         ys.append(y)
 42 |     word_dict = list(word_dict)
 43 | 
 44 |     X = np.zeros((len(lines), max_len), dtype=int)
 45 |     for i in tqdm(range(len(lines))):
 46 |         for j, word in enumerate(sens[i]):
 47 |             X[i, j] = word_dict.index(word) + 1
 48 |     y = np.array(ys)
 49 | 
 50 |     return X, y
 51 | 
 52 | 
 53 | class SelfAttention(nn.Module):
 54 | 
 55 |     def __init__(self, embed_size, heads):
 56 |         super(SelfAttention, self).__init__()
 57 |         self.embed_size = embed_size
 58 |         self.heads = heads
 59 |         self.head_dim = embed_size // heads
 60 | 
 61 |         assert (self.head_dim * heads == embed_size
 62 |                 ), "Embedding size needs to be divisible by heads"
 63 | 
 64 |         self.Q = nn.Linear(self.embed_size,
 65 |                            self.embed_size,
 66 |                            bias=False,
 67 |                            dtype=np.float32)
 68 |         self.K = nn.Linear(self.embed_size,
 69 |                            self.embed_size,
 70 |                            bias=False,
 71 |                            dtype=np.float32)
 72 |         self.V = nn.Linear(self.embed_size,
 73 |                            self.embed_size,
 74 |                            bias=False,
 75 |                            dtype=np.float32)
 76 |         self.O = nn.Linear(self.embed_size,
 77 |                            self.embed_size,
 78 |                            bias=False,
 79 |                            dtype=np.float32)
 80 | 
 81 |     def forward(self, values, keys, query, mask):
 82 |         N = query.shape[0]
 83 |         value_len, key_len, query_len = values.shape[1], keys.shape[
 84 |             1], query.shape[1]
 85 | 
 86 |         xq, xk, xv = (
 87 |             self.Q(values).reshape(N, value_len, self.heads, self.head_dim),
 88 |             self.K(values).reshape(N, key_len, self.heads, self.head_dim),
 89 |             self.V(values).reshape(N, query_len, self.heads, self.head_dim),
 90 |         )
 91 | 
 92 |         # Split the embedding into self.heads different pieces
 93 |         xq, xkT = xq.transpose(0, 2, 1, 3), xk.transpose(0, 2, 3, 1)
 94 |         attention = xq @ xkT / self.head_dim**.5
 95 | 
 96 |         if mask is not None:
 97 |             mask[mask.eq(1)] = np.float32('-inf')
 98 |             attention = attention + mask
 99 | 
100 |         attention = F.softmax(attention, axis=-1)
101 |         output = attention @ xv.transpose(0, 2, 1, 3)
102 | 
103 |         output = output.transpose(0, 2, 1, 3).reshape(N, value_len, -1)
104 |         return self.O(output)
105 | 
106 | 
class TransformerBlock(nn.Module):
    """Attention and feed-forward sub-layers, each followed by a residual
    connection and a LayerNorm.

    Parameters
    ----------
    embed_size : int
        Size of the embedding axis.
    heads : int
        Number of attention heads.
    dropout :
        Accepted but not used by this block.
    forward_expansion : int
        The hidden width of the feed-forward part is
        ``forward_expansion * embed_size``.
    """

    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super().__init__()
        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size, dtype=np.float32)
        self.norm2 = nn.LayerNorm(embed_size, dtype=np.float32)

        hidden_size = forward_expansion * embed_size
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, hidden_size, dtype=np.float32),
            nn.ReLU(),
            nn.Linear(hidden_size, embed_size, dtype=np.float32),
        )

    def forward(self, value, key, query, mask):
        """Apply attention then the feed-forward network, both with residuals."""
        attended = self.norm1(self.attention(value, key, query, mask) + query)
        return self.norm2(self.feed_forward(attended) + attended)
131 | 
132 | 
def sinusoidal_positional_encoding(max_len: int, d_model: int):
    """Build the fixed sine/cosine positional-encoding table.

    Even columns hold ``sin(pos * w_k)`` and odd columns ``cos(pos * w_k)``
    with ``w_k = 10000 ** (-2k / d_model)``.

    Parameters
    ----------
    max_len : int
        Number of positions (rows).
    d_model : int
        Encoding width (columns); may be even or odd.

    Returns
    -------
    pdn.Tensor built from a float32 array of shape ``(max_len, d_model)``.
    """
    position = np.arange(max_len)[:, np.newaxis]
    div_term = np.exp(np.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
    angles = position * div_term

    pe = np.zeros((max_len, d_model))
    pe[:, 0::2] = np.sin(angles)
    # For odd `d_model` there is one fewer odd column than even columns, so
    # the cosine part uses only the first `d_model // 2` frequencies.
    pe[:, 1::2] = np.cos(angles[:, :d_model // 2])

    return pdn.Tensor(pe.astype(np.float32))
141 | 
142 | 
@pdn.no_grad()
def construct_mask(x: pdn.Tensor, padding_idx=0):
    """Return a float32 mask marking where ``x`` equals ``padding_idx``.

    Two axes are inserted at positions 1 and 2 before the cast.
    """
    is_padding = x.eq(padding_idx)  # [batch_size, seq_len]
    expanded = pdn.unsqueeze(is_padding, (1, 2))  # [batch_size, 1, 1, seq_len]
    return expanded.astype(np.float32)
148 | 
149 | 
class Transformer(nn.Module):
    """Encoder-style classifier producing one score per sequence.

    Word embeddings plus fixed sinusoidal position encodings pass through
    ``num_layers`` ``TransformerBlock`` layers; the representation of the
    first position is mapped to a single output value.

    Parameters
    ----------
    embed_size : int
        Size of the embedding axis.
    num_layers : int
        Number of ``TransformerBlock`` layers.
    heads : int
        Number of attention heads per block.
    forward_expansion : int
        Feed-forward expansion factor passed to every block.
    dropout :
        Passed to every block.
    vocab_size : int
        Number of rows of the word-embedding table (index 0 is padding).
    max_length : int
        Number of positions in the positional-encoding table, i.e. the
        longest sequence the model accepts.
    """

    def __init__(
        self,
        embed_size,
        num_layers,
        heads,
        forward_expansion,
        dropout,
        vocab_size,
        max_length,
    ):
        super(Transformer, self).__init__()
        self.embed_size = embed_size
        self.word_embedding = nn.Embedding(
            vocab_size,
            embed_size,
            padding_idx=0,
            dtype=np.float32,
        )
        # Fixed table: the second argument (False) disables gradients.
        self.position_embedding = nn.Parameter(
            sinusoidal_positional_encoding(max_length, embed_size), False)

        self.layers = nn.ModuleList([
            TransformerBlock(
                embed_size,
                heads,
                dropout=dropout,
                forward_expansion=forward_expansion,
            ) for _ in range(num_layers)
        ])

        self.fc_out = nn.Linear(embed_size, 1, dtype=np.float32)

    def forward(self, x, mask):
        """Score every sequence of ``x``.

        ``x`` holds word ids indexed as ``(batch, seq_len)`` with
        ``seq_len <= max_length``; ``mask`` is forwarded to every layer.
        Returns the output of ``fc_out`` for the first position of each
        sequence.
        """
        seq_len = x.shape[1]
        # Use only the first `seq_len` positions so that sequences shorter
        # than `max_length` are accepted as well.
        out = self.word_embedding(x) + self.position_embedding[:seq_len]

        for layer in self.layers:
            out = layer(out, out, out, mask)

        out = out[:, 0, :]
        return self.fc_out(out)
193 | 
194 | 
if __name__ == "__main__":
    # Train the classifier on the data returned by `load_data`, report the
    # accuracy after every epoch and save the accuracy curves as an image.
    LR = 5e-4
    EPOCHES = 100
    TRAIN_BATCH_SIZE = 128
    TEST_BATCH_SIZE = 512
    use_cuda = True

    device = 'cuda' if pdn.cuda.is_available() and use_cuda else 'cpu'

    X, y = load_data()
    y[y == 0] = -1  # labels become -1 / +1, as the logistic loss expects

    train_X, test_X, train_y, test_y = train_test_split(
        pdn.Tensor(X),
        pdn.Tensor(y),
        train_size=0.8,
        stratify=y,
        shuffle=True,
    )

    # Fraction of +1 labels in the training split (labels are -1 / +1).
    ratio_pos = (train_y.mean() + 1) / 2

    train_loader = data_loader(
        train_X,
        train_y,
        shuffle=False,
        batch_size=TRAIN_BATCH_SIZE,
    )
    test_loader = data_loader(
        test_X,
        test_y,
        shuffle=False,
        batch_size=TEST_BATCH_SIZE,
    )

    # The positional table must cover the padded sentence length, so take
    # it from the data instead of hard-coding it.
    net = Transformer(512, 1, 4, 3, 0.05, X.max() + 1, X.shape[1]).to(device)
    optimizer = Adam(net.parameters(), lr=LR)
    bar = tqdm(range(EPOCHES))
    info_list = []
    for epoch in bar:

        net.train()

        for batch_X, batch_y in train_loader:
            input_, label = batch_X.to(device), batch_y.to(device)
            output = net(input_, construct_mask(input_))
            # Inverse-frequency class weights.
            # NOTE(review): `weight` is created without an explicit device
            # and is moved with `.to(device)` only after being indexed with
            # `label` -- confirm this works when `device` is 'cuda'.
            weight = pdn.ones(label.shape, dtype=np.float32)
            weight[label == -1] = 1 / (1 - ratio_pos)
            weight[label == 1] = 1 / ratio_pos
            # Weighted logistic loss log(1 + exp(-y * f(x))).
            loss = (weight.to(device) *
                    pdn.log(1 + pdn.exp(-label * pdn.squeeze(output)))).mean()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        net.eval()
        train_right, train_size = 0, 0
        test_right, test_size = 0, 0

        with pdn.no_grad():
            for batch_X, batch_y in train_loader:
                input_, label = batch_X.to(device), batch_y.to(device)
                pred = pdn.sign(
                    pdn.squeeze(net(input_, construct_mask(input_))))
                train_right += (pred.data == label.data).sum()
                train_size += batch_X.shape[0]

            for batch_X, batch_y in test_loader:
                input_, label = batch_X.to(device), batch_y.to(device)
                pred = pdn.sign(
                    pdn.squeeze(net(input_, construct_mask(input_))))
                test_right += (pred.data == label.data).sum()
                test_size += batch_X.shape[0]

        train_acc, test_acc = train_right / train_size, test_right / test_size
        # `loss` is the loss of the last training batch of this epoch.
        bar.set_postfix(
            Loss="{:.6f}".format(loss.item()),
            TEST_ACC="{:.4f}".format(test_acc),
            TRAIN_ACC="{:.4f}".format(train_acc),
        )
        info_list.append([train_acc.item(), test_acc.item()])

    info_list = np.array(info_list)

    plt.figure(figsize=(5, 3))

    plt.rcParams['font.sans-serif'] = ['Times New Roman']
    plt.rcParams['mathtext.fontset'] = 'stix'
    plt.rcParams['xtick.direction'] = 'in'
    plt.rcParams['ytick.direction'] = 'in'
    plt.rcParams['axes.linewidth'] = 0.5

    plt.grid(zorder=-10)
    plot_kwargs = {'linewidth': 0.7, 'zorder': 10}

    # One marker every 4 epochs; derived from EPOCHES so that the x values
    # always match the number of plotted accuracy samples.
    x = np.arange(0, EPOCHES, 4) + 2
    plt.plot(x,
             info_list[::4, 0],
             label="Training accuracy",
             color='blue',
             marker='^',
             **plot_kwargs,
             linestyle='-')
    plt.plot(x,
             info_list[::4, 1],
             label="Test accuracy",
             color='red',
             marker='*',
             **plot_kwargs,
             linestyle='--')

    plt.xlim(0, EPOCHES)
    plt.ylim(.4, 1)

    plt.yticks([.4, .6, .8, 1], size=13)
    plt.xticks([20, 40, 60, 80, 100], size=13)
    plt.xlabel("Epochs", size=13)
    plt.legend()
    plt.tight_layout()
    plt.savefig("imgs/transformer.png")
315 | 


--------------------------------------------------------------------------------
/llm/clip/infer.py:
--------------------------------------------------------------------------------
  1 | import os, json, urllib, zipfile
  2 | import urllib.request
  3 | from PIL import Image
  4 | 
  5 | import numpy as np
  6 | import pydynet as pdn
  7 | import pydynet.nn.functional as F
  8 | 
  9 | from .tokenizer import SimpleTokenizer
 10 | from .model import CLIP
 11 | 
 12 | 
 13 | def download(url: str, filename: str, chunk_size: int = 10**6) -> None:
 14 |     # Create directories if they don't exist yet
 15 |     directories = os.path.dirname(filename)
 16 |     if directories:
 17 |         os.makedirs(directories, exist_ok=True)
 18 | 
 19 |     # Download the file
 20 |     with urllib.request.urlopen(url) as response:
 21 |         total = int(response.info()["Content-Length"])
 22 | 
 23 |         buf = b""
 24 |         while True:
 25 |             data = response.read(chunk_size)
 26 |             if not data:
 27 |                 break
 28 |             buf += data
 29 |             print(f"Downloading {filename} {len(buf) / total * 100:.2f} %")
 30 | 
 31 |     # Write the downloaded data to the file
 32 |     with open(filename, "wb") as f:
 33 |         f.write(buf)
 34 | 
 35 | 
 36 | def load_zip(path: str):
 37 |     files = {}
 38 | 
 39 |     with zipfile.ZipFile(path) as z:
 40 |         for file_info in z.infolist():
 41 |             with z.open(file_info) as f:
 42 |                 path = file_info.filename
 43 |                 files[path] = f.read()
 44 | 
 45 |     return files
 46 | 
 47 | 
 48 | class Params:
 49 | 
 50 |     def __init__(self, name: str, download_root: str = None) -> None:
 51 |         assert name == "ViT-B/32", f"Model {name} not supported yet. Only ViT-B-32 currently supported."
 52 | 
 53 |         model_urls = {
 54 |             "RN50":
 55 |             "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt",
 56 |             "RN101":
 57 |             "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt",
 58 |             "RN50x4":
 59 |             "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt",
 60 |             "RN50x16":
 61 |             "https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt",
 62 |             "RN50x64":
 63 |             "https://openaipublic.azureedge.net/clip/models/be1cfb55d75a9666199fb2206c106743da0f6468c9d327f3e0d0a543a9919d9c/RN50x64.pt",
 64 |             "ViT-B/32":
 65 |             "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
 66 |             "ViT-B/16":
 67 |             "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
 68 |             "ViT-L/14":
 69 |             "https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt",
 70 |             "ViT-L/14@336px":
 71 |             "https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt",
 72 |         }
 73 | 
 74 |         model_url = model_urls[name]
 75 | 
 76 |         name = name.replace("/", "-")
 77 | 
 78 |         if download_root is None:
 79 |             download_root = os.path.expanduser(f"~/.cache/clip")
 80 |             download_root = os.environ.get("CLIP_DIR", download_root)
 81 | 
 82 |         model_path = os.path.join(download_root, f"{name}.pt")
 83 | 
 84 |         if not os.path.isfile(model_path):
 85 |             print(f"Downloading {model_path} from {model_url}")
 86 |             download(model_url, model_path)
 87 | 
 88 |         self.files = load_zip(model_path)
 89 | 
 90 |         with open(f"{download_root}/{name}.json") as f:
 91 |             self.info = json.load(f)
 92 | 
 93 |     def get_int(self, name: str) -> int:
 94 |         info = self.info[name]
 95 | 
 96 |         value: int = info["value"]
 97 | 
 98 |         return value
 99 | 
100 |     def __getitem__(self, name: str):
101 |         info = self.info[name]
102 | 
103 |         path = info["path"]
104 |         dtype = info["dtype"]
105 |         shape = info["shape"]
106 |         start = info["start"]
107 |         end = info["end"]
108 | 
109 |         assert dtype in ["float16", "float32"]
110 | 
111 |         data = self.files[path][start:end]
112 | 
113 |         arr = np.frombuffer(data, dtype=dtype).reshape(shape)
114 |         arr = arr.astype(np.float32)
115 | 
116 |         return arr
117 | 
118 | 
def tokenize(texts: list[str], context_length: int = 77):
    """Encode ``texts`` into a zero-padded ``int64`` token matrix.

    Each text is wrapped in the tokenizer's ``<|startoftext|>`` and
    ``<|endoftext|>`` ids and written left-aligned into one row of a
    ``(len(texts), context_length)`` array; unused positions stay 0.

    Raises
    ------
    RuntimeError
        If an encoded text (including the two marker tokens) is longer than
        ``context_length``.
    """
    tokenizer = SimpleTokenizer()
    start_id = tokenizer.encoder["<|startoftext|>"]
    end_id = tokenizer.encoder["<|endoftext|>"]

    encoded = [
        [start_id] + tokenizer.encode(text) + [end_id] for text in texts
    ]

    result = np.zeros((len(encoded), context_length), dtype=np.int64)
    for i, ids in enumerate(encoded):
        n_ids = len(ids)
        if n_ids > context_length:
            raise RuntimeError(
                f"Input {texts[i]} is too long for context length {context_length}"
            )
        result[i, :n_ids] = ids

    return result
139 | 
140 | 
def preprocess(image: Image.Image, image_size: int = 224):
    """Resize, centre-crop and normalise an image for the CLIP image encoder.

    Steps: bicubic resize so the shorter side is ``image_size``, centre crop to
    ``image_size x image_size``, convert to RGB, scale to [0, 1], normalise per
    channel with the constants below, and move channels to the first axis.

    Parameters
    ----------
    image : PIL.Image.Image
        Input picture.
    image_size : int, default=224
        Side length of the square crop.

    Returns
    -------
    pdn.Tensor
        The normalised float32 image with the channel axis first.
    """
    # Scale image such that length of smaller side is image_size
    width, height = image.size
    scale = image_size / min(width, height)
    # int() truncates, and floating-point rounding can put the product for
    # the shorter side just below image_size. Clamping keeps the crop window
    # below inside the resized image; the longer side is never affected.
    width = max(image_size, int(scale * width))
    height = max(image_size, int(scale * height))

    # Some Pillow versions have different interface
    if hasattr(Image, "Resampling"):
        resample = Image.Resampling.BICUBIC
    else:
        resample = Image.BICUBIC
    image = image.resize((width, height), resample)

    # Crop center
    x0 = round((width - image_size) / 2)
    y0 = round((height - image_size) / 2)
    image = image.crop((x0, y0, x0 + image_size, y0 + image_size))

    image = image.convert("RGB")

    # Normalize (per-channel mean / std, broadcast over the last axis)
    x = np.array(image, dtype=np.float32) / 255.0
    mean = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32)
    std = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32)
    x = (x - mean) / std

    # Channels-last -> channels-first
    x = x.transpose(2, 0, 1)

    return pdn.Tensor(x, copy=None)
171 | 
172 | 
@pdn.no_grad()
def load_model(model: CLIP, param: Params):
    """Copy the checkpoint tensors in ``param`` into ``model`` and return it.

    Checkpoint keys are mapped onto the model's attributes; the attention and
    MLP weight matrices are transposed on the way in. Twelve transformer
    blocks are filled for each of the image and text encoders.

    NOTE(review): some targets are written through ``.data[...]`` and others
    by indexing the parameter directly; this is kept exactly as before --
    confirm both forms are equivalent in-place writes.
    """
    model.scale = pdn.exp(param["logit_scale"].astype(np.float32))
    model.class_embed.data[0, 0] = param["visual.class_embedding"]
    model.v_pos_emb.data[...] = param["visual.positional_embedding"]
    model.t_pos_emb.data[...] = param["positional_embedding"]

    model.image_encoder.kernel.data[...] = param["visual.conv1.weight"]
    model.image_encoder.pre_norm.scale[...] = param["visual.ln_pre.weight"]
    model.image_encoder.pre_norm.shift[...] = param["visual.ln_pre.bias"]
    model.image_encoder.post_norm.scale[...] = param["visual.ln_post.weight"]
    model.image_encoder.post_norm.shift[...] = param["visual.ln_post.bias"]

    model.image_encoder.proj.weight[...] = param["visual.proj"]

    model.text_encoder.token_embed.weight[...] = param[
        "token_embedding.weight"]
    model.text_encoder.post_norm.scale[...] = param["ln_final.weight"]
    model.text_encoder.post_norm.shift[...] = param["ln_final.bias"]
    model.text_encoder.proj.weight[...] = param["text_projection"]

    # (checkpoint key suffix, transpose?) for one transformer block, in the
    # same order as the targets returned by ``block_targets``.
    layer_keys = (
        ("attn.in_proj_weight", True),
        ("attn.in_proj_bias", False),
        ("attn.out_proj.weight", True),
        ("attn.out_proj.bias", False),
        ("ln_1.weight", False),
        ("ln_1.bias", False),
        ("ln_2.weight", False),
        ("ln_2.bias", False),
        ("mlp.c_fc.weight", True),
        ("mlp.c_fc.bias", False),
        ("mlp.c_proj.weight", True),
        ("mlp.c_proj.bias", False),
    )

    def read_block(key_base):
        # Fetch every tensor of one block from the checkpoint.
        return [
            param[key_base + suffix].T if transposed else param[key_base +
                                                                suffix]
            for suffix, transposed in layer_keys
        ]

    def block_targets(block):
        # Destination parameters of one block, aligned with ``layer_keys``.
        return (
            block.mha.QKV.weight,
            block.mha.QKV.bias,
            block.mha.O.weight,
            block.mha.O.bias,
            block.layer_norm1.scale,
            block.layer_norm1.shift,
            block.layer_norm2.scale,
            block.layer_norm2.shift,
            block.mlp.fc1.weight,
            block.mlp.fc1.bias,
            block.mlp.fc2.weight,
            block.mlp.fc2.bias,
        )

    prefix = "transformer.resblocks."
    for i in range(12):
        # Read everything for this layer first, then write: image encoder
        # block, followed by text encoder block.
        visual_values = read_block(f"visual.{prefix}{i}.")
        text_values = read_block(f"{prefix}{i}.")
        for block, values in (
            (model.image_encoder.transformers[i], visual_values),
            (model.text_encoder.transformers[i], text_values),
        ):
            for target, value in zip(block_targets(block), values):
                target.data[...] = value
    return model
250 | 
251 | 
# Demo: classify one picture against three text prompts.
# These statements run at import time. Both paths are relative, so they are
# resolved against the current working directory.
# A leading batch axis is added to the preprocessed image.
image = preprocess(Image.open("llm/clip/picture.png"))[np.newaxis, :, :, :]
text = tokenize(["a fish", "a dog", "a cat"])
# Params fetches the checkpoint into llm/clip/data if it is not there yet.
clip = load_model(CLIP(), Params("ViT-B/32", download_root='llm/clip/data'))

# Inference only, so run the forward pass with gradient tracking disabled.
with pdn.no_grad():
    logits_per_image = clip(image, text)
    # Softmax over the last axis turns the logits into probabilities.
    probs = F.softmax(logits_per_image, axis=-1)
    print("Label probs:", probs.numpy()[0])
260 | 


--------------------------------------------------------------------------------
/pydynet/nn/functional.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | 
  3 | from ..core import tensor, function
  4 | from .. import unsqueeze, no_grad
  5 | 
  6 | 
  7 | def linear(x: tensor.Tensor, weight: tensor.Tensor, bias: tensor.Tensor):
  8 |     affine = x @ weight
  9 |     if bias is not None:
 10 |         affine = affine + bias
 11 |     return affine
 12 | 
 13 | 
 14 | def embedding(x: tensor.Tensor, weight: tensor.Tensor, padding_idx: int):
 15 |     query = weight[x]
 16 |     if padding_idx is not None:
 17 |         with tensor.no_grad():
 18 |             mask = unsqueeze(x.ne(padding_idx), -1)
 19 |         query = query * mask
 20 |     return query
 21 | 
 22 | 
 23 | def sigmoid(x: tensor.Tensor):
 24 |     return tensor.sigmoid(x)
 25 | 
 26 | 
 27 | def tanh(x: tensor.Tensor):
 28 |     return tensor.tanh(x)
 29 | 
 30 | 
 31 | def relu(x: tensor.Tensor):
 32 |     return tensor.maximum(0., x)
 33 | 
 34 | 
 35 | def leaky_relu(x: tensor.Tensor, alpha: float):
 36 |     return tensor.maximum(x, alpha * x)
 37 | 
 38 | 
 39 | def silu(x: tensor.Tensor):
 40 |     return x / (1 + tensor.exp(-x))
 41 | 
 42 | 
 43 | def softmax(x: tensor.Tensor, axis=None):
 44 |     '''Softmax函数'''
 45 |     with no_grad():
 46 |         max_ = x.max(axis, keepdims=True)
 47 |     x_sub_max = x - max_
 48 |     exp_ = tensor.exp(x_sub_max)
 49 |     return exp_ / tensor.sum(exp_, axis=axis, keepdims=True)
 50 | 
 51 | 
 52 | def log_softmax(x: tensor.Tensor, axis=None, keepdims=False):
 53 |     '''log-softmax函数'''
 54 |     with no_grad():
 55 |         max_ = x.max(axis, keepdims=True)
 56 |     x_sub_max = x - max_
 57 |     return x_sub_max - tensor.log(
 58 |         tensor.sum(tensor.exp(x_sub_max), axis=axis, keepdims=keepdims))
 59 | 
 60 | 
 61 | class __im2col1d(tensor._UnaryOperator):
 62 | 
 63 |     def __init__(
 64 |         self,
 65 |         x: tensor.Tensor,
 66 |         kernel_size: int,
 67 |         stride: int,
 68 |     ) -> None:
 69 |         self.N, self.in_channels, self.n_features = x.shape
 70 |         self.kernel_size = kernel_size
 71 |         self.stride = stride
 72 |         self.n_output = (self.n_features - self.kernel_size) // stride + 1
 73 |         super().__init__(x)
 74 | 
 75 |     def forward_(self, x: tensor.Tensor) -> np.ndarray:
 76 |         s0, s1, s2 = x.strides
 77 |         shape = (x.shape[0], self.in_channels, self.kernel_size, self.n_output)
 78 |         self.__strides = (s0, s1, s2, s2 * self.stride)
 79 | 
 80 |         col = self.xp.lib.stride_tricks.as_strided(
 81 |             x.data,
 82 |             shape=shape,
 83 |             strides=self.__strides,
 84 |         ).copy()
 85 |         return col
 86 | 
 87 |     def grad_fn(self, x: tensor.Tensor, grad: np.ndarray) -> np.ndarray:
 88 |         grad_x = self.xp.zeros(x.shape, dtype=self.dtype)
 89 |         view = self.xp.lib.stride_tricks.as_strided(
 90 |             grad_x,
 91 |             shape=self.shape,
 92 |             strides=self.__strides,
 93 |         )
 94 |         self.xp.add.at(view, (..., ), grad)
 95 |         return grad_x
 96 | 
 97 | 
 98 | class __pad1d(tensor._UnaryOperator):
 99 | 
100 |     def __init__(self, x: tensor.Tensor, pad_width=0) -> None:
101 |         self.pad_width = pad_width
102 |         super().__init__(x)
103 | 
104 |     def forward_(self, x: tensor.Tensor) -> np.ndarray:
105 |         return self.xp.pad(x.data, [(0, 0), (0, 0),
106 |                                     (self.pad_width, self.pad_width)],
107 |                            'constant')
108 | 
109 |     def grad_fn(self, x: tensor.Tensor, grad: np.ndarray) -> np.ndarray:
110 |         if self.pad_width == 0:
111 |             return grad[...]
112 |         return grad[..., self.pad_width:-self.pad_width]
113 | 
114 | 
def conv1d(
    x: tensor.Tensor,
    kernel: tensor.Tensor,
    padding: int = 0,
    stride: int = 1,
):
    '''One-dimensional convolution implemented with im2col.

    The contraction below requires the window axis (length ``kernel_size``)
    to be the last axis of the im2col output.

    Parameters
    ----------
    x : Tensor
        Input of shape (N, in_channels, n_features);
    kernel : Tensor
        Filters of shape (out_channels, in_channels, kernel_size);
    padding : int, default=0
        Number of zeros added on each side of the feature axis;
    stride : int, default=1
        Convolution stride.
    '''
    window = kernel.shape[-1]
    col = __im2col1d(__pad1d(x, padding), window, stride)
    # Contract the window axis per input channel, then add up the channels.
    per_channel = col @ kernel.transpose(1, 2, 0)
    return per_channel.sum(1).swapaxes(1, 2)
140 | 
141 | 
def max_pool1d(
    x: tensor.Tensor,
    kernel_size: int,
    stride: int,
    padding: int = 0,
):
    '''One-dimensional max pooling implemented with im2col.

    Takes the maximum over the last axis of the im2col output.

    Parameters
    ----------
    x : Tensor
        Input of shape (N, in_channels, n_features);
    kernel_size : int
        Pooling window size;
    stride : int
        Pooling stride;
    padding : int, default=0
        Number of zeros added on each side of the feature axis.
    '''
    padded = __pad1d(x, padding)
    windows = __im2col1d(padded, kernel_size, stride)
    return windows.max(-1)
166 | 
167 | 
def avg_pool1d(
    x: tensor.Tensor,
    kernel_size: int,
    stride: int,
    padding: int = 0,
):
    '''One-dimensional average pooling implemented with im2col.

    Takes the mean over the last axis of the im2col output.

    Parameters
    ----------
    x : Tensor
        Input of shape (N, in_channels, n_features);
    kernel_size : int
        Pooling window size;
    stride : int
        Pooling stride;
    padding : int, default=0
        Number of zeros added on each side of the feature axis.
    '''
    padded = __pad1d(x, padding)
    windows = __im2col1d(padded, kernel_size, stride)
    return windows.mean(-1)
192 | 
193 | 
class __im2col2d(tensor._UnaryOperator):
    '''Sliding-window (im2col) view of a 2-D image as a differentiable op.

    For an input of shape ``(N, in_channels, n_h, n_w)`` the output has shape
    ``(N, in_channels, kernel_size, kernel_size, out_h, out_w)``. The same
    ``kernel_size`` and ``stride`` are used for both spatial axes.
    '''

    def __init__(
        self,
        x: tensor.Tensor,
        kernel_size: int,
        stride: int,
    ) -> None:
        _, self.in_channels, self.n_h, self.n_w = x.shape
        self.kernel_size = kernel_size
        self.stride = stride
        # Number of window positions along each spatial axis.
        self.out_h = (self.n_h - self.kernel_size) // self.stride + 1
        self.out_w = (self.n_w - self.kernel_size) // self.stride + 1

        super().__init__(x)

    def forward_(self, x: tensor.Tensor) -> np.ndarray:
        st_n, st_c, st_h, st_w = x.strides
        k = self.kernel_size
        window_shape = (x.shape[0], self.in_channels, k, k, self.out_h,
                        self.out_w)
        # Offsets inside a window advance one element; moving to the next
        # window advances `stride` elements.
        self.__strides = (st_n, st_c, st_h, st_w, st_h * self.stride,
                          st_w * self.stride)

        strided = self.xp.lib.stride_tricks.as_strided(
            x.data,
            shape=window_shape,
            strides=self.__strides,
        )
        # Copy so the result owns its memory instead of aliasing the input.
        return strided.copy()

    def grad_fn(self, x: tensor.Tensor, grad: np.ndarray) -> np.ndarray:
        grad_x = self.xp.zeros(x.shape, dtype=self.dtype)
        # Overlapping windows alias the same input element, so accumulate with
        # an unbuffered add into a strided view of the gradient buffer.
        windows = self.xp.lib.stride_tricks.as_strided(
            grad_x,
            shape=self.shape,
            strides=self.__strides,
        )
        self.xp.add.at(windows, (..., ), grad)
        return grad_x
233 | 
234 | 
class __pad2d(tensor._UnaryOperator):
    '''Zero-pad the last two axes of a 4-D input by ``pad_width`` per side.'''

    def __init__(self, x: tensor.Tensor, pad_width=0) -> None:
        self.pad_width = pad_width
        super().__init__(x)

    def forward_(self, x: tensor.Tensor) -> np.ndarray:
        p = self.pad_width
        # No padding on the first two axes, symmetric padding on the last two.
        pad_spec = [(0, 0), (0, 0), (p, p), (p, p)]
        return self.xp.pad(x.data, pad_spec, 'constant')

    def grad_fn(self, x: tensor.Tensor, grad: np.ndarray) -> np.ndarray:
        p = self.pad_width
        # A "-0" stop would give an empty slice, hence the special case.
        if p == 0:
            return grad[...]
        return grad[..., p:-p, p:-p]
252 | 
253 | 
def conv2d(x: tensor.Tensor,
           kernel: tensor.Tensor,
           padding: int = 0,
           stride: int = 1):
    '''Two-dimensional convolution implemented with im2col.

    For simplicity the kernel, stride and padding are the same along height
    and width; the window size is read from the kernel's height.

    Parameters
    ----------
    x : Tensor
        Input of shape (N, in_channels, n_height, n_width);
    kernel : Tensor
        Filters of shape (out_channels, in_channels, kernel_height,
        kernel_width);
    padding : int, default=0
        Number of zeros added around the image;
    stride : int, default=1
        Convolution stride.
    '''
    n_batch, _, _, _ = x.shape
    n_filters, _, window, _ = kernel.shape

    col = __im2col2d(__pad2d(x, padding), window, stride)
    out_h, out_w = col.shape[-2:]

    # One row per output position, one column per (channel, kh, kw) entry.
    patches = col.transpose(0, 4, 5, 1, 2, 3).reshape(
        n_batch * out_h * out_w, -1)
    filters = kernel.reshape(n_filters, -1).T
    flat_out = patches @ filters

    # Back to (N, out_channels, out_h, out_w).
    return flat_out.reshape(n_batch, out_h, out_w, -1).transpose(0, 3, 1, 2)
282 | 
283 | 
def max_pool2d(x: tensor.Tensor, kernel_size: int, stride: int, padding=0):
    '''Two-dimensional max pooling implemented with im2col.

    For simplicity the window, stride and padding are the same along height
    and width.

    Parameters
    ----------
    x : Tensor
        Input of shape (N, in_channels, n_height, n_width);
    kernel_size : int
        Pooling window size;
    stride : int
        Pooling stride (required, no default);
    padding : int, default=0
        Number of zeros added around the image.
    '''
    n_batch, n_channels, _, _ = x.shape

    col = __im2col2d(__pad2d(x, padding), kernel_size, stride)
    out_h, out_w = col.shape[-2:]

    # One row per (sample, position, channel) holding that window's values.
    windows = col.transpose(0, 4, 5, 1, 2, 3).reshape(
        -1,
        kernel_size * kernel_size,
    )
    pooled = windows.max(1)

    # Back to (N, in_channels, out_h, out_w).
    return pooled.reshape(n_batch, out_h, out_w,
                          n_channels).transpose(0, 3, 1, 2)
311 | 
312 | 
def avg_pool2d(x: tensor.Tensor, kernel_size: int, stride: int, padding=0):
    '''Two-dimensional average pooling implemented with im2col.

    For simplicity the window, stride and padding are the same along height
    and width.

    Parameters
    ----------
    x : Tensor
        Input of shape (N, in_channels, n_height, n_width);
    kernel_size : int
        Pooling window size;
    stride : int
        Pooling stride (required, no default);
    padding : int, default=0
        Number of zeros added around the image.
    '''
    n_batch, n_channels, _, _ = x.shape

    col = __im2col2d(__pad2d(x, padding), kernel_size, stride)
    out_h, out_w = col.shape[-2:]

    # One row per (sample, position, channel) holding that window's values.
    windows = col.transpose(0, 4, 5, 1, 2, 3).reshape(
        -1,
        kernel_size * kernel_size,
    )
    pooled = windows.mean(1)

    # Back to (N, in_channels, out_h, out_w).
    return pooled.reshape(n_batch, out_h, out_w,
                          n_channels).transpose(0, 3, 1, 2)
340 | 
341 | 
def mse_loss(y_pred, y_true, reduction='mean'):
    '''Squared error between ``y_pred`` and ``y_true``.

    ``reduction`` selects how the element-wise squared errors are combined:
    ``'mean'`` averages them, ``'sum'`` adds them up; anything else raises
    ``ValueError``.
    '''
    squared_error = function.square(y_pred - y_true)
    if reduction == 'mean':
        return tensor.mean(squared_error)
    if reduction == 'sum':
        return tensor.sum(squared_error)
    raise ValueError("reduction must be mean or sum.")
351 | 
352 | 
def nll_loss(y_pred, y_true, reduction='mean'):
    '''Negative log-likelihood computed as ``-y_pred * y_true``.

    The element-wise product is reduced with ``'mean'`` (average over every
    element of the product) or ``'sum'``; anything else raises ``ValueError``.
    NOTE(review): presumably ``y_pred`` holds log-probabilities and ``y_true``
    is one-hot -- confirm against callers.
    '''
    per_element = -y_pred * y_true
    if reduction == 'mean':
        return tensor.mean(per_element)
    if reduction == 'sum':
        return tensor.sum(per_element)
    raise ValueError("reduction must be mean or sum.")
362 | 
363 | 
def cross_entropy_loss(y_pred, y_true, reduction='mean'):
    '''Cross-entropy between logits ``y_pred`` and targets ``y_true``.

    Log-softmax is taken over axis 1 of ``y_pred``. Each row is shifted by its
    own maximum before exponentiating. Shifting by one global maximum instead
    lets rows far below it underflow to ``log(0)``, which gives an infinite
    loss. The shift is computed with gradient tracking disabled, so it acts as
    a constant.

    Parameters
    ----------
    y_pred : Tensor
        Logits with classes along axis 1;
    y_true : Tensor or array
        Either 1-D class indices (one per row of ``y_pred``) or an array of
        the same shape as ``y_pred`` that weights the per-class terms;
    reduction : {'mean', 'sum'}, default='mean'
        How the selected terms are combined; anything else raises
        ``ValueError``.

    NOTE(review): with non-1-D targets, ``'mean'`` averages over every element
    of the product (rows times classes), not over rows. Kept as before --
    confirm this scaling is intended.
    '''
    with no_grad():
        row_max = y_pred.max(1, keepdims=True)
    shifted = y_pred - row_max
    log_sum_exp = tensor.log(
        tensor.sum(tensor.exp(shifted), 1, keepdims=True))

    neg_log_sm = log_sum_exp - shifted
    if y_true.ndim == 1:
        # Pick the entry of the target class in each row.
        nll = neg_log_sm[range(len(neg_log_sm)), y_true]
    else:
        nll = neg_log_sm * y_true

    if reduction == 'mean':
        return tensor.mean(nll)
    elif reduction == 'sum':
        return tensor.sum(nll)
    else:
        raise ValueError("reduction must be mean or sum.")
382 | 


--------------------------------------------------------------------------------
/pydynet/nn/modules/rnn.py:
--------------------------------------------------------------------------------
  1 | from .module import Module
  2 | from .. import init
  3 | from .. import functional as F
  4 | from ..parameter import Parameter
  5 | from ...special import empty, zeros
  6 | from ... import core
  7 | from ...cuda import Device
  8 | 
  9 | from typing import Literal, Optional, Tuple, List
 10 | import math
 11 | 
 12 | 
 13 | class RNNCell(Module):
 14 | 
 15 |     def __init__(
 16 |         self,
 17 |         input_size: int,
 18 |         hidden_size: int,
 19 |         bias: bool = True,
 20 |         nonlinearity: Literal['tanh', 'relu'] = 'tanh',
 21 |         device=None,
 22 |         dtype=None,
 23 |     ) -> None:
 24 |         super().__init__()
 25 |         self.input_size = input_size
 26 |         self.hidden_size = hidden_size
 27 |         self.kwargs = {"device": Device(device), "dtype": dtype}
 28 |         self.nonlinearity = nonlinearity
 29 |         self.fn = {'tanh': F.tanh, 'relu': F.relu}[nonlinearity]
 30 | 
 31 |         self.Wx = Parameter(empty((input_size, hidden_size), **self.kwargs))
 32 |         self.Wh = Parameter(empty((hidden_size, hidden_size), **self.kwargs))
 33 |         if bias:
 34 |             self.bias = Parameter(empty(self.hidden_size, **self.kwargs))
 35 |         self.has_bias = bias
 36 |         self.reset_paramters()
 37 | 
 38 |     def forward(self, x, h=None):
 39 |         if h is None:
 40 |             h = self.init_hidden(x)
 41 |         else:
 42 |             assert (x.ndim == 1 and h.shape == (self.hidden_size, )) or (
 43 |                 x.ndim == 2 and h.shape
 44 |                 == (x.shape[0], self.hidden_size)), "Wrong hidden state input!"
 45 | 
 46 |         lin = x @ self.Wx + h @ self.Wh
 47 |         if self.has_bias:
 48 |             lin = lin + self.bias
 49 |         return self.fn(lin)
 50 | 
 51 |     def reset_paramters(self):
 52 |         bound = math.sqrt(1 / self.hidden_size)
 53 |         init.uniform_(self.Wx, -bound, bound)
 54 |         init.uniform_(self.Wh, -bound, bound)
 55 |         if self.has_bias:
 56 |             init.uniform_(self.bias, -bound, bound)
 57 | 
 58 |     def init_hidden(self, x):
 59 |         assert x.ndim in {1, 2}
 60 |         if x.ndim == 1:
 61 |             return zeros(self.hidden_size, **self.kwargs)
 62 |         else:
 63 |             batch_size = x.shape[0]
 64 |             return zeros((batch_size, self.hidden_size), **self.kwargs)
 65 | 
 66 |     def __repr__(self) -> str:
 67 |         return "{}({}, {}, bias={}, nonlinearity={})".format(
 68 |             self.__class__.__name__,
 69 |             self.input_size,
 70 |             self.hidden_size,
 71 |             self.has_bias,
 72 |             self.nonlinearity,
 73 |         )
 74 | 
 75 |     def move(self, device):
 76 |         self.kwargs['device'] = device
 77 |         return super().move(device)
 78 | 
 79 | 
 80 | class RNN(Module):
 81 | 
 82 |     def __init__(
 83 |         self,
 84 |         input_size: int,
 85 |         hidden_size: int,
 86 |         num_layers: int = 1,
 87 |         nonlinearity: Literal['tanh', 'relu'] = 'tanh',
 88 |         bias: bool = True,
 89 |         batch_first: bool = False,
 90 |         bidirectional: bool = False,
 91 |         device=None,
 92 |         dtype=None,
 93 |     ) -> None:
 94 |         super().__init__()
 95 |         self.input_size = input_size
 96 |         self.hidden_size = hidden_size
 97 |         self.num_layers = num_layers
 98 |         self.nonlinearity = nonlinearity
 99 |         self.has_bias = bias
100 |         self.batch_first = batch_first
101 |         self.bidirectional = bidirectional
102 |         self.kwargs = {"device": Device(device), "dtype": dtype}
103 | 
104 |         assert num_layers > 0
105 |         size_list = [input_size] + [hidden_size] * (num_layers - 1)
106 |         self.RNNCells: List[RNNCell] = []
107 |         for i in range(num_layers):
108 |             cell = RNNCell(
109 |                 size_list[i],
110 |                 hidden_size,
111 |                 bias,
112 |                 nonlinearity,
113 |                 **self.kwargs,
114 |             )
115 |             setattr(self, 'rnn_{}'.format(i), cell)
116 |             self.RNNCells.append(cell)
117 |         if self.bidirectional:
118 |             self.rRNNCells: List[RNNCell] = []
119 |             for i in range(num_layers):
120 |                 cell = RNNCell(
121 |                     size_list[i],
122 |                     hidden_size,
123 |                     bias,
124 |                     nonlinearity,
125 |                     **self.kwargs,
126 |                 )
127 |                 setattr(self, 'rrnn_{}'.format(i), cell)
128 |                 self.rRNNCells.append(cell)
129 | 
130 |     def forward(self, x, h=None):
131 |         if self.batch_first and x.ndim == 3:
132 |             x = x.swapaxes(0, 1)
133 | 
134 |         if h is None:
135 |             h = self.init_hidden(x)
136 |         else:
137 |             d = 2 if self.bidirectional else 1
138 |             assert (x.ndim == 2
139 |                     and h.shape == (d * self.num_layers, self.hidden_size)
140 |                     ) or (x.ndim == 3 and h.shape
141 |                           == (d * self.num_layers, x.shape[1],
142 |                               self.hidden_size)), "Wrong hidden state input!"
143 | 
144 |         if self.num_layers == 1 and not self.bidirectional:
145 |             h_list = self.cell_forward(self.RNNCells[0], x, h[0])
146 |             output = core.concat(h_list)
147 |             hn = h_list[-1]
148 | 
149 |         elif self.num_layers == 1 and self.bidirectional:
150 |             h_list = self.cell_forward(self.RNNCells[0], x, h[0])
151 |             hr_list = self.cell_forward(self.rRNNCells[0], x[::-1], h[1])
152 |             output = core.concat(
153 |                 [core.concat(h_list),
154 |                  core.concat(hr_list[::-1])],
155 |                 axis=-1,
156 |             )
157 |             hn = core.concat([h_list[-1], hr_list[-1]])
158 | 
159 |         elif self.num_layers > 1 and not self.bidirectional:
160 |             hn_list = []
161 |             for i in range(self.num_layers):
162 |                 h_list = self.cell_forward(
163 |                     self.RNNCells[i],
164 |                     x if i == 0 else core.concat(h_list),
165 |                     h[i],
166 |                 )
167 |                 hn_list.append(h_list[-1])
168 |             output = core.concat(h_list)
169 |             hn = core.concat(hn_list)
170 | 
171 |         else:
172 |             hn_list = []
173 |             hrn_list = []
174 |             for i in range(self.num_layers):
175 |                 h_list = self.cell_forward(
176 |                     self.RNNCells[i],
177 |                     x if i == 0 else core.concat(h_list),
178 |                     h[i],
179 |                 )
180 |                 hr_list = self.cell_forward(
181 |                     self.rRNNCells[i],
182 |                     x[::-1] if i == 0 else core.concat(hr_list),
183 |                     h[i + self.num_layers],
184 |                 )
185 |                 hn_list.append(h_list[-1])
186 |                 hrn_list.append(hr_list[-1])
187 |             output = core.concat(
188 |                 [core.concat(h_list),
189 |                  core.concat(hr_list[::-1])], axis=-1)
190 |             hn = core.concat(hn_list + hrn_list)
191 | 
192 |         if self.batch_first and x.ndim == 3:
193 |             output = output.swapaxes(0, 1)
194 |             hn = hn.swapaxes(0, 1)
195 |         return output, hn
196 | 
197 |     def reset_parameters(self):
198 |         for i in range(self.num_layers):
199 |             self.RNNCells[i].reset_paramters()
200 |         if self.bidirectional:
201 |             for i in range(self.num_layers):
202 |                 self.rRNNCells[i].reset_paramters()
203 | 
204 |     def init_hidden(self, x):
205 |         assert x.ndim in {2, 3}
206 |         d = 2 if self.bidirectional else 1
207 |         if x.ndim == 2:
208 |             return zeros(
209 |                 (d * self.num_layers, self.hidden_size),
210 |                 **self.kwargs,
211 |             )
212 |         else:
213 |             batch_size = x.shape[1]
214 |             return zeros(
215 |                 (d * self.num_layers, batch_size, self.hidden_size),
216 |                 **self.kwargs,
217 |             )
218 | 
219 |     def cell_forward(self, cell: RNNCell, x, h):
220 |         seq_len = x.shape[0]
221 |         h_list = []
222 |         for i in range(seq_len):
223 |             h = cell(x[i], h)
224 |             h_list.append(core.unsqueeze(h, axis=0))
225 |         return h_list
226 | 
227 |     def __repr__(self) -> str:
228 |         return "{}({}, {}, num_layers={}, nonlinearity={}, bias={}, batch_first={}, bidirectional={})".format(
229 |             self.__class__.__name__,
230 |             self.input_size,
231 |             self.hidden_size,
232 |             self.num_layers,
233 |             self.nonlinearity,
234 |             self.has_bias,
235 |             self.batch_first,
236 |             self.bidirectional,
237 |         )
238 | 
239 |     def move(self, device):
240 |         self.kwargs['device'] = device
241 |         return super().move(device)
242 | 
243 | 
class LSTMCell(Module):
    """A single LSTM step.

    All four gate pre-activations come from one fused affine map
    ``x @ Wx + h @ Wh (+ bias)`` of width ``4 * hidden_size``. Along the last
    axis the columns are consumed as ``[f | i | o | g]``: the first three
    blocks go through a sigmoid (forget, input, output gates) and the last
    block through ``tanh`` (candidate cell value).

    Args:
        input_size: Size of the last axis of the input ``x``.
        hidden_size: Size of the hidden and cell states.
        bias: When true, a learnable bias of size ``4 * hidden_size`` is added.
        device: Passed through ``Device(...)`` and used for every tensor the
            cell creates.
        dtype: dtype used for every tensor the cell creates.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        bias: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        # Creation kwargs reused for parameters and for zero initial states.
        self.kwargs = {"device": Device(device), "dtype": dtype}

        self.Wx = Parameter(empty((input_size, 4 * hidden_size),
                                  **self.kwargs))
        self.Wh = Parameter(
            empty((hidden_size, 4 * hidden_size), **self.kwargs))
        if bias:
            self.bias = Parameter(empty(4 * self.hidden_size, **self.kwargs))
        self.has_bias = bias
        self.reset_parameters()

    def forward(self, x, hx: Optional[Tuple] = None):
        """Advance the cell by one step.

        Args:
            x: 1-D input of size ``input_size`` or 2-D input whose first axis
                is the batch.
            hx: Optional ``(h, c)`` pair. Each must have shape
                ``(hidden_size,)`` for 1-D ``x`` or
                ``(x.shape[0], hidden_size)`` for 2-D ``x``. Zero states are
                used when omitted.

        Returns:
            The new ``(h, c)`` pair.

        Raises:
            AssertionError: If a supplied state has the wrong shape, or if
                ``hx`` is omitted and ``x`` is neither 1-D nor 2-D.
        """
        if hx is None:
            h = self.init_hidden(x)
            c = self.init_hidden(x)
        else:
            h, c = hx
            assert (x.ndim == 1 and h.shape == (self.hidden_size, )) or (
                x.ndim == 2 and h.shape
                == (x.shape[0], self.hidden_size)), "Wrong hidden state input!"
            assert (x.ndim == 1 and c.shape == (self.hidden_size, )) or (
                x.ndim == 2 and c.shape
                == (x.shape[0], self.hidden_size)), "Wrong cell state input!"
        lin = x @ self.Wx + h @ self.Wh
        if self.has_bias:
            lin = lin + self.bias
        # First 3*hidden columns are the sigmoid gates, the rest is the
        # tanh candidate.
        fio, g = core.hsplit(lin, [3 * self.hidden_size])
        sig_fio, tanh_g = F.sigmoid(fio), F.tanh(g)
        f, i, o = core.hsplit(sig_fio, 3)
        c = f * c + i * tanh_g
        h = o * F.tanh(c)
        return h, c

    def init_hidden(self, x):
        """Return an all-zero state shaped for ``x`` (1-D or 2-D only)."""
        assert x.ndim in {1, 2}
        if x.ndim == 1:
            return zeros(self.hidden_size, **self.kwargs)
        else:
            batch_size = x.shape[0]
            return zeros((batch_size, self.hidden_size), **self.kwargs)

    def reset_parameters(self):
        """Re-draw all parameters uniformly from ``[-b, b]``, ``b = sqrt(1 / hidden_size)``."""
        bound = math.sqrt(1 / self.hidden_size)
        init.uniform_(self.Wx, -bound, bound)
        init.uniform_(self.Wh, -bound, bound)
        if self.has_bias:
            init.uniform_(self.bias, -bound, bound)

    def reset_paramters(self):
        """Misspelled legacy name of :meth:`reset_parameters`.

        Kept so existing callers that use the old spelling keep working.
        """
        self.reset_parameters()

    def __repr__(self) -> str:
        return "{}({}, {}, bias={})".format(
            self.__class__.__name__,
            self.input_size,
            self.hidden_size,
            self.has_bias,
        )

    def move(self, device):
        """Record ``device`` for future state allocation, then delegate to the parent ``move``."""
        self.kwargs['device'] = device
        return super().move(device)
316 | 
317 | 
class LSTM(Module):
    """Multi-layer, optionally bidirectional LSTM built from ``LSTMCell``s.

    Each direction is an independent stack: layer ``i > 0`` of a direction
    consumes only the outputs of layer ``i - 1`` of that same direction, so
    every layer after the first has input size ``hidden_size``. The two
    directions are joined along the last axis only in the final output.

    State layout along the leading axis of ``h`` / ``c``: forward layers
    occupy indices ``0 .. num_layers - 1``; reverse layers (bidirectional
    only) occupy ``num_layers .. 2 * num_layers - 1``.

    Args:
        input_size: Feature size of the input.
        hidden_size: Size of each hidden/cell state.
        num_layers: Number of stacked layers per direction (must be > 0).
        bias: Whether each cell uses a bias.
        batch_first: When true, 3-D inputs are given as (batch, seq, feature)
            and are transposed internally.
        bidirectional: Whether to add a reverse-direction stack.
        device: Passed through ``Device(...)`` for tensor creation.
        dtype: dtype for tensor creation.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        num_layers: int = 1,
        bias: bool = True,
        batch_first: bool = False,
        bidirectional: bool = False,
        device=None,
        dtype=None,
    ) -> None:
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.has_bias = bias
        self.batch_first = batch_first
        self.bidirectional = bidirectional
        self.kwargs = {"device": Device(device), "dtype": dtype}

        assert num_layers > 0
        # Layer 0 sees the raw input; deeper layers see hidden_size features.
        size_list = [input_size] + [hidden_size] * (num_layers - 1)
        self.LSTMCells: List[LSTMCell] = []
        for i in range(num_layers):
            cell = LSTMCell(
                size_list[i],
                hidden_size,
                bias,
                **self.kwargs,
            )
            # Also bind each cell as a named attribute, alongside the list.
            setattr(self, 'lstm_{}'.format(i), cell)
            self.LSTMCells.append(cell)
        if self.bidirectional:
            self.rLSTMCells: List[LSTMCell] = []
            for i in range(num_layers):
                cell = LSTMCell(
                    size_list[i],
                    hidden_size,
                    bias,
                    **self.kwargs,
                )
                setattr(self, 'rlstm_{}'.format(i), cell)
                self.rLSTMCells.append(cell)

    def forward(self, x, hx: Optional[Tuple] = None):
        """Run the whole sequence through the network.

        Args:
            x: 2-D (seq, feature) or 3-D input. A 3-D input is
                (seq, batch, feature), or (batch, seq, feature) when
                ``batch_first`` is set.
            hx: Optional ``(h, c)`` initial states. With ``d`` = 2 if
                bidirectional else 1, each must have shape
                ``(d * num_layers, hidden_size)`` for 2-D ``x`` or
                ``(d * num_layers, batch, hidden_size)`` for 3-D ``x``.
                This layout is required even when ``batch_first`` is set.
                Zero states are used when omitted.

        Returns:
            ``(output, (hn, cn))``. ``output`` holds the last layer's hidden
            state for every step (forward and reverse joined on the last
            axis when bidirectional); ``hn`` / ``cn`` hold the final states
            of every layer.
            NOTE: with ``batch_first`` and 3-D input, ``hn`` and ``cn`` are
            also returned with axes 0 and 1 swapped, which differs from the
            layout accepted for ``hx``.

        Raises:
            AssertionError: If a supplied state has the wrong shape.
        """
        if self.batch_first and x.ndim == 3:
            x = x.swapaxes(0, 1)

        if hx is None:
            h = self.init_hidden(x)
            c = self.init_hidden(x)
        else:
            d = 2 if self.bidirectional else 1
            h, c = hx
            assert (x.ndim == 2
                    and h.shape == (d * self.num_layers, self.hidden_size)
                    ) or (x.ndim == 3 and h.shape
                          == (d * self.num_layers, x.shape[1],
                              self.hidden_size)), "Wrong hidden state input!"
            assert (x.ndim == 2
                    and c.shape == (d * self.num_layers, self.hidden_size)
                    ) or (x.ndim == 3 and c.shape
                          == (d * self.num_layers, x.shape[1],
                              self.hidden_size)), "Wrong cell state input!"

        output, hn, cn = self._run_layers(x, h, c)

        if self.batch_first and x.ndim == 3:
            output = output.swapaxes(0, 1)
            hn = hn.swapaxes(0, 1)
            cn = cn.swapaxes(0, 1)

        return output, (hn, cn)

    def _run_layers(self, x, h, c):
        """Unroll every layer (both directions) over a seq-first input."""
        n = self.num_layers
        hn_fwd, cn_fwd, hn_rev, cn_rev = [], [], [], []
        h_list = hr_list = None
        for i in range(n):
            h_list, c_list = self.cell_forward(
                self.LSTMCells[i],
                x if i == 0 else core.concat(h_list),
                (h[i], c[i]),
            )
            hn_fwd.append(h_list[-1])
            cn_fwd.append(c_list[-1])
            if self.bidirectional:
                # The reverse stack works in reversed time order throughout;
                # its states live after all the forward ones.
                hr_list, cr_list = self.cell_forward(
                    self.rLSTMCells[i],
                    x[::-1] if i == 0 else core.concat(hr_list),
                    (h[i + n], c[i + n]),
                )
                hn_rev.append(hr_list[-1])
                cn_rev.append(cr_list[-1])

        output = core.concat(h_list)
        if self.bidirectional:
            # Flip the reverse states back so both halves share one time order.
            output = core.concat(
                [output, core.concat(hr_list[::-1])], axis=-1)

        final_h = hn_fwd + hn_rev
        final_c = cn_fwd + cn_rev
        if len(final_h) == 1:
            # Single unidirectional layer: the state is returned directly.
            return output, final_h[0], final_c[0]
        return output, core.concat(final_h), core.concat(final_c)

    def reset_parameters(self):
        """Re-initialize every cell (forward first, then reverse if present)."""
        for i in range(self.num_layers):
            self.LSTMCells[i].reset_paramters()
        if self.bidirectional:
            for i in range(self.num_layers):
                self.rLSTMCells[i].reset_paramters()

    def init_hidden(self, x):
        """Return an all-zero state tensor sized for ``x`` (2-D or 3-D only).

        The same shape serves for both the hidden and the cell state.
        """
        assert x.ndim in {2, 3}
        d = 2 if self.bidirectional else 1
        if x.ndim == 2:
            return zeros(
                (d * self.num_layers, self.hidden_size),
                **self.kwargs,
            )
        else:
            batch_size = x.shape[1]
            return zeros(
                (d * self.num_layers, batch_size, self.hidden_size),
                **self.kwargs,
            )

    def cell_forward(self, cell: LSTMCell, x, h: Tuple):
        """Unroll ``cell`` along the leading axis of ``x``.

        Args:
            cell: The LSTM cell to apply at every step.
            x: Sequence whose leading axis is time.
            h: Initial ``(h, c)`` pair.

        Returns:
            ``(h_list, c_list)``: per-step hidden and cell states, each with
            a new leading axis inserted by ``core.unsqueeze``.
        """
        seq_len = x.shape[0]
        h_list, c_list = [], []
        for i in range(seq_len):
            h = cell(x[i], h)  # In fact, `h` here is a tuple (h, c)
            h_list.append(core.unsqueeze(h[0], axis=0))
            c_list.append(core.unsqueeze(h[1], axis=0))
        return h_list, c_list

    def __repr__(self) -> str:
        return "{}({}, {}, num_layers={}, bias={}, batch_first={}, bidirectional={})".format(
            self.__class__.__name__,
            self.input_size,
            self.hidden_size,
            self.num_layers,
            self.has_bias,
            self.batch_first,
            self.bidirectional,
        )

    def move(self, device):
        """Record ``device`` for future state allocation, then delegate to the parent ``move``."""
        self.kwargs['device'] = device
        return super().move(device)
498 | 
499 | 
class GRUCell(Module):
    """A single GRU step.

    Computation performed by ``forward``::

        [z | r] = sigmoid(x @ Wx1 + h @ Wh1 (+ bias1))
        n       = tanh(x @ Wx2 + (r * h) @ Wh2 (+ bias2))
        h'      = (1 - z) * h + z * n

    Args:
        input_size: Size of the last axis of the input ``x``.
        hidden_size: Size of the hidden state.
        bias: When true, learnable biases ``bias1`` (``2 * hidden_size``) and
            ``bias2`` (``hidden_size``) are added.
        device: Passed through ``Device(...)`` and used for every tensor the
            cell creates.
        dtype: dtype used for every tensor the cell creates.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        bias: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        # Creation kwargs reused for parameters and for zero initial states.
        self.kwargs = {"device": Device(device), "dtype": dtype}

        # Fused weights for the two sigmoid gates (z and r).
        self.Wx1 = Parameter(
            empty((input_size, 2 * hidden_size), **self.kwargs))
        self.Wh1 = Parameter(
            empty((hidden_size, 2 * hidden_size), **self.kwargs))
        # Weights for the tanh candidate.
        self.Wx2 = Parameter(empty((input_size, hidden_size), **self.kwargs))
        self.Wh2 = Parameter(empty((hidden_size, hidden_size), **self.kwargs))

        if bias:
            self.bias1 = Parameter(empty(2 * self.hidden_size, **self.kwargs))
            self.bias2 = Parameter(empty(self.hidden_size, **self.kwargs))

        self.has_bias = bias
        self.reset_parameters()

    def forward(self, x, h=None):
        """Advance the cell by one step and return the new hidden state.

        Args:
            x: 1-D input of size ``input_size`` or 2-D input whose first axis
                is the batch.
            h: Optional previous state of shape ``(hidden_size,)`` for 1-D
                ``x`` or ``(x.shape[0], hidden_size)`` for 2-D ``x``. A zero
                state is used when omitted.

        Raises:
            AssertionError: If a supplied state has the wrong shape, or if
                ``h`` is omitted and ``x`` is neither 1-D nor 2-D.
        """
        if h is None:
            h = self.init_hidden(x)
        else:
            assert (x.ndim == 1 and h.shape == (self.hidden_size, )) or (
                x.ndim == 2 and h.shape
                == (x.shape[0], self.hidden_size)), "Wrong hidden state input!"

        lin1 = x @ self.Wx1 + h @ self.Wh1
        if self.has_bias:
            lin1 = lin1 + self.bias1
        # hsplit (as LSTMCell uses) rather than split(axis=1): an unbatched
        # 1-D input has no axis 1, although this method and init_hidden
        # both accept it.
        z, r = core.hsplit(F.sigmoid(lin1), 2)
        lin2 = x @ self.Wx2 + (r * h) @ self.Wh2
        if self.has_bias:
            lin2 = lin2 + self.bias2
        return (1 - z) * h + z * F.tanh(lin2)

    def reset_parameters(self):
        """Re-draw all parameters uniformly from ``[-b, b]``, ``b = sqrt(1 / hidden_size)``."""
        bound = math.sqrt(1 / self.hidden_size)
        init.uniform_(self.Wx1, -bound, bound)
        init.uniform_(self.Wx2, -bound, bound)
        init.uniform_(self.Wh1, -bound, bound)
        init.uniform_(self.Wh2, -bound, bound)
        if self.has_bias:
            init.uniform_(self.bias1, -bound, bound)
            init.uniform_(self.bias2, -bound, bound)

    def init_hidden(self, x):
        """Return an all-zero state shaped for ``x`` (1-D or 2-D only)."""
        assert x.ndim in {1, 2}
        if x.ndim == 1:
            return zeros(self.hidden_size, **self.kwargs)
        else:
            batch_size = x.shape[0]
            return zeros((batch_size, self.hidden_size), **self.kwargs)

    def __repr__(self) -> str:
        return "{}({}, {}, bias={})".format(
            self.__class__.__name__,
            self.input_size,
            self.hidden_size,
            self.has_bias,
        )

    def move(self, device):
        """Record ``device`` for future state allocation, then delegate to the parent ``move``."""
        self.kwargs['device'] = device
        return super().move(device)
575 | 
576 | 
class GRU(Module):
    """Multi-layer, optionally bidirectional GRU built from ``GRUCell``s.

    Each direction is an independent stack: layer ``i > 0`` of a direction
    consumes only the outputs of layer ``i - 1`` of that same direction, so
    every layer after the first has input size ``hidden_size``. The two
    directions are joined along the last axis only in the final output.

    State layout along the leading axis of ``h``: forward layers occupy
    indices ``0 .. num_layers - 1``; reverse layers (bidirectional only)
    occupy ``num_layers .. 2 * num_layers - 1``.

    Args:
        input_size: Feature size of the input.
        hidden_size: Size of each hidden state.
        num_layers: Number of stacked layers per direction (must be > 0).
        bias: Whether each cell uses a bias.
        batch_first: When true, 3-D inputs are given as (batch, seq, feature)
            and are transposed internally.
        bidirectional: Whether to add a reverse-direction stack.
        device: Passed through ``Device(...)`` for tensor creation.
        dtype: dtype for tensor creation.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        num_layers: int = 1,
        bias: bool = True,
        batch_first: bool = False,
        bidirectional: bool = False,
        device=None,
        dtype=None,
    ) -> None:
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.has_bias = bias
        self.batch_first = batch_first
        self.bidirectional = bidirectional
        self.kwargs = {"device": Device(device), "dtype": dtype}

        assert num_layers > 0
        # Layer 0 sees the raw input; deeper layers see hidden_size features.
        size_list = [input_size] + [hidden_size] * (num_layers - 1)
        self.GRUCells: List[GRUCell] = []
        for i in range(num_layers):
            cell = GRUCell(
                size_list[i],
                hidden_size,
                bias,
                **self.kwargs,
            )
            # Also bind each cell as a named attribute, alongside the list.
            setattr(self, 'gru_{}'.format(i), cell)
            self.GRUCells.append(cell)
        if self.bidirectional:
            self.rGRUCells: List[GRUCell] = []
            for i in range(num_layers):
                cell = GRUCell(
                    size_list[i],
                    hidden_size,
                    bias,
                    **self.kwargs,
                )
                setattr(self, 'rgru_{}'.format(i), cell)
                self.rGRUCells.append(cell)

    def forward(self, x, h=None):
        """Run the whole sequence through the network.

        Args:
            x: 2-D (seq, feature) or 3-D input. A 3-D input is
                (seq, batch, feature), or (batch, seq, feature) when
                ``batch_first`` is set.
            h: Optional initial state. With ``d`` = 2 if bidirectional else
                1, it must have shape ``(d * num_layers, hidden_size)`` for
                2-D ``x`` or ``(d * num_layers, batch, hidden_size)`` for 3-D
                ``x``. This layout is required even when ``batch_first`` is
                set. A zero state is used when omitted.

        Returns:
            ``(output, hn)``. ``output`` holds the last layer's hidden state
            for every step (forward and reverse joined on the last axis when
            bidirectional); ``hn`` holds the final state of every layer.
            NOTE: with ``batch_first`` and 3-D input, ``hn`` is also returned
            with axes 0 and 1 swapped, which differs from the layout accepted
            for ``h``.

        Raises:
            AssertionError: If a supplied state has the wrong shape.
        """
        if self.batch_first and x.ndim == 3:
            x = x.swapaxes(0, 1)

        if h is None:
            h = self.init_hidden(x)
        else:
            d = 2 if self.bidirectional else 1
            assert (x.ndim == 2
                    and h.shape == (d * self.num_layers, self.hidden_size)
                    ) or (x.ndim == 3 and h.shape
                          == (d * self.num_layers, x.shape[1],
                              self.hidden_size)), "Wrong hidden state input!"

        output, hn = self._run_layers(x, h)

        if self.batch_first and x.ndim == 3:
            output = output.swapaxes(0, 1)
            hn = hn.swapaxes(0, 1)
        return output, hn

    def _run_layers(self, x, h):
        """Unroll every layer (both directions) over a seq-first input."""
        n = self.num_layers
        hn_fwd, hn_rev = [], []
        h_list = hr_list = None
        for i in range(n):
            h_list = self.cell_forward(
                self.GRUCells[i],
                x if i == 0 else core.concat(h_list),
                h[i],
            )
            hn_fwd.append(h_list[-1])
            if self.bidirectional:
                # The reverse stack works in reversed time order throughout;
                # its states live after all the forward ones.
                hr_list = self.cell_forward(
                    self.rGRUCells[i],
                    x[::-1] if i == 0 else core.concat(hr_list),
                    h[i + n],
                )
                hn_rev.append(hr_list[-1])

        output = core.concat(h_list)
        if self.bidirectional:
            # Flip the reverse states back so both halves share one time order.
            output = core.concat(
                [output, core.concat(hr_list[::-1])], axis=-1)

        finals = hn_fwd + hn_rev
        if len(finals) == 1:
            # Single unidirectional layer: the state is returned directly.
            return output, finals[0]
        return output, core.concat(finals)

    def reset_parameters(self):
        """Re-initialize every cell (forward first, then reverse if present)."""
        for i in range(self.num_layers):
            self.GRUCells[i].reset_parameters()
        if self.bidirectional:
            for i in range(self.num_layers):
                self.rGRUCells[i].reset_parameters()

    def init_hidden(self, x):
        """Return an all-zero initial state sized for ``x`` (2-D or 3-D only)."""
        assert x.ndim in {2, 3}
        d = 2 if self.bidirectional else 1
        if x.ndim == 2:
            return zeros(
                (d * self.num_layers, self.hidden_size),
                **self.kwargs,
            )
        else:
            return zeros(
                (d * self.num_layers, x.shape[1], self.hidden_size),
                **self.kwargs,
            )

    def cell_forward(self, cell: GRUCell, x, h):
        """Unroll ``cell`` along the leading axis of ``x``.

        Returns a list with one entry per step; each entry is that step's
        state with a new leading axis inserted by ``core.unsqueeze``.
        """
        seq_len = x.shape[0]
        h_list = []
        for i in range(seq_len):
            h = cell(x[i], h)
            h_list.append(core.unsqueeze(h, axis=0))
        return h_list

    def __repr__(self) -> str:
        return "{}({}, {}, num_layers={}, bias={}, batch_first={}, bidirectional={})".format(
            self.__class__.__name__,
            self.input_size,
            self.hidden_size,
            self.num_layers,
            self.has_bias,
            self.batch_first,
            self.bidirectional,
        )

    def move(self, device):
        """Record ``device`` for future state allocation, then delegate to the parent ``move``."""
        self.kwargs['device'] = device
        return super().move(device)
724 | 


--------------------------------------------------------------------------------