├── tests ├── __init__.py ├── test_pnn │ ├── __init__.py │ └── test_estimators.py ├── test_autoint │ ├── __init__.py │ ├── test_estimators.py │ └── test_modules.py ├── test_embedding │ ├── __init__.py │ ├── test_ragged │ │ ├── __init__.py │ │ └── test_common.py │ ├── test_uniform │ │ ├── __init__.py │ │ └── test_numeric.py │ ├── utils.py │ └── test_common.py ├── test_fibinet │ ├── __init__.py │ └── test_estimators.py ├── test_mlpnet │ ├── __init__.py │ ├── test_estimators.py │ └── test_modules.py ├── test_xdeepfm │ ├── __init__.py │ ├── test_estimators.py │ └── test_modules.py ├── test_base_classes │ ├── __init__.py │ └── test_modules.py ├── conftest.py ├── test_ghost_norm.py ├── common.py └── test_dataset.py ├── xynn ├── __init__.py ├── base_classes │ └── __init__.py ├── mlpnet │ ├── __init__.py │ ├── modules.py │ └── estimators.py ├── autoint │ ├── __init__.py │ └── estimators.py ├── fibinet │ ├── __init__.py │ └── estimators.py ├── xdeepfm │ ├── __init__.py │ └── estimators.py ├── pnn │ ├── __init__.py │ └── estimators.py ├── embedding │ ├── ragged │ │ ├── __init__.py │ │ ├── common.py │ │ └── fast_ragged.py │ ├── uniform │ │ ├── __init__.py │ │ ├── base.py │ │ ├── numeric.py │ │ ├── fast_categorical.py │ │ └── categorical.py │ ├── __init__.py │ ├── utils.py │ └── common.py ├── ghost_norm.py ├── dataset.py ├── preprocessing.py └── mlp.py ├── requirements ├── test.txt └── examples.txt ├── requirements.txt ├── pyproject.toml ├── .gitignore ├── README.md └── setup.py /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /xynn/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_pnn/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements/test.txt: -------------------------------------------------------------------------------- 1 | pytest>=6.0 -------------------------------------------------------------------------------- /tests/test_autoint/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_embedding/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_fibinet/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_mlpnet/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_xdeepfm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /xynn/base_classes/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements/examples.txt: 
-------------------------------------------------------------------------------- 1 | pandas>=1.2.3 -------------------------------------------------------------------------------- /tests/test_base_classes/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_embedding/test_ragged/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_embedding/test_uniform/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.20.2 2 | scikit-learn>=0.24.1 3 | torch>=1.8.1 4 | tqdm>=4.59.0 -------------------------------------------------------------------------------- /xynn/mlpnet/__init__.py: -------------------------------------------------------------------------------- 1 | from .modules import MLPNet 2 | from .estimators import MLPClassifier, MLPRegressor 3 | -------------------------------------------------------------------------------- /xynn/autoint/__init__.py: -------------------------------------------------------------------------------- 1 | from .modules import AutoInt 2 | from .estimators import AutoIntClassifier, AutoIntRegressor 3 | -------------------------------------------------------------------------------- /xynn/fibinet/__init__.py: -------------------------------------------------------------------------------- 1 | from .modules import FiBiNet 2 | from .estimators import FiBiNetRegressor, FiBiNetClassifier 3 | -------------------------------------------------------------------------------- /xynn/xdeepfm/__init__.py: -------------------------------------------------------------------------------- 1 | from .modules import XDeepFM 2 | from .estimators import XDeepFMClassifier, XDeepFMRegressor 3 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools", 4 | "wheel", 5 | ] 6 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | *.pyc 3 | .coverage 4 | __pycache__/ 5 | build/ 6 | lightning_logs/ 7 | dist/ 8 | shim.egg-info/ 9 | .vscode/ 10 | -------------------------------------------------------------------------------- /xynn/pnn/__init__.py: -------------------------------------------------------------------------------- 1 | from .modules import PNN, PNNPlus 2 | from .estimators import PNNClassifier, PNNPlusClassifier 3 | from .estimators import PNNRegressor, PNNPlusRegressor 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # XyNN: Experimental code for tabular neural networks 2 | 3 | This repo implements models from the paper [Simple Modifications to Improve Tabular Neural Networks](https://arxiv.org/abs/2108.03214). 
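An illustrative quick-start, adapted from the estimator tests in this repo (the synthetic data and hyperparameters below are placeholders, not a recommended configuration):

```python
import torch
from xynn.mlpnet import MLPRegressor

# synthetic example: 100 rows, 10 numeric fields, no categorical fields
X_num = torch.rand((100, 10)) - 0.5
y = X_num[:, 0] - X_num[:, 1] + 2 * X_num[:, 6]

model = MLPRegressor(mlp_hidden_sizes=[10, 8, 8, 6])
model.fit(
    X_num=X_num,
    X_cat=None,            # categorical input is optional
    y=y,
    optimizer=torch.optim.Adam,
    opt_kwargs={"lr": 1e-1},
    num_epochs=5,
)
```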
4 | -------------------------------------------------------------------------------- /xynn/embedding/ragged/__init__.py: -------------------------------------------------------------------------------- 1 | from .common import RaggedBase 2 | from .ragged import RaggedEmbedding, RaggedDefaultEmbedding 3 | from .fast_ragged import FastRaggedEmbedding, FastRaggedDefaultEmbedding 4 | -------------------------------------------------------------------------------- /xynn/embedding/uniform/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import UniformBase 2 | from .numeric import LinearEmbedding, DenseEmbedding 3 | from .categorical import BasicEmbedding, DefaultEmbedding 4 | from .fast_categorical import FastBasicEmbedding, FastDefaultEmbedding 5 | -------------------------------------------------------------------------------- /xynn/embedding/__init__.py: -------------------------------------------------------------------------------- 1 | from .common import EmbeddingBase 2 | from .uniform import LinearEmbedding, DenseEmbedding 3 | from .uniform import BasicEmbedding, DefaultEmbedding 4 | from .uniform import FastBasicEmbedding, FastDefaultEmbedding 5 | from .ragged import RaggedEmbedding, RaggedDefaultEmbedding 6 | from .ragged import FastRaggedEmbedding, FastRaggedDefaultEmbedding 7 | from .utils import fit_embeddings, check_embeddings, check_uniform_embeddings 8 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | def pytest_addoption(parser): 2 | parser.addoption("--device", default="cpu") 3 | 4 | 5 | def pytest_generate_tests(metafunc): 6 | # This is called for every test. Only get/set command line arguments 7 | # if the argument is specified in the list of test "fixturenames". 
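    # For example, running `pytest --device cuda` makes any test that declares a
    # "device" fixture run with "cuda"; without the flag the default is "cpu".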
8 | option_value = metafunc.config.option.device 9 | if "device" in metafunc.fixturenames and option_value is not None: 10 | metafunc.parametrize("device", [option_value]) 11 | -------------------------------------------------------------------------------- /xynn/embedding/uniform/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base class for uniform embeddings, with fixed vector size across fields 3 | 4 | """ 5 | 6 | from typing import Tuple 7 | 8 | from torch import Tensor 9 | 10 | from ..common import EmbeddingBase 11 | 12 | 13 | class UniformBase(EmbeddingBase): 14 | """Base class for embeddings that have a single vector size for all fields""" 15 | 16 | def weight_sum(self) -> Tuple[Tensor, Tensor]: 17 | """ 18 | Sum of absolute value and square of embedding weights 19 | 20 | Return 21 | ------ 22 | e1_sum : sum of absolute value of embedding values 23 | e2_sum : sum of squared embedding values 24 | """ 25 | if not self._isfit: 26 | return 0.0, 0.0 27 | e1_sum = self.embedding.weight.abs().sum() 28 | e2_sum = (self.embedding.weight ** 2).sum() 29 | return e1_sum, e2_sum 30 | -------------------------------------------------------------------------------- /tests/test_embedding/utils.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import pandas as pd 4 | from torch import nn 5 | from torch.utils.data import Dataset 6 | 7 | 8 | class Reshape(nn.Module): 9 | def forward(self, X): 10 | return X.reshape((X.shape[0], -1)) 11 | 12 | 13 | def example_data(): 14 | data = pd.DataFrame( 15 | { 16 | "num_a": [i / 10 for i in range(10)], 17 | "num_b": range(10, 0, -1), 18 | "cat_a": [0, 1, 2, 3, 0, 1, 2, 0, 1, 0], 19 | "cat_b": [0, 1, 1, 0, 1, 0, 2, 1, 0, 1], 20 | "cat_c": [1, 1, 0, 0, 1, 1, 0, np.nan, 1, 1], 21 | } 22 | ) 23 | return data 24 | 25 | 26 | class SimpleDataset(Dataset): 27 | 28 | def __init__(self, data): 29 | self.data = data 30 | 31 | def __len__(self): 32 | return len(self.data) 33 | 34 | def __getitem__(self, idx): 35 | return self.data[idx] 36 | -------------------------------------------------------------------------------- /tests/test_ghost_norm.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from torch import nn 4 | 5 | from xynn.ghost_norm import GhostBatchNorm 6 | 7 | 8 | def test_ghostbatchnorm(): 9 | gbn = GhostBatchNorm(3, 4, 0.2) 10 | assert gbn.inner_norm.num_features == 3 11 | assert gbn.inner_norm.momentum == 0.2 12 | assert gbn.virtual_batch_size == 4 13 | 14 | x = torch.tensor( 15 | [ 16 | [-1, 0, 3], 17 | [ 0, 2, -3], 18 | [ 1, -2, 0], 19 | [ 0, 0, 0], 20 | [-2, 0, 1], 21 | [ 0, 3, -1], 22 | [ 2, -3, 0], 23 | [ 0, 0, 0], 24 | [-3, 0, 2], 25 | [ 0, 1, -2], 26 | [ 3, -1, 0], 27 | [ 0, 0, 0], 28 | ], 29 | dtype=torch.float, 30 | ) 31 | out = gbn(x) 32 | expected = torch.tensor( 33 | [ 34 | [-1.4142, 0.0000, 1.4142], 35 | [ 0.0000, 1.4142, -1.4142], 36 | [ 1.4142, -1.4142, 0.0000], 37 | [ 0.0000, 0.0000, 0.0000], 38 | [-1.4142, 0.0000, 1.4142], 39 | [ 0.0000, 1.4142, -1.4142], 40 | [ 1.4142, -1.4142, 0.0000], 41 | [ 0.0000, 0.0000, 0.0000], 42 | [-1.4142, 0.0000, 1.4142], 43 | [ 0.0000, 1.4142, -1.4142], 44 | [ 1.4142, -1.4142, 0.0000], 45 | [ 0.0000, 0.0000, 0.0000], 46 | ] 47 | ) 48 | assert torch.allclose(out, expected) 49 | -------------------------------------------------------------------------------- /setup.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from typing import List 4 | 5 | from setuptools import find_packages, setup 6 | 7 | # copied from pytorch-lightning 8 | _PATH_ROOT = os.path.dirname(__file__) 9 | _PATH_REQUIRE = os.path.join(_PATH_ROOT, 'requirements') 10 | 11 | 12 | def _load_requirements(path_dir: str, file_name: str = 'requirements.txt', comment_char: str = '#') -> List[str]: 13 | """Load requirements from a file 14 | >>> _load_requirements(_PROJECT_ROOT) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE 15 | ['numpy...', 'torch...', ...] 16 | """ 17 | with open(os.path.join(path_dir, file_name), 'r') as file: 18 | lines = [ln.strip() for ln in file.readlines()] 19 | reqs = [] 20 | for ln in lines: 21 | # filer all comments 22 | if comment_char in ln: 23 | ln = ln[:ln.index(comment_char)].strip() 24 | # skip directly installed dependencies 25 | if ln.startswith('http'): 26 | continue 27 | if ln: # if requirement is not empty 28 | reqs.append(ln) 29 | return reqs 30 | 31 | # https://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-extras 32 | # Define package extras. These are only installed if you specify them. 33 | # From remote, use like `pip install pytorch-lightning[dev, docs]` 34 | # From local copy of repo, use like `pip install ".[dev, docs]"` 35 | extras = { 36 | 'examples': _load_requirements(path_dir=_PATH_REQUIRE, file_name='examples.txt'), 37 | 'test': _load_requirements(path_dir=_PATH_REQUIRE, file_name='test.txt') 38 | } 39 | extras['dev'] = extras['examples'] + extras['test'] 40 | extras['all'] = extras['dev'] 41 | 42 | 43 | setup( 44 | name='xynn', 45 | version='0.1', 46 | description='A collection of Tabular NN models with a Scikit-learn API', 47 | url='https://github.com/jrfiedler/xynn', 48 | author='James Fiedler', 49 | author_email='jrfiedler@gmail.com', 50 | license='MIT', 51 | python_requires=">=3.7", 52 | packages=find_packages(exclude=['tests','tests/*',]), 53 | zip_safe=False, 54 | keywords=['deep learning', 'pytorch', 'AI'], 55 | setup_requires=[], 56 | install_requires=_load_requirements(_PATH_ROOT), 57 | extras_require=extras, 58 | ) -------------------------------------------------------------------------------- /tests/test_embedding/test_ragged/test_common.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import torch 4 | from torch import nn 5 | import pytest 6 | 7 | from xynn.embedding.ragged.common import _check_embedding_size, _parse_embedding_size 8 | 9 | 10 | def test__check_embedding_size_raises_error_for_bad_embedding_size(): 11 | # bad name 12 | with pytest.raises( 13 | ValueError, 14 | match=( 15 | "str embedding_size value must be one of {'sqrt', 'log', 'fastai'}; " 16 | "got 'fourth_rt'" 17 | ), 18 | ): 19 | _check_embedding_size("fourth_rt") 20 | 21 | # single int not allowed 22 | with pytest.raises( 23 | TypeError, 24 | match="embedding_size 5 not understood", 25 | ): 26 | _check_embedding_size(5) 27 | 28 | # float values not allowed 29 | with pytest.raises( 30 | TypeError, 31 | match="embedding_size \[5, 10, 15.0\] not understood", 32 | ): 33 | _check_embedding_size([5, 10, 15.0]) 34 | 35 | # wrong number of ints 36 | with pytest.raises( 37 | ValueError, 38 | match="number of embeddings must match number of fields, got 3 sizes and 4 fields", 39 | ): 40 | _check_embedding_size([5, 10, 15], [10, 20, 30, 40]) 41 | 42 | 43 | def test__check_embedding_size_with_uppercase(): 44 
| assert _check_embedding_size("SQRT") == "sqrt" 45 | assert _check_embedding_size("Log") == "log" 46 | assert _check_embedding_size("FastAI") == "fastai" 47 | 48 | 49 | def test__check_embedding_size_with_ints(): 50 | assert _check_embedding_size([5, 10, 15]) == [5, 10, 15] 51 | assert _check_embedding_size((5, 10, 15)) == (5, 10, 15) 52 | 53 | 54 | def test__parse_embedding_size_with_sqrt(): 55 | output = _parse_embedding_size("sqrt", 20, [4, 25, 64, 196, 400, 625, 1600]) 56 | assert output == [2, 5, 8, 14, 20, 20, 20] 57 | 58 | 59 | def test__parse_embedding_size_with_log(): 60 | output = _parse_embedding_size("log", 7, [4, 25, 64, 196, 400, 625, 1600]) 61 | assert output == [2, 4, 5, 6, 6, 7, 7] 62 | 63 | 64 | def test__parse_embedding_size_with_fastai(): 65 | output = _parse_embedding_size("fastai", 50, [4, 25, 64, 196, 400, 625, 1600]) 66 | assert output == [3, 10, 16, 31, 46, 50, 50] 67 | 68 | 69 | def test__parse_embedding_size_with_ints(): 70 | output = _parse_embedding_size( 71 | [5, 5, 5, 5, 5, 5, 5], 20, [4, 25, 64, 196, 400, 625, 1600] 72 | ) 73 | assert output == [5] * 7 74 | -------------------------------------------------------------------------------- /xynn/ghost_norm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for Ghost Batch Norm and variations. 3 | Ghost Batch Norm: https://arxiv.org/pdf/1705.08741.pdf 4 | 5 | """ 6 | 7 | from math import ceil 8 | from typing import Union 9 | 10 | import torch 11 | from torch import Tensor 12 | from torch import nn 13 | 14 | 15 | class GhostNorm(nn.Module): 16 | """ 17 | Ghost Normalization 18 | https://arxiv.org/pdf/1705.08741.pdf 19 | 20 | """ 21 | 22 | def __init__( 23 | self, 24 | inner_norm: nn.Module, 25 | virtual_batch_size: int, 26 | device: Union[str, torch.device] = "cpu", 27 | ): 28 | """ 29 | Parameters 30 | ---------- 31 | inner_norm : torch.nn.Module (initialiezd) 32 | examples: `nn.BatchNorm1d`, `nn.LayerNorm` 33 | virtual_batch_size : int 34 | device : string or torch.device, optional 35 | default is "cpu" 36 | 37 | """ 38 | super().__init__() 39 | self.virtual_batch_size = virtual_batch_size 40 | self.inner_norm = inner_norm 41 | self.to(device) 42 | 43 | def forward(self, x: Tensor) -> Tensor: 44 | """ 45 | Transform the input tensor 46 | 47 | Parameters 48 | ---------- 49 | x : torch.Tensor 50 | 51 | Return 52 | ------ 53 | torch.Tensor 54 | 55 | """ 56 | chunk_size = int(ceil(x.shape[0] / self.virtual_batch_size)) 57 | chunk_norm = [self.inner_norm(chunk) for chunk in x.chunk(chunk_size, dim=0)] 58 | return torch.cat(chunk_norm, dim=0) 59 | 60 | 61 | class GhostBatchNorm(GhostNorm): 62 | """ 63 | Ghost Normalization, using BatchNorm1d as inner normalization 64 | https://arxiv.org/pdf/1705.08741.pdf 65 | 66 | """ 67 | 68 | def __init__( 69 | self, 70 | num_features: int, 71 | virtual_batch_size: int = 64, 72 | momentum: float = 0.1, 73 | device: Union[str, torch.device] = "cpu", 74 | ): 75 | """ 76 | Parameters 77 | ---------- 78 | num_features : int 79 | virtual_batch_size : int, optional 80 | default is 64 81 | momentum : float, optional 82 | default is 0.1 83 | device : string or torch.device, optional 84 | default is "cpu" 85 | 86 | """ 87 | super().__init__( 88 | inner_norm=nn.BatchNorm1d(num_features, momentum=momentum), 89 | virtual_batch_size=virtual_batch_size, 90 | ) 91 | -------------------------------------------------------------------------------- /tests/test_xdeepfm/test_estimators.py: 
-------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from xynn.xdeepfm import XDeepFMRegressor, XDeepFMClassifier 4 | from xynn.embedding import LinearEmbedding, DefaultEmbedding 5 | 6 | from ..common import check_estimator_learns 7 | 8 | 9 | def test_that_xdeepfmregressor_learns(): 10 | estimator = XDeepFMRegressor( 11 | mlp_hidden_sizes=[10, 8, 8, 6], 12 | mlp_use_bn=False, 13 | ) 14 | check_estimator_learns(estimator, task="regression") 15 | assert estimator.init_parameters == { 16 | "embedding_num": "auto", 17 | "embedding_cat": "auto", 18 | "embedding_l1_reg": 0.0, 19 | "embedding_l2_reg": 0.0, 20 | "cin_layer_sizes": (128, 128), 21 | "cin_activation": nn.Identity, 22 | "cin_full_agg": False, 23 | "cin_use_bn": True, 24 | "cin_bn_momentum": 0.1, 25 | "cin_use_residual": True, 26 | "cin_use_mlp": True, 27 | "mlp_hidden_sizes": [10, 8, 8, 6], 28 | "mlp_activation": nn.LeakyReLU, 29 | "mlp_use_bn": False, 30 | "mlp_bn_momentum": 0.1, 31 | "mlp_ghost_batch": None, 32 | "mlp_dropout": 0.0, 33 | "mlp_l1_reg": 0.0, 34 | "mlp_l2_reg": 0.0, 35 | "mlp_use_skip": True, 36 | "use_leaky_gate": True, 37 | "loss_fn": "auto", 38 | "seed": None, 39 | "device": "cpu", 40 | } 41 | 42 | 43 | def test_that_xdeepfmclassifier_learns(): 44 | estimator = XDeepFMClassifier( 45 | cin_layer_sizes=[64, 64], 46 | cin_activation=nn.ReLU, 47 | cin_full_agg=True, 48 | cin_use_bn=False, 49 | mlp_hidden_sizes=[10, 8, 8, 6], 50 | mlp_use_bn=False, 51 | mlp_use_skip=False, 52 | use_leaky_gate=False, 53 | ) 54 | assert estimator 55 | check_estimator_learns(estimator, task="classification") 56 | assert estimator.init_parameters == { 57 | "embedding_num": "auto", 58 | "embedding_cat": "auto", 59 | "embedding_l1_reg": 0.0, 60 | "embedding_l2_reg": 0.0, 61 | "cin_layer_sizes": [64, 64], 62 | "cin_activation": nn.ReLU, 63 | "cin_full_agg": True, 64 | "cin_use_bn": False, 65 | "cin_bn_momentum": 0.1, 66 | "cin_use_residual": True, 67 | "cin_use_mlp": True, 68 | "mlp_hidden_sizes": [10, 8, 8, 6], 69 | "mlp_activation": nn.LeakyReLU, 70 | "mlp_use_bn": False, 71 | "mlp_bn_momentum": 0.1, 72 | "mlp_ghost_batch": None, 73 | "mlp_dropout": 0.0, 74 | "mlp_l1_reg": 0.0, 75 | "mlp_l2_reg": 0.0, 76 | "mlp_use_skip": False, 77 | "use_leaky_gate": False, 78 | "loss_fn": "auto", 79 | "seed": None, 80 | "device": "cpu", 81 | } 82 | -------------------------------------------------------------------------------- /tests/test_autoint/test_estimators.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from xynn.autoint import AutoIntRegressor, AutoIntClassifier 4 | from xynn.embedding import LinearEmbedding, DefaultEmbedding 5 | 6 | from ..common import check_estimator_learns 7 | 8 | 9 | def test_that_autointregressor_learns(): 10 | estimator = AutoIntRegressor( 11 | mlp_hidden_sizes=[10, 8, 8, 6], 12 | mlp_use_bn=False, 13 | ) 14 | check_estimator_learns(estimator, task="regression") 15 | assert estimator.init_parameters == { 16 | "embedding_num": "auto", 17 | "embedding_cat": "auto", 18 | "embedding_l1_reg": 0.0, 19 | "embedding_l2_reg": 0.0, 20 | "attn_embedding_size": 8, 21 | "attn_num_layers": 3, 22 | "attn_num_heads": 2, 23 | "attn_activation": None, 24 | "attn_use_residual": True, 25 | "attn_dropout": 0.1, 26 | "attn_normalize": True, 27 | "attn_use_mlp": True, 28 | "mlp_hidden_sizes": [10, 8, 8, 6], 29 | "mlp_activation": nn.LeakyReLU, 30 | "mlp_use_bn": False, 31 | "mlp_bn_momentum": 0.1, 32 | 
"mlp_ghost_batch": None, 33 | "mlp_dropout": 0.0, 34 | "mlp_l1_reg": 0.0, 35 | "mlp_l2_reg": 0.0, 36 | "mlp_use_skip": True, 37 | "use_leaky_gate": True, 38 | "weighted_sum": True, 39 | "loss_fn": "auto", 40 | "seed": None, 41 | "device": "cpu", 42 | } 43 | 44 | 45 | def test_that_autointclassifier_learns(): 46 | estimator = AutoIntClassifier( 47 | attn_embedding_size=12, 48 | attn_activation=nn.ReLU, 49 | attn_dropout=0.0, 50 | attn_use_mlp=False, 51 | mlp_hidden_sizes=[10, 8, 8, 6], 52 | mlp_use_bn=False, 53 | mlp_use_skip=False, 54 | use_leaky_gate=False, 55 | ) 56 | assert estimator 57 | check_estimator_learns(estimator, task="classification") 58 | assert estimator.init_parameters == { 59 | "embedding_num": "auto", 60 | "embedding_cat": "auto", 61 | "embedding_l1_reg": 0.0, 62 | "embedding_l2_reg": 0.0, 63 | "attn_embedding_size": 12, 64 | "attn_num_layers": 3, 65 | "attn_num_heads": 2, 66 | "attn_activation": nn.ReLU, 67 | "attn_use_residual": True, 68 | "attn_dropout": 0.0, 69 | "attn_normalize": True, 70 | "attn_use_mlp": False, 71 | "mlp_hidden_sizes": [10, 8, 8, 6], 72 | "mlp_activation": nn.LeakyReLU, 73 | "mlp_use_bn": False, 74 | "mlp_bn_momentum": 0.1, 75 | "mlp_ghost_batch": None, 76 | "mlp_dropout": 0.0, 77 | "mlp_l1_reg": 0.0, 78 | "mlp_l2_reg": 0.0, 79 | "mlp_use_skip": False, 80 | "use_leaky_gate": False, 81 | "weighted_sum": True, 82 | "loss_fn": "auto", 83 | "seed": None, 84 | "device": "cpu", 85 | } 86 | -------------------------------------------------------------------------------- /xynn/embedding/ragged/common.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base class for ragged embeddings and helpers for parsing embedding size 3 | 4 | FastAI embedding size: arXiv preprint arXiv:2002.04688 5 | 6 | """ 7 | 8 | from collections.abc import Iterable as IterableClass 9 | from typing import List, Optional, Tuple 10 | 11 | import numpy as np 12 | from torch import nn, Tensor 13 | 14 | from ..common import EmbeddingBase 15 | 16 | 17 | def _check_embedding_size(embedding_size, num_categories=None): 18 | """Check that given `embedding_size` makes sense for ragged embeddings""" 19 | if isinstance(embedding_size, str): 20 | embedding_size = embedding_size.lower() 21 | if embedding_size not in ("sqrt", "log", "fastai"): 22 | raise ValueError( 23 | "str embedding_size value must be one of {'sqrt', 'log', 'fastai'}; " 24 | f"got '{embedding_size}'" 25 | ) 26 | elif not isinstance(embedding_size, IterableClass) or not all( 27 | isinstance(size, int) for size in embedding_size 28 | ): 29 | raise TypeError(f"embedding_size {repr(embedding_size)} not understood") 30 | elif num_categories is not None and len(embedding_size) != len(num_categories): 31 | raise ValueError( 32 | "number of embeddings must match number of fields, got " 33 | f"{len(embedding_size)} sizes and {len(num_categories)} fields" 34 | ) 35 | return embedding_size 36 | 37 | 38 | def _parse_embedding_size(embedding_size, max_size, num_categories) -> List[int]: 39 | """ 40 | Parse given `embedding_size` into a list of individual sizes, 41 | for ragged embeddings 42 | """ 43 | _check_embedding_size(embedding_size, num_categories) 44 | # calculate the individual values if "sqrt" or "log" 45 | if isinstance(embedding_size, str): 46 | num_categories = np.array(num_categories) 47 | if embedding_size == "sqrt": 48 | base_size = np.ceil(np.sqrt(num_categories)) 49 | elif embedding_size == "log": 50 | base_size = np.ceil(np.log(num_categories)) 51 | else: # embedding_size == 
"fastai": 52 | base_size = (1.6 * num_categories ** 0.56).round() 53 | clipped_size = np.clip(1, max_size, base_size).astype("int") 54 | embedding_size = list(clipped_size) 55 | else: # iterable of int 56 | pass 57 | return embedding_size 58 | 59 | 60 | class RaggedBase(EmbeddingBase): 61 | """Base class for embeddings that allow a different vector size for each field""" 62 | 63 | def __init__(self): 64 | super().__init__() 65 | self.embedding: Optional[nn.ModuleList] = None 66 | 67 | def weight_sum(self) -> Tuple[Tensor, Tensor]: 68 | """ 69 | Sum of absolute value and square of embedding weights 70 | 71 | Return 72 | ------ 73 | e1_sum : sum of absolute value of embedding values 74 | e2_sum : sum of squared embedding values 75 | """ 76 | if not self._isfit: 77 | return 0.0, 0.0 78 | e1_sum = 0.0 79 | e2_sum = 0.0 80 | for embedding in self.embedding: 81 | e1_sum += embedding.weight.abs().sum() 82 | e2_sum += (embedding.weight ** 2).sum() 83 | return e1_sum, e2_sum 84 | -------------------------------------------------------------------------------- /tests/test_mlpnet/test_estimators.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | from tempfile import NamedTemporaryFile 4 | 5 | import torch 6 | from torch import nn 7 | import numpy as np 8 | 9 | from xynn.base_classes.estimators import _set_seed 10 | from xynn.embedding import DenseEmbedding, RaggedEmbedding 11 | from xynn.mlpnet import MLPRegressor, MLPClassifier 12 | 13 | from ..common import check_estimator_learns 14 | 15 | 16 | def test_that_basic_params_are_passed_to_mlpnet_module(): 17 | X = torch.rand((100, 10)) - 0.5 18 | y = X[:, 0] - X[:, 1] + X[:, 2] - X[:, 4] + 2 * X[:, 6] - 2 * X[:, 8] 19 | estimator = MLPRegressor( 20 | embedding_cat=None, 21 | embedding_l1_reg=0.1, 22 | mlp_l2_reg=0.2, 23 | ) 24 | estimator.fit( 25 | X_num=X, 26 | X_cat=None, 27 | y=y, 28 | optimizer=torch.optim.Adam, 29 | opt_kwargs={"lr": 1e-1}, 30 | num_epochs=1, 31 | ) 32 | 33 | model = estimator._model 34 | 35 | assert model.task == "regression" 36 | assert model.num_epochs == 1 37 | assert isinstance(model.loss_fn, nn.MSELoss) 38 | assert model.embedding_num is not None 39 | assert model.embedding_cat is None 40 | assert model.embedding_l1_reg == 0.1 41 | assert model.embedding_l2_reg == 0.0 42 | assert model.mlp_l1_reg == 0.0 43 | assert model.mlp_l2_reg == 0.2 44 | assert model.optimizer is not None 45 | assert model.optimizer_info != {} 46 | assert model.scheduler == {} 47 | assert model._device == torch.device("cpu") 48 | 49 | 50 | def test_that_mlpregressor_learns(): 51 | _set_seed(10101) 52 | X = torch.rand((100, 10)) - 0.5 53 | y = X[:, 0] - X[:, 1] + X[:, 2] - X[:, 4] + 2 * X[:, 6] - 2 * X[:, 8] 54 | estimator = MLPRegressor( 55 | mlp_hidden_sizes=[10, 8, 8, 6], 56 | mlp_use_bn=False, 57 | mlp_use_skip=False, 58 | use_leaky_gate=False, 59 | ) 60 | check_estimator_learns(estimator, task="regression", data=(X, None, y)) 61 | 62 | 63 | def test_that_mlpclassifier_learns(): 64 | _set_seed(10101) 65 | X = torch.rand((100, 10)) - 0.5 66 | y_cont = X[:, 0] - X[:, 1] + X[:, 2] - X[:, 4] + 2 * X[:, 6] - 2 * X[:, 8] 67 | y = (y_cont > 0) 68 | estimator = MLPClassifier( 69 | mlp_hidden_sizes=[10, 8, 8, 6], 70 | mlp_use_bn=False, 71 | mlp_use_skip=False, 72 | use_leaky_gate=False, 73 | ) 74 | check_estimator_learns(estimator, task="regression", data=(X, None, y)) 75 | 76 | 77 | def test_that_mlpregressor_allows_dense_and_ragged_embeddings(): 78 | _set_seed(10101) 79 | estimator = 
MLPClassifier( 80 | embedding_num=DenseEmbedding(), 81 | embedding_cat=RaggedEmbedding(), 82 | mlp_hidden_sizes=[10, 8, 8, 6], 83 | mlp_use_bn=False, 84 | mlp_use_skip=False, 85 | use_leaky_gate=False, 86 | ) 87 | check_estimator_learns(estimator, task="regression") 88 | 89 | 90 | def test_that_mlpclassifier_doesnt_require_numeric_embedding(): 91 | _set_seed(10101) 92 | estimator = MLPClassifier( 93 | embedding_num=None, 94 | mlp_hidden_sizes=[10, 8, 8, 6], 95 | mlp_use_bn=False, 96 | mlp_use_skip=False, 97 | use_leaky_gate=False, 98 | ) 99 | check_estimator_learns(estimator, task="regression") 100 | -------------------------------------------------------------------------------- /tests/test_fibinet/test_estimators.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from xynn.fibinet import FiBiNetRegressor, FiBiNetClassifier 4 | from xynn.embedding import LinearEmbedding, DefaultEmbedding 5 | 6 | from ..common import check_estimator_learns 7 | 8 | 9 | def test_that_fibinetregressor_learns(): 10 | estimator = FiBiNetRegressor( 11 | mlp_hidden_sizes=[10, 8, 8, 6], 12 | mlp_use_bn=False, 13 | mlp_use_skip=False, 14 | use_leaky_gate=False, 15 | ) 16 | check_estimator_learns(estimator, task="regression") 17 | assert estimator.init_parameters == { 18 | "embedding_num": "auto", 19 | "embedding_cat": "auto", 20 | "embedding_l1_reg": 0.0, 21 | "embedding_l2_reg": 0.0, 22 | "fibi_reduction_ratio": 3, 23 | "fibi_activation": nn.LeakyReLU, 24 | "fibi_senet_product": "sym-interaction", 25 | "fibi_embed_product": "sym-interaction", 26 | "fibi_senet_skip": True, 27 | "mlp_hidden_sizes": [10, 8, 8, 6], 28 | "mlp_activation": nn.LeakyReLU, 29 | "mlp_use_bn": False, 30 | "mlp_bn_momentum": 0.1, 31 | "mlp_ghost_batch": None, 32 | "mlp_dropout": 0.0, 33 | "mlp_l1_reg": 0.0, 34 | "mlp_l2_reg": 0.0, 35 | "mlp_use_skip": False, 36 | "use_leaky_gate": False, 37 | "loss_fn": "auto", 38 | "seed": None, 39 | "device": "cpu", 40 | } 41 | 42 | 43 | def test_that_fibinetclassifier_learns(): 44 | estimator = FiBiNetClassifier( 45 | fibi_reduction_ratio=4, 46 | fibi_activation=nn.ReLU, 47 | fibi_senet_product="field-each", 48 | fibi_embed_product="shared", 49 | mlp_hidden_sizes=[10, 8, 8, 6], 50 | mlp_use_bn=False, 51 | mlp_use_skip=False, 52 | use_leaky_gate=False, 53 | ) 54 | assert estimator 55 | check_estimator_learns(estimator, task="classification") 56 | assert estimator.init_parameters == { 57 | "embedding_num": "auto", 58 | "embedding_cat": "auto", 59 | "embedding_l1_reg": 0.0, 60 | "embedding_l2_reg": 0.0, 61 | "fibi_reduction_ratio": 4, 62 | "fibi_activation": nn.ReLU, 63 | "fibi_senet_product": "field-each", 64 | "fibi_embed_product": "shared", 65 | "fibi_senet_skip": True, 66 | "mlp_hidden_sizes": [10, 8, 8, 6], 67 | "mlp_activation": nn.LeakyReLU, 68 | "mlp_use_bn": False, 69 | "mlp_bn_momentum": 0.1, 70 | "mlp_ghost_batch": None, 71 | "mlp_dropout": 0.0, 72 | "mlp_l1_reg": 0.0, 73 | "mlp_l2_reg": 0.0, 74 | "mlp_use_skip": False, 75 | "use_leaky_gate": False, 76 | "loss_fn": "auto", 77 | "seed": None, 78 | "device": "cpu", 79 | } 80 | 81 | 82 | def test_that_fibinetclassifier_learns_with_hadamard_products(): 83 | estimator = FiBiNetClassifier( 84 | fibi_reduction_ratio=4, 85 | fibi_activation=nn.ReLU, 86 | fibi_senet_product="hadamard", 87 | fibi_embed_product="shared", 88 | mlp_hidden_sizes=[10, 8, 8, 6], 89 | mlp_use_bn=False, 90 | mlp_use_skip=False, 91 | use_leaky_gate=False, 92 | ) 93 | check_estimator_learns(estimator, 
task="classification") 94 | assert estimator.init_parameters == { 95 | "embedding_num": "auto", 96 | "embedding_cat": "auto", 97 | "embedding_l1_reg": 0.0, 98 | "embedding_l2_reg": 0.0, 99 | "fibi_reduction_ratio": 4, 100 | "fibi_activation": nn.ReLU, 101 | "fibi_senet_product": "hadamard", 102 | "fibi_embed_product": "shared", 103 | "fibi_senet_skip": True, 104 | "mlp_hidden_sizes": [10, 8, 8, 6], 105 | "mlp_activation": nn.LeakyReLU, 106 | "mlp_use_bn": False, 107 | "mlp_bn_momentum": 0.1, 108 | "mlp_ghost_batch": None, 109 | "mlp_dropout": 0.0, 110 | "mlp_l1_reg": 0.0, 111 | "mlp_l2_reg": 0.0, 112 | "mlp_use_skip": False, 113 | "use_leaky_gate": False, 114 | "loss_fn": "auto", 115 | "seed": None, 116 | "device": "cpu", 117 | } 118 | -------------------------------------------------------------------------------- /xynn/mlpnet/modules.py: -------------------------------------------------------------------------------- 1 | """ 2 | PyTorch module for the MLP model 3 | 4 | """ 5 | 6 | import textwrap 7 | from typing import Union, Tuple, Callable, Optional, Type, List 8 | 9 | import torch 10 | from torch import Tensor 11 | from torch import nn 12 | 13 | from ..base_classes.modules import BaseNN, MODULE_INIT_DOC 14 | from ..mlp import MLP 15 | from ..embedding import check_embeddings 16 | from ..embedding.common import EmbeddingBase 17 | 18 | 19 | INIT_DOC = MODULE_INIT_DOC.format( 20 | textwrap.dedent( 21 | """\ 22 | num_numeric_fields : int or "auto", optional 23 | an integer must be specified when embedding_num is None; 24 | default is \"auto\"""" 25 | ) 26 | ) 27 | 28 | 29 | class MLPNet(BaseNN): 30 | """ A model consisting of just an MLP """ 31 | 32 | def __init__( 33 | self, 34 | task: str, 35 | output_size: int, 36 | embedding_num: Optional[EmbeddingBase], 37 | embedding_cat: Optional[EmbeddingBase], 38 | embedding_l1_reg: float = 0.0, 39 | embedding_l2_reg: float = 0.0, 40 | num_numeric_fields: Union[int, str] = "auto", 41 | mlp_hidden_sizes: Union[int, Tuple[int, ...], List[int]] = (512, 256, 128, 64), 42 | mlp_activation: Type[nn.Module] = nn.LeakyReLU, 43 | mlp_use_bn: bool = True, 44 | mlp_bn_momentum: float = 0.1, 45 | mlp_ghost_batch: Optional[int] = None, 46 | mlp_dropout: float = 0.0, 47 | mlp_use_skip: bool = True, 48 | mlp_l1_reg: float = 0.0, 49 | mlp_l2_reg: float = 0.0, 50 | use_leaky_gate: bool = True, 51 | weighted_sum: bool = True, 52 | loss_fn: Union[str, Callable] = "auto", 53 | device: Union[str, torch.device] = "cpu", 54 | ): 55 | super().__init__( 56 | task, 57 | embedding_num, 58 | embedding_cat, 59 | embedding_l1_reg, 60 | embedding_l2_reg, 61 | mlp_l1_reg, 62 | mlp_l2_reg, 63 | loss_fn, 64 | device, 65 | ) 66 | 67 | embed_info = check_embeddings(embedding_num, embedding_cat) 68 | 69 | if embedding_num is not None: 70 | input_size = embed_info.output_size 71 | elif not isinstance(num_numeric_fields, int): 72 | raise TypeError( 73 | "when embedding_num is None, num_numeric_fields must be an integer" 74 | ) 75 | else: 76 | input_size = embed_info.output_size + num_numeric_fields 77 | 78 | self.mlp = MLP( 79 | task, 80 | input_size=input_size, 81 | hidden_sizes=mlp_hidden_sizes, 82 | output_size=output_size, 83 | activation=mlp_activation, 84 | dropout=mlp_dropout, 85 | dropout_first=True, 86 | use_bn=mlp_use_bn, 87 | bn_momentum=mlp_bn_momentum, 88 | ghost_batch=mlp_ghost_batch, 89 | leaky_gate=use_leaky_gate, 90 | use_skip=mlp_use_skip, 91 | weighted_sum=weighted_sum, 92 | device=device, 93 | ) 94 | 95 | self.mix = self.mlp.mix 96 | #self.to(device) 97 | 98 | 
__init__.__doc__ = INIT_DOC 99 | 100 | @staticmethod 101 | def diagram(): 102 | """ Print a text diagram of this model """ 103 | gram = """\ 104 | 105 | if mlp_use_skip=True (default) 106 | ------------------------------ 107 | X_num ─ Num. embedding? ┐ ┌─── MLP ──┐ 108 | ├─┤ w+ ── output 109 | X_cat ─ Cat. embedding ─┘ └─ Linear ─┘ 110 | 111 | if mlp_use_skip=False 112 | --------------------- 113 | X_num ─ Num. embedding? ┐ 114 | ├─── MLP ── output 115 | X_cat ─ Cat. embedding ─┘ 116 | 117 | splits are copies and joins are concatenations; 118 | 'w+' is weighted element-wise addition; 119 | the numeric embedding is optional 120 | """ 121 | print("\n" + textwrap.dedent(gram)) 122 | 123 | def mlp_weight_sum(self) -> Tuple[Tensor, Tensor]: 124 | """ 125 | Sum of absolute value and square of weights in MLP layers 126 | 127 | Return 128 | ------ 129 | w1 : sum of absolute value of MLP weights 130 | w2 : sum of squared MLP weights 131 | 132 | """ 133 | return self.mlp.weight_sum() 134 | 135 | def forward(self, X_num: Tensor, X_cat: Tensor) -> Tensor: 136 | """ 137 | Transform the input tensor 138 | 139 | Parameters 140 | ---------- 141 | X_num : torch.Tensor 142 | numeric fields 143 | X_cat : torch.Tensor 144 | categorical fields 145 | 146 | Return 147 | ------ 148 | torch.Tensor 149 | 150 | """ 151 | embedded = self.embed(X_num, X_cat, num_dim=2) 152 | return self.mlp(embedded) 153 | -------------------------------------------------------------------------------- /xynn/dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple DataLoader-like class for tabular X_num, X_cat, y 3 | 4 | """ 5 | 6 | from typing import Union, Tuple, Optional 7 | 8 | import numpy as np 9 | import torch 10 | from torch import Tensor 11 | from torch.utils.data import Dataset 12 | 13 | 14 | def _validate_x(X, y, X_name, device): 15 | if isinstance(X, (Tensor, np.ndarray)): 16 | if not X.shape[0] == y.shape[0]: 17 | raise ValueError( 18 | f"shape mismatch; got y.shape[0] == {y.shape[0]}, " 19 | f"{X_name}.shape[0] == {X.shape[0]}" 20 | ) 21 | if len(X.shape) != 2: 22 | raise ValueError( 23 | f"{X_name} should be 2-d; got shape {X.shape}" 24 | ) 25 | if isinstance(X, np.ndarray): 26 | X = torch.from_numpy(X).to(dtype=torch.float32) 27 | elif X is None: 28 | X = torch.empty((y.shape[0], 0)) 29 | else: 30 | raise TypeError(f"input {X_name} should be Tensor, NumPy array, or None") 31 | return X 32 | 33 | 34 | def _validate_y(y, task, device): 35 | if isinstance(y, (Tensor, np.ndarray)): 36 | if any(size == 0 for size in y.shape): 37 | raise ValueError(f"y has a zero-sized dimension; got shape {y.shape}") 38 | 39 | if task == "regression" and len(y.shape) == 1: 40 | y = y.reshape((-1, 1)) 41 | elif task == "classification" and len(y.shape) == 2: 42 | if y.shape[1] != 1: 43 | raise ValueError("for classification y must be 1-d or 2-d with one column") 44 | y = y.reshape((-1,)) 45 | elif len(y.shape) > 2: 46 | raise ValueError(f"y has too many dimensions; got shape {y.shape}") 47 | 48 | if isinstance(y, np.ndarray): 49 | y = torch.from_numpy(y).to(dtype=torch.float32) 50 | else: 51 | raise TypeError("y should be Tensor or NumPy array") 52 | return y 53 | 54 | 55 | class TabularDataLoader: 56 | """ 57 | A DataLoader-like class that aims to be faster for tabular data. 
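    An illustrative sketch of the intended use (the tensor names here are
    placeholders, not objects defined in this module):

        loader = TabularDataLoader("regression", X_num, X_cat, y, batch_size=64, shuffle=True)
        for num_batch, cat_batch, y_batch in loader:
            ...  # each batch is an (X_num, X_cat, y) tuple of tensors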
58 | 59 | Based on `FastTensorDataLoader` by Jesse Mu 60 | https://discuss.pytorch.org/t/dataloader-much-slower-than-manual-batching/27014/6 61 | 62 | """ 63 | def __init__( 64 | self, 65 | task: str, 66 | X_num: Optional[Union[np.ndarray, Tensor]], 67 | X_cat: Optional[Union[np.ndarray, Tensor]], 68 | y: Union[np.ndarray, Tensor], 69 | batch_size: int = 32, 70 | shuffle: bool = False, 71 | device: Union[str, torch.device] = "cpu", 72 | ): 73 | """ 74 | Parameters 75 | ---------- 76 | task : {"regression", "classification"} 77 | X_num : PyTorch Tensor, NumPy array, or None 78 | numeric input fields 79 | X_cat : PyTorch Tensor, NumPy array, or None 80 | categorical input fields (represented as numeric values) 81 | y : PyTorch Tensor, NumPy array, or None 82 | target field 83 | batch_size : int, optional 84 | default is 32 85 | shuffle : bool, optional 86 | default is False 87 | device : string or torch.device, optional 88 | default is "cpu" 89 | 90 | """ 91 | if X_num is None and X_cat is None: 92 | raise TypeError("X_num and X_cat cannot both be None") 93 | 94 | self.y = _validate_y(y, task, device) 95 | self.X_num = _validate_x(X_num, self.y, "X_num", device) 96 | self.X_cat = _validate_x(X_cat, self.y, "X_cat", device) 97 | self.dataset_len = y.shape[0] 98 | self.batch_size = batch_size 99 | self.shuffle = shuffle 100 | self.device = device 101 | 102 | # Calculate # batches 103 | n_batches, remainder = divmod(self.dataset_len, self.batch_size) 104 | if remainder > 0: 105 | n_batches += 1 106 | self.n_batches = n_batches 107 | 108 | def __iter__(self): 109 | if self.shuffle: 110 | self.indices = torch.randperm(self.dataset_len) 111 | else: 112 | self.indices = None 113 | self.i = 0 114 | return self 115 | 116 | def __next__(self): 117 | if self.i >= self.dataset_len: 118 | raise StopIteration 119 | if self.indices is not None: 120 | indices = self.indices[self.i:self.i+self.batch_size] 121 | batch = ( 122 | torch.index_select(self.X_num, 0, indices).to(device=self.device), 123 | torch.index_select(self.X_cat, 0, indices).to(device=self.device), 124 | torch.index_select(self.y, 0, indices).to(device=self.device), 125 | ) 126 | else: 127 | batch = ( 128 | self.X_num[self.i:self.i+self.batch_size].to(device=self.device), 129 | self.X_cat[self.i:self.i+self.batch_size].to(device=self.device), 130 | self.y[self.i:self.i+self.batch_size].to(device=self.device), 131 | ) 132 | self.i += self.batch_size 133 | return batch 134 | 135 | def __len__(self): 136 | return self.n_batches 137 | -------------------------------------------------------------------------------- /xynn/mlpnet/estimators.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scikit-learn style classes for the MLP model 3 | 4 | """ 5 | 6 | from typing import Union, Callable, Optional, Type, List, Tuple 7 | 8 | import torch 9 | from torch import nn 10 | 11 | from ..base_classes.estimators import BaseClassifier, BaseRegressor, ESTIMATOR_INIT_DOC 12 | from ..embedding import EmbeddingBase 13 | from .modules import MLPNet 14 | 15 | 16 | INIT_DOC = ESTIMATOR_INIT_DOC.format("") 17 | 18 | 19 | class MLPClassifier(BaseClassifier): 20 | """ 21 | Scikit-learn style classification model for the MLP model 22 | 23 | """ 24 | 25 | diagram = MLPNet.diagram 26 | 27 | def __init__( 28 | self, 29 | embedding_num: Optional[Union[str, EmbeddingBase]] = "auto", 30 | embedding_cat: Optional[Union[str, EmbeddingBase]] = "auto", 31 | embedding_l1_reg: float = 0.0, 32 | embedding_l2_reg: float = 0.0, 33 | 
mlp_hidden_sizes: Union[int, Tuple[int, ...], List[int]] = (512, 256, 128, 64), 34 | mlp_activation: Type[nn.Module] = nn.LeakyReLU, 35 | mlp_use_bn: bool = True, 36 | mlp_bn_momentum: float = 0.1, 37 | mlp_ghost_batch: Optional[int] = None, 38 | mlp_dropout: float = 0.0, 39 | mlp_l1_reg: float = 0.0, 40 | mlp_l2_reg: float = 0.0, 41 | mlp_use_skip: bool = True, 42 | use_leaky_gate: bool = True, 43 | weighted_sum: bool = True, 44 | loss_fn: Union[str, Callable] = "auto", 45 | seed: Union[int, None] = None, 46 | device: Union[str, torch.device] = "cpu", 47 | ): 48 | super().__init__( 49 | embedding_num=embedding_num, 50 | embedding_cat=embedding_cat, 51 | embedding_l1_reg=embedding_l1_reg, 52 | embedding_l2_reg=embedding_l2_reg, 53 | mlp_hidden_sizes=mlp_hidden_sizes, 54 | mlp_activation=mlp_activation, 55 | mlp_use_bn=mlp_use_bn, 56 | mlp_bn_momentum=mlp_bn_momentum, 57 | mlp_ghost_batch=mlp_ghost_batch, 58 | mlp_dropout=mlp_dropout, 59 | mlp_l1_reg=mlp_l1_reg, 60 | mlp_l2_reg=mlp_l2_reg, 61 | mlp_use_skip=mlp_use_skip, 62 | use_leaky_gate=use_leaky_gate, 63 | weighted_sum=weighted_sum, 64 | loss_fn=loss_fn, 65 | seed=seed, 66 | device=device, 67 | ) 68 | self._model_class = MLPNet 69 | self._require_numeric_embedding = False 70 | 71 | __init__.__doc__ = INIT_DOC 72 | 73 | def _create_model(self): 74 | self._model = self._model_class( 75 | task="classification", 76 | output_size=len(self.classes), 77 | embedding_num=self.embedding_num, 78 | embedding_cat=self.embedding_cat, 79 | num_numeric_fields=self._num_numeric_fields, 80 | loss_fn=self.loss_fn, 81 | device=self._device, 82 | **self.model_kwargs 83 | ) 84 | 85 | 86 | class MLPRegressor(BaseRegressor): 87 | """ 88 | Scikit-learn style regression model for the MLP model 89 | 90 | """ 91 | 92 | diagram = MLPNet.diagram 93 | 94 | def __init__( 95 | self, 96 | embedding_num: Optional[Union[str, EmbeddingBase]] = "auto", 97 | embedding_cat: Optional[Union[str, EmbeddingBase]] = "auto", 98 | embedding_l1_reg: float = 0.0, 99 | embedding_l2_reg: float = 0.0, 100 | mlp_hidden_sizes: Union[int, Tuple[int, ...], List[int]] = (512, 256, 128, 64), 101 | mlp_activation: Type[nn.Module] = nn.LeakyReLU, 102 | mlp_use_bn: bool = True, 103 | mlp_bn_momentum: float = 0.1, 104 | mlp_ghost_batch: Optional[int] = None, 105 | mlp_dropout: float = 0.0, 106 | mlp_l1_reg: float = 0.0, 107 | mlp_l2_reg: float = 0.0, 108 | mlp_use_skip: bool = True, 109 | use_leaky_gate: bool = True, 110 | weighted_sum: bool = True, 111 | loss_fn: Union[str, Callable] = "auto", 112 | seed: Optional[int] = None, 113 | device: Union[str, torch.device] = "cpu", 114 | ): 115 | super().__init__( 116 | embedding_num=embedding_num, 117 | embedding_cat=embedding_cat, 118 | embedding_l1_reg=embedding_l1_reg, 119 | embedding_l2_reg=embedding_l2_reg, 120 | mlp_hidden_sizes=mlp_hidden_sizes, 121 | mlp_activation=mlp_activation, 122 | mlp_use_bn=mlp_use_bn, 123 | mlp_bn_momentum=mlp_bn_momentum, 124 | mlp_ghost_batch=mlp_ghost_batch, 125 | mlp_dropout=mlp_dropout, 126 | mlp_l1_reg=mlp_l1_reg, 127 | mlp_l2_reg=mlp_l2_reg, 128 | mlp_use_skip=mlp_use_skip, 129 | use_leaky_gate=use_leaky_gate, 130 | weighted_sum=weighted_sum, 131 | loss_fn=loss_fn, 132 | seed=seed, 133 | device=device, 134 | ) 135 | self._model_class = MLPNet 136 | self._require_numeric_embedding = False 137 | 138 | __init__.__doc__ = INIT_DOC 139 | 140 | def _create_model(self): 141 | self._model = self._model_class( 142 | task="regression", 143 | output_size=self.num_targets, 144 | embedding_num=self.embedding_num, 145 | 
embedding_cat=self.embedding_cat, 146 | num_numeric_fields=self._num_numeric_fields, 147 | loss_fn=self.loss_fn, 148 | device=self._device, 149 | **self.model_kwargs, 150 | ) 151 | -------------------------------------------------------------------------------- /tests/test_embedding/test_common.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import torch 4 | import pytest 5 | 6 | from xynn.embedding.common import _isnan, _isnan_index, _unique, _value_counts 7 | from .utils import example_data 8 | 9 | 10 | def test__isnan(): 11 | assert _isnan(float("nan")) 12 | assert not _isnan("NaN") 13 | assert not _isnan(20.22) 14 | assert not _isnan(20122) 15 | 16 | 17 | def test__isnan_index_with_simple_example(): 18 | data = pd.Series([10, 8, 6, 4, 2], index=[0, np.nan, 1, 4, np.nan]) 19 | assert np.all(_isnan_index(data) == [False, True, False, False, True]) 20 | 21 | 22 | def test_that__unique_raises_error_on_bad_input(): 23 | msg = "input should be Pandas DataFrame, NumPy array, or PyTorch Tensor" 24 | with pytest.raises(TypeError, match=msg): 25 | _unique([10, 8, 6, 8, 4, 2, 0, 10, 2, 0, 2]) 26 | 27 | 28 | def test__unique_on_numpy_examples(): 29 | data = example_data()[["cat_a", "cat_b", "cat_c"]] 30 | data["cat_d"] = ["a", "b", np.nan, "c", "a", np.nan, np.nan, "a", "b", "c"] 31 | data["cat_e"] = [1, 1, np.nan, 0, 0, 0, np.nan, 1, 1, 0] 32 | uniques, has_nan = _unique(data.values) 33 | assert [set(values) for values in uniques] == [ 34 | set([0, 1, 2, 3]), set([0, 1, 2]), set([0, 1]), set("abc"), set([0, 1]) 35 | ] 36 | assert has_nan == [False, False, True, True, True] 37 | 38 | data = example_data()[["cat_c"]] 39 | data["cat_e"] = [1, 1, np.nan, 0, 0, 0, np.nan, 1, 1, 0] 40 | uniques, has_nan = _unique(data.values) 41 | assert [set(values) for values in uniques] == [set([0, 1]), set([0, 1])] 42 | assert has_nan == [True, True] 43 | 44 | 45 | def test__unique_on_pandas_examples(): 46 | data = example_data()[["cat_a", "cat_b", "cat_c"]] 47 | data["cat_d"] = ["a", "b", np.nan, "c", "a", np.nan, np.nan, "a", "b", "c"] 48 | data["cat_e"] = [1, 1, np.nan, 0, 0, 0, np.nan, 1, 1, 0] 49 | uniques, has_nan = _unique(data) 50 | assert [set(values) for values in uniques] == [ 51 | set([0, 1, 2, 3]), set([0, 1, 2]), set([0, 1]), set("abc"), set([0, 1]) 52 | ] 53 | assert has_nan == [False, False, True, True, True] 54 | 55 | data = example_data()[["cat_c"]] 56 | data["cat_e"] = [1, 1, np.nan, 0, 0, 0, np.nan, 1, 1, 0] 57 | uniques, has_nan = _unique(data) 58 | assert [set(values) for values in uniques] == [set([0, 1]), set([0, 1])] 59 | assert has_nan == [True, True] 60 | 61 | 62 | def test__unique_on_tensor_example(): 63 | data = example_data()[["cat_a", "cat_b", "cat_c"]] 64 | data["cat_e"] = [1, 1, np.nan, 0, 0, 0, np.nan, 1, 1, 0] 65 | uniques, has_nan = _unique(torch.from_numpy(data.values)) 66 | assert [set(values) for values in uniques] == [ 67 | set([0, 1, 2, 3]), set([0, 1, 2]), set([0, 1]), set([0, 1]) 68 | ] 69 | assert has_nan == [False, False, True, True] 70 | 71 | data = example_data()[["cat_c"]] 72 | data["cat_e"] = [1, 1, np.nan, 0, 0, 0, np.nan, 1, 1, 0] 73 | uniques, has_nan = _unique(torch.from_numpy(data.values)) 74 | assert [set(values) for values in uniques] == [set([0, 1]), set([0, 1])] 75 | assert has_nan == [True, True] 76 | 77 | 78 | def test_that__value_counts_raises_error_on_bad_input(): 79 | msg = "input should be Pandas DataFrame, NumPy array, or PyTorch Tensor" 80 | with 
pytest.raises(TypeError, match=msg): 81 | _value_counts([10, 8, 6, 8, 4, 2, 0, 10, 2, 0, 2]) 82 | 83 | 84 | def test__value_counts_on_numpy_examples(): 85 | data = example_data()[["cat_a", "cat_b", "cat_c"]] 86 | data["cat_d"] = ["a", "b", np.nan, "c", "a", np.nan, np.nan, "a", "b", "c"] 87 | data["cat_e"] = [1, 1, np.nan, 0, 0, 0, np.nan, 1, 1, 0] 88 | unique_counts, nan_counts = _value_counts(data.values) 89 | assert unique_counts == [ 90 | {0: 4, 1: 3, 2: 2, 3: 1}, 91 | {0: 4, 1: 5, 2: 1}, 92 | {0: 3, 1: 6}, 93 | {"a": 3, "b": 2, "c": 2}, 94 | {0: 4, 1: 4}, 95 | ] 96 | assert nan_counts == [0, 0, 1, 3, 2] 97 | 98 | data = example_data()[["cat_c"]] 99 | data["cat_e"] = [1, 1, np.nan, 0, 0, 0, np.nan, 1, 1, 0] 100 | unique_counts, nan_counts = _value_counts(data.values) 101 | assert unique_counts == [{0: 3, 1: 6}, {0: 4, 1: 4}] 102 | assert nan_counts == [1, 2] 103 | 104 | 105 | def test__value_counts_on_pandas_examples(): 106 | data = example_data()[["cat_a", "cat_b", "cat_c"]] 107 | data["cat_d"] = ["a", "b", np.nan, "c", "a", np.nan, np.nan, "a", "b", "c"] 108 | data["cat_e"] = [1, 1, np.nan, 0, 0, 0, np.nan, 1, 1, 0] 109 | unique_counts, nan_counts = _value_counts(data) 110 | assert unique_counts == [ 111 | {0: 4, 1: 3, 2: 2, 3: 1}, 112 | {0: 4, 1: 5, 2: 1}, 113 | {0: 3, 1: 6}, 114 | {"a": 3, "b": 2, "c": 2}, 115 | {0: 4, 1: 4}, 116 | ] 117 | assert nan_counts == [0, 0, 1, 3, 2] 118 | 119 | data = example_data()[["cat_c"]] 120 | data["cat_e"] = [1, 1, np.nan, 0, 0, 0, np.nan, 1, 1, 0] 121 | unique_counts, nan_counts = _value_counts(data.values) 122 | assert unique_counts == [{0: 3, 1: 6}, {0: 4, 1: 4}] 123 | assert nan_counts == [1, 2] 124 | 125 | 126 | def test__value_counts_on_tensor_examples(): 127 | data = example_data()[["cat_a", "cat_b", "cat_c"]] 128 | data["cat_e"] = [1, 1, np.nan, 0, 0, 0, np.nan, 1, 1, 0] 129 | unique_counts, nan_counts = _value_counts(torch.from_numpy(data.values)) 130 | assert unique_counts == [ 131 | {0: 4, 1: 3, 2: 2, 3: 1}, {0: 4, 1: 5, 2: 1}, {0: 3, 1: 6}, {0: 4, 1: 4} 132 | ] 133 | assert nan_counts == [0, 0, 1, 2] 134 | 135 | data = example_data()[["cat_c"]] 136 | data["cat_e"] = [1, 1, np.nan, 0, 0, 0, np.nan, 1, 1, 0] 137 | unique_counts, nan_counts = _value_counts(torch.from_numpy(data.values)) 138 | assert unique_counts == [{0: 3, 1: 6}, {0: 4, 1: 4}] 139 | assert nan_counts == [1, 2] 140 | -------------------------------------------------------------------------------- /xynn/embedding/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions for fitting embeddings or checking embeddings 3 | 4 | """ 5 | from typing import Tuple, Optional 6 | from collections import namedtuple 7 | 8 | from torch.utils.data import DataLoader 9 | 10 | from .common import _linear_agg, _unique_agg, _value_counts_agg, EmbeddingBase 11 | from .uniform import UniformBase, LinearEmbedding, BasicEmbedding 12 | from .ragged import RaggedEmbedding 13 | 14 | 15 | EmbeddingInfo = namedtuple("EmbeddingInfo", ["num_fields", "output_size"]) 16 | UniformEmbeddingInfo = namedtuple( 17 | "EmbeddingInfo", ["num_fields", "embedding_size", "output_size"] 18 | ) 19 | 20 | 21 | def _init_embed_info(embedding): 22 | if isinstance(embedding, (LinearEmbedding)): 23 | info_1 = 0 24 | info_2 = None 25 | agg_fn = _linear_agg 26 | else: 27 | info_1 = [] 28 | info_2 = [] 29 | if isinstance(embedding, (BasicEmbedding, RaggedEmbedding)): 30 | agg_fn = _unique_agg 31 | else: 32 | agg_fn = _value_counts_agg 33 | return info_1, info_2, agg_fn 34 
| 35 | 36 | def fit_embeddings( 37 | data: DataLoader, 38 | embedding_num: Optional[EmbeddingBase], 39 | embedding_cat: Optional[EmbeddingBase], 40 | ) -> Tuple[Optional[EmbeddingBase], Optional[EmbeddingBase]]: 41 | """ 42 | Create the internal embedding info for the given numerical and categorical 43 | embeddings. 44 | 45 | Note: the passed embeddings are modified in place 46 | 47 | Parameters 48 | ---------- 49 | data : torch.utils.data.DataLoader 50 | embedding_num : initialized embedding or None 51 | embedding_cat : initialized embedding or None 52 | 53 | Returns 54 | ------- 55 | embedding_num, embedding_cat inputs, modified in place 56 | 57 | """ 58 | if embedding_num is None and embedding_cat is None: 59 | return None, None 60 | 61 | # get initial values and aggregation functions 62 | if embedding_num is not None: 63 | num_info_1, num_info_2, num_agg_fn = _init_embed_info(embedding_num) 64 | if embedding_cat is not None: 65 | cat_info_1, cat_info_2, cat_agg_fn = _init_embed_info(embedding_cat) 66 | 67 | # iterate and update the initial aggregation values/objects 68 | for batch in data: 69 | if embedding_num is not None: 70 | num_info_1, num_info_2 = num_agg_fn(num_info_1, num_info_2, batch[0]) 71 | if embedding_cat is not None: 72 | cat_info_1, cat_info_2 = cat_agg_fn(cat_info_1, cat_info_2, batch[1]) 73 | 74 | # use aggregated values to set the embeddings 75 | if embedding_num is None: 76 | pass 77 | elif isinstance(embedding_num, LinearEmbedding): 78 | embedding_num.from_summary(num_info_1) 79 | else: 80 | embedding_num.from_summary(num_info_1, num_info_2) 81 | 82 | if embedding_cat is not None: 83 | embedding_cat.from_summary(cat_info_1, cat_info_2) 84 | 85 | return embedding_num, embedding_cat 86 | 87 | 88 | def _check_is_uniform(embedding, name): 89 | if embedding is None: 90 | return 91 | if not isinstance(embedding, UniformBase): 92 | raise TypeError( 93 | "only 'uniform' embeddings are allowed for this model; " 94 | f"{name} is not a uniform embedding" 95 | ) 96 | 97 | 98 | def check_uniform_embeddings( 99 | embedding_num: Optional[EmbeddingBase], 100 | embedding_cat: Optional[EmbeddingBase], 101 | ) -> EmbeddingInfo: 102 | """ 103 | Check that embeddings are uniform, are not both None, and have same 104 | embedding_size 105 | 106 | Parameters 107 | ---------- 108 | embedding_num : XyNN embedding or None 109 | embedding_cat : XyNN embedding or None 110 | 111 | Return 112 | ------ 113 | UniformEmbeddingInfo NamedTuple containing 114 | - num_fields 115 | - embedding_size 116 | - output_size = num_fields * embedding_size 117 | 118 | """ 119 | # check embedding sizes and get derived values 120 | if embedding_num is None and embedding_cat is None: 121 | raise ValueError("embedding_num and embedding_cat cannot both be None") 122 | 123 | _check_is_uniform(embedding_num, "embedding_num") 124 | _check_is_uniform(embedding_cat, "embedding_cat") 125 | 126 | if ( 127 | embedding_num is not None 128 | and embedding_cat is not None 129 | and not embedding_num.embedding_size == embedding_cat.embedding_size 130 | ): 131 | raise ValueError( 132 | "embedding sizes must be the same for numeric and catgorical; got " 133 | f"{embedding_num.embedding_size} and {embedding_cat.embedding_size}" 134 | ) 135 | 136 | num_fields = 0 137 | if embedding_num is not None: 138 | num_fields += embedding_num.num_fields 139 | embedding_size = embedding_num.embedding_size 140 | 141 | if embedding_cat is not None: 142 | num_fields += embedding_cat.num_fields 143 | embedding_size = embedding_cat.embedding_size 
144 | 145 | return UniformEmbeddingInfo(num_fields, embedding_size, num_fields * embedding_size) 146 | 147 | 148 | def check_embeddings( 149 | embedding_num: Optional[EmbeddingBase], 150 | embedding_cat: Optional[EmbeddingBase], 151 | ) -> EmbeddingInfo: 152 | """ 153 | Return combined embedding info 154 | 155 | Parameters 156 | ---------- 157 | embedding_num : XyNN embedding or None 158 | embedding_cat : XyNN embedding or None 159 | 160 | Return 161 | ------ 162 | EmbeddingInfo NamedTuple containing 163 | - num_fields 164 | - output_size = sum of individual output sizes 165 | 166 | """ 167 | # get number of fields and total output size 168 | if embedding_num is None and embedding_cat is None: 169 | return EmbeddingInfo(0, 0) 170 | 171 | num_fields = 0 172 | output_size = 0 173 | if embedding_num is not None: 174 | num_fields += embedding_num.num_fields 175 | output_size += embedding_num.output_size 176 | 177 | if embedding_cat is not None: 178 | num_fields += embedding_cat.num_fields 179 | output_size += embedding_cat.output_size 180 | 181 | return EmbeddingInfo(num_fields, output_size) 182 | -------------------------------------------------------------------------------- /xynn/embedding/uniform/numeric.py: -------------------------------------------------------------------------------- 1 | """ 2 | Classes for embedding numeric fields 3 | 4 | LinearEmbedding 5 | - embed each numeric *field* with a vector; for each numeric value, multiply the 6 | field vector by the value 7 | DenseEmbedding 8 | - a dense linear layer followed by an activation 9 | 10 | """ 11 | 12 | from typing import Union, List, Optional, Tuple, Type 13 | from functools import reduce 14 | import operator 15 | 16 | import torch 17 | from torch import Tensor 18 | from torch import nn 19 | 20 | from .base import UniformBase 21 | 22 | 23 | class LinearEmbedding(UniformBase): 24 | """ 25 | An embedding for numeric fields. There is one embedded vector for each field. 26 | The embedded vector for a value is that value times its field's vector. 
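
    For example, after `from_summary(3)` with `embedding_size=4`, an input of
    shape (batch, 3) is embedded to a tensor of shape (batch, 3, 4).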
27 | 28 | """ 29 | 30 | def __init__(self, embedding_size: int = 10, device: Union[str, torch.device] = "cpu"): 31 | """ 32 | Parameters 33 | ---------- 34 | embedding_size : int, optional 35 | size of each value's embedding vector; default is 10 36 | device : string or torch.device 37 | 38 | """ 39 | super().__init__() 40 | self.num_fields = 0 41 | self.output_size = 0 42 | self.embedding: Optional[nn.Embedding] = None 43 | self.embedding_size = embedding_size 44 | self._device = device 45 | self.to(device) 46 | self._isfit = False 47 | 48 | def __repr__(self): 49 | return f"LinearEmbedding({self.embedding_size}, {repr(self._device)})" 50 | 51 | def from_summary(self, num_fields: int): 52 | """ 53 | Create the embedding for the given number of fields 54 | 55 | Parameters 56 | ---------- 57 | num_fields : int 58 | 59 | Return 60 | ------ 61 | self 62 | 63 | """ 64 | self.num_fields = num_fields 65 | self.output_size = num_fields * self.embedding_size 66 | self.embedding = nn.Embedding(num_fields, self.embedding_size).to(device=self._device) 67 | nn.init.xavier_uniform_(self.embedding.weight) 68 | 69 | self._isfit = True 70 | 71 | return self 72 | 73 | def _fit_array(self, X): 74 | self.from_summary(X.shape[1]) 75 | 76 | def _fit_iterable(self, X): 77 | for batch in X: 78 | self._fit_array(batch) 79 | break 80 | 81 | def forward(self, X: Tensor) -> Tensor: 82 | """ 83 | Produce embedding for each value in input 84 | 85 | Parameters 86 | ---------- 87 | X : torch.Tensor 88 | 89 | Return 90 | ------ 91 | torch.Tensor 92 | 93 | """ 94 | if not self._isfit: 95 | raise RuntimeError("need to call `fit` or `from_summary` first") 96 | return self.embedding.weight * X.unsqueeze(dim=-1) 97 | 98 | 99 | class DenseEmbedding(UniformBase): 100 | """ 101 | An embedding for numeric fields, consisting of just a linear transformation with 102 | an activation. 
Maps an input with shape n_rows * n_fields to an output with shape 103 | n_rows * 1 * embedding_size if one value passed for embedding_size or 104 | n_rows * embeddin_size[0] * embedding_size[1] if two values are passed 105 | 106 | """ 107 | 108 | def __init__( 109 | self, 110 | embedding_size: Union[int, Tuple[int, ...], List[int]] = 10, 111 | activation: Type[nn.Module] = nn.LeakyReLU, 112 | device: Union[str, torch.device] = "cpu", 113 | ): 114 | """ 115 | Parameters 116 | ---------- 117 | embedding_size : int, tuple of ints, or list of ints; optional 118 | size of each value's embedding vector; default is 10 119 | activation : subclass of torch.nn.Module, optional 120 | default is nn.LeakyReLU 121 | device : string or torch.device 122 | 123 | """ 124 | super().__init__() 125 | 126 | if isinstance(embedding_size, int): 127 | embedding_size = (1, embedding_size) 128 | elif len(embedding_size) == 1: 129 | embedding_size = (1, embedding_size[0]) 130 | 131 | self.num_fields = 0 132 | self.output_size = 0 133 | self.embedding_w: Optional[nn.Parameter] = None 134 | self.embedding_b: Optional[nn.Parameter] = None 135 | self.dense_out_size = embedding_size 136 | self.embedding_size = embedding_size[-1] 137 | self.activation = activation().to(device=device) 138 | self._device = device 139 | self.to(device) 140 | self._isfit = False 141 | 142 | def __repr__(self): 143 | dense_size = self.dense_out_size 144 | activation = self.activation.__class__.__name__ 145 | device = repr(self._device) 146 | return f"DenseEmbedding({dense_size}, {activation}, {device})" 147 | 148 | def from_summary(self, num_fields: int): 149 | """ 150 | Create the embedding for the given number of fields 151 | 152 | Parameters 153 | ---------- 154 | num_fields : int 155 | 156 | Return 157 | ------ 158 | self 159 | 160 | """ 161 | self.num_fields = num_fields 162 | self.output_size = reduce(operator.mul, self.dense_out_size, 1) 163 | self.embedding_w = nn.Parameter( 164 | torch.zeros((num_fields, *self.dense_out_size)) 165 | ).to(device=self._device) 166 | self.embedding_b = nn.Parameter( 167 | torch.zeros(self.dense_out_size) 168 | ).to(device=self._device) 169 | nn.init.xavier_uniform_(self.embedding_w) 170 | 171 | self._isfit = True 172 | 173 | return self 174 | 175 | def _fit_array(self, X): 176 | self.from_summary(X.shape[1]) 177 | 178 | def _fit_iterable(self, X): 179 | for batch in X: 180 | self._fit_array(batch) 181 | break 182 | 183 | def forward(self, X: Tensor) -> Tensor: 184 | """ 185 | Produce embedding for each value in input 186 | 187 | Parameters 188 | ---------- 189 | X : torch.Tensor 190 | 191 | Return 192 | ------ 193 | torch.Tensor 194 | 195 | """ 196 | if not self._isfit: 197 | raise RuntimeError("need to call `fit` or `from_summary` first") 198 | embedded = self.embedding_w.T.matmul(X.T.to(dtype=torch.float)).T + self.embedding_b 199 | embedded = self.activation(embedded.reshape((X.shape[0], -1))) 200 | return embedded.reshape((X.shape[0], *self.dense_out_size)) 201 | -------------------------------------------------------------------------------- /xynn/xdeepfm/estimators.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scikit-learn style classes for the xDeepFM model 3 | 4 | """ 5 | 6 | import textwrap 7 | from typing import Type, Union, Callable, Tuple, List, Optional 8 | 9 | import torch 10 | from torch import nn 11 | 12 | from .modules import XDeepFM 13 | from ..base_classes.estimators import BaseClassifier, BaseRegressor, ESTIMATOR_INIT_DOC 14 | from 
..embedding import EmbeddingBase 15 | 16 | 17 | INIT_DOC = ESTIMATOR_INIT_DOC.format( 18 | textwrap.dedent( 19 | """\ 20 | cin_layer_sizes : int, tuple of int or list of int; optional 21 | if `cin_full_agg` is False, all sizes except the last must be even; 22 | default is (128, 128) 23 | cin_activation : subclass of torch.nn.Module, optional 24 | default is nn.Identity 25 | cin_full_agg : bool, optional 26 | if True, each intermediate output is aggregated in the final CIN output; 27 | if False, half of each intermediate output is aggregated; 28 | default is False 29 | cin_use_bn : bool, optional 30 | default is True 31 | cin_bn_momentum: float, optional 32 | default is 0.1 33 | cin_use_residual: bool, optional 34 | whether to use a skip connection from CIN to output; default is True 35 | cin_use_mlp : bool, optional 36 | default is True""" 37 | ) 38 | ) 39 | 40 | 41 | class XDeepFMClassifier(BaseClassifier): 42 | """ 43 | Scikit-learn style classification model for the xDeepFM model 44 | 45 | """ 46 | 47 | diagram = XDeepFM.diagram 48 | 49 | def __init__( 50 | self, 51 | embedding_num: Optional[Union[str, EmbeddingBase]] = "auto", 52 | embedding_cat: Optional[Union[str, EmbeddingBase]] = "auto", 53 | embedding_l1_reg: float=0.0, 54 | embedding_l2_reg: float=0.0, 55 | cin_layer_sizes: Union[int, Tuple[int, ...], List[int]] = (128, 128), 56 | cin_activation: Type[nn.Module] = nn.Identity, 57 | cin_full_agg: bool = False, 58 | cin_use_bn: bool = True, 59 | cin_bn_momentum: float = 0.1, 60 | cin_use_residual: bool = True, 61 | cin_use_mlp: bool = True, 62 | mlp_hidden_sizes: Union[int, Tuple[int, ...], List[int]] = (512, 256, 128, 64), 63 | mlp_activation: Type[nn.Module] = nn.LeakyReLU, 64 | mlp_use_bn: bool = True, 65 | mlp_bn_momentum: float = 0.1, 66 | mlp_ghost_batch: Optional[int] = None, 67 | mlp_dropout: float = 0.0, 68 | mlp_l1_reg: float = 0.0, 69 | mlp_l2_reg: float = 0.0, 70 | mlp_use_skip: bool = True, 71 | use_leaky_gate: bool = True, 72 | loss_fn: Union[str, Callable] = "auto", 73 | seed: Union[int, None] = None, 74 | device: Union[str, torch.device] = "cpu", 75 | ): 76 | super().__init__( 77 | embedding_num=embedding_num, 78 | embedding_cat=embedding_cat, 79 | embedding_l1_reg=embedding_l1_reg, 80 | embedding_l2_reg=embedding_l2_reg, 81 | cin_layer_sizes=cin_layer_sizes, 82 | cin_activation=cin_activation, 83 | cin_full_agg=cin_full_agg, 84 | cin_use_bn=cin_use_bn, 85 | cin_bn_momentum=cin_bn_momentum, 86 | cin_use_residual=cin_use_residual, 87 | cin_use_mlp=cin_use_mlp, 88 | mlp_hidden_sizes=mlp_hidden_sizes, 89 | mlp_activation=mlp_activation, 90 | mlp_use_bn=mlp_use_bn, 91 | mlp_bn_momentum=mlp_bn_momentum, 92 | mlp_ghost_batch=mlp_ghost_batch, 93 | mlp_dropout=mlp_dropout, 94 | mlp_l1_reg=mlp_l1_reg, 95 | mlp_l2_reg=mlp_l2_reg, 96 | mlp_use_skip=mlp_use_skip, 97 | use_leaky_gate=use_leaky_gate, 98 | loss_fn=loss_fn, 99 | seed=seed, 100 | device=device, 101 | ) 102 | self._model_class = XDeepFM 103 | 104 | __init__.__doc__ = INIT_DOC 105 | 106 | 107 | class XDeepFMRegressor(BaseRegressor): 108 | """ 109 | Scikit-learn style regression model for the xDeepFM model 110 | 111 | """ 112 | 113 | diagram = XDeepFM.diagram 114 | 115 | def __init__( 116 | self, 117 | embedding_num: Optional[Union[str, EmbeddingBase]] = "auto", 118 | embedding_cat: Optional[Union[str, EmbeddingBase]] = "auto", 119 | embedding_l1_reg: float=0.0, 120 | embedding_l2_reg: float=0.0, 121 | cin_layer_sizes: Union[int, Tuple[int, ...], List[int]] = (128, 128), 122 | cin_activation: Type[nn.Module] = 
nn.Identity, 123 | cin_full_agg: bool = False, 124 | cin_use_bn: bool = True, 125 | cin_bn_momentum: float = 0.1, 126 | cin_use_residual: bool = True, 127 | cin_use_mlp: bool = True, 128 | mlp_hidden_sizes: Union[int, Tuple[int, ...], List[int]] = (512, 256, 128, 64), 129 | mlp_activation: Type[nn.Module] = nn.LeakyReLU, 130 | mlp_use_bn: bool = True, 131 | mlp_bn_momentum: float = 0.1, 132 | mlp_ghost_batch: Optional[int] = None, 133 | mlp_dropout: float = 0.0, 134 | mlp_l1_reg: float = 0.0, 135 | mlp_l2_reg: float = 0.0, 136 | mlp_use_skip: bool = True, 137 | use_leaky_gate: bool = True, 138 | loss_fn: Union[str, Callable] = "auto", 139 | seed: Union[int, None] = None, 140 | device: Union[str, torch.device] = "cpu", 141 | ): 142 | super().__init__( 143 | embedding_num=embedding_num, 144 | embedding_cat=embedding_cat, 145 | embedding_l1_reg=embedding_l1_reg, 146 | embedding_l2_reg=embedding_l2_reg, 147 | cin_layer_sizes=cin_layer_sizes, 148 | cin_activation=cin_activation, 149 | cin_full_agg=cin_full_agg, 150 | cin_use_bn=cin_use_bn, 151 | cin_bn_momentum=cin_bn_momentum, 152 | cin_use_residual=cin_use_residual, 153 | cin_use_mlp=cin_use_mlp, 154 | mlp_hidden_sizes=mlp_hidden_sizes, 155 | mlp_activation=mlp_activation, 156 | mlp_use_bn=mlp_use_bn, 157 | mlp_bn_momentum=mlp_bn_momentum, 158 | mlp_ghost_batch=mlp_ghost_batch, 159 | mlp_dropout=mlp_dropout, 160 | mlp_l1_reg=mlp_l1_reg, 161 | mlp_l2_reg=mlp_l2_reg, 162 | mlp_use_skip=mlp_use_skip, 163 | use_leaky_gate=use_leaky_gate, 164 | loss_fn=loss_fn, 165 | seed=seed, 166 | device=device, 167 | ) 168 | self._model_class = XDeepFM 169 | 170 | __init__.__doc__ = INIT_DOC 171 | -------------------------------------------------------------------------------- /tests/test_mlpnet/test_modules.py: -------------------------------------------------------------------------------- 1 | 2 | import random 3 | 4 | import pytest 5 | import torch 6 | from torch import nn 7 | import numpy as np 8 | 9 | from xynn.base_classes.estimators import _set_seed 10 | from xynn.base_classes.modules import BaseNN 11 | from xynn.mlpnet.modules import MLPNet 12 | from xynn.embedding import LinearEmbedding 13 | from xynn.mlp import LeakyGate, GhostBatchNorm 14 | 15 | from ..common import simple_train_inputs, simple_model_train_loop, SimpleEmbedding 16 | 17 | 18 | def test_that_mlpnet_raises_error_without_numeric_field_info(): 19 | with pytest.raises( 20 | TypeError, 21 | match="when embedding_num is None, num_numeric_fields must be an integer" 22 | ): 23 | MLPNet( 24 | task="classification", 25 | output_size=3, 26 | embedding_num=None, 27 | embedding_cat=None, 28 | ) 29 | 30 | 31 | def test_that_mlpnet_subclasses_basenn(): 32 | assert issubclass(MLPNet, BaseNN) 33 | 34 | 35 | def test_that_mlpnet_uses_basenn_init(): 36 | embedding_num = SimpleEmbedding(20, 3) 37 | model = MLPNet( 38 | task="classification", 39 | output_size=3, 40 | embedding_num=embedding_num, 41 | embedding_cat=None, 42 | embedding_l1_reg=0.1, 43 | num_numeric_fields=20, 44 | mlp_l2_reg=0.2, 45 | ) 46 | 47 | assert model.task == "classification" 48 | assert model.num_epochs == 0 49 | assert isinstance(model.loss_fn, nn.CrossEntropyLoss) 50 | assert model.embedding_num is embedding_num 51 | assert model.embedding_cat is None 52 | assert model.embedding_l1_reg == 0.1 53 | assert model.embedding_l2_reg == 0.0 54 | assert model.mlp_l1_reg == 0.0 55 | assert model.mlp_l2_reg == 0.2 56 | assert model.optimizer is None 57 | assert model.optimizer_info == {} 58 | assert model.scheduler == {} 59 | assert 
model._device == "cpu" 60 | 61 | 62 | def test_that_activation_and_sizes_are_passed_to_mlp_module(): 63 | embedding_num = SimpleEmbedding(20, 3) 64 | model = MLPNet( 65 | task="classification", 66 | output_size=3, 67 | embedding_num=embedding_num, 68 | embedding_cat=None, 69 | num_numeric_fields=20, 70 | mlp_activation=nn.ReLU, 71 | mlp_hidden_sizes=(512, 128, 32), 72 | mlp_use_bn=False, 73 | mlp_use_skip=False, 74 | use_leaky_gate=False, 75 | ) 76 | expected_classes = [ 77 | nn.Linear, 78 | nn.ReLU, 79 | nn.Linear, 80 | nn.ReLU, 81 | nn.Linear, 82 | nn.ReLU, 83 | nn.Linear, 84 | ] 85 | assert len(model.mlp.main_layers) == len(expected_classes) 86 | for layer, expected_class in zip(model.mlp.main_layers, expected_classes): 87 | assert isinstance(layer, expected_class) 88 | assert model.mlp.skip_layers is None 89 | 90 | 91 | def test_that_more_parameters_are_passed_to_mlp_module(): 92 | embedding_num = SimpleEmbedding(20, 3) 93 | model = MLPNet( 94 | task="classification", 95 | output_size=3, 96 | embedding_num=embedding_num, 97 | embedding_cat=None, 98 | num_numeric_fields=20, 99 | mlp_hidden_sizes=(512, 64), 100 | mlp_use_bn=True, 101 | mlp_dropout=0.1, 102 | mlp_use_skip=True, 103 | use_leaky_gate=True, 104 | ) 105 | 106 | expected_classes = [ 107 | LeakyGate, 108 | nn.Dropout, 109 | nn.Linear, 110 | nn.BatchNorm1d, 111 | nn.LeakyReLU, 112 | nn.Dropout, 113 | nn.Linear, 114 | nn.BatchNorm1d, 115 | nn.LeakyReLU, 116 | nn.Dropout, 117 | nn.Linear, 118 | ] 119 | assert len(model.mlp.main_layers) == len(expected_classes) 120 | for layer, expected_class in zip(model.mlp.main_layers, expected_classes): 121 | assert isinstance(layer, expected_class) 122 | 123 | expected_classes = [LeakyGate, nn.Linear] 124 | assert len(model.mlp.skip_layers) == len(expected_classes) 125 | for layer, expected_class in zip(model.mlp.skip_layers, expected_classes): 126 | assert isinstance(layer, expected_class) 127 | 128 | 129 | def test_mlp_module_layers_with_ghost_batch(): 130 | embedding_num = SimpleEmbedding(20, 3) 131 | model = MLPNet( 132 | task="classification", 133 | output_size=3, 134 | embedding_num=embedding_num, 135 | embedding_cat=None, 136 | num_numeric_fields=20, 137 | mlp_hidden_sizes=(512, 64), 138 | mlp_use_bn=True, 139 | mlp_ghost_batch=16, 140 | mlp_use_skip=True, 141 | use_leaky_gate=False, 142 | ) 143 | 144 | expected_classes = [ 145 | nn.Linear, 146 | GhostBatchNorm, 147 | nn.LeakyReLU, 148 | nn.Linear, 149 | GhostBatchNorm, 150 | nn.LeakyReLU, 151 | nn.Linear, 152 | ] 153 | assert len(model.mlp.main_layers) == len(expected_classes) 154 | for layer, expected_class in zip(model.mlp.main_layers, expected_classes): 155 | assert isinstance(layer, expected_class) 156 | 157 | assert isinstance(model.mlp.skip_layers, nn.Linear) 158 | 159 | 160 | def test_that_diagram_exists_and_prints_something(capsys): 161 | MLPNet.diagram() 162 | captured = capsys.readouterr() 163 | assert len(captured.out.split("\n")) > 5 164 | 165 | 166 | def test_mlp_weight(): 167 | model = MLPNet( 168 | task="regression", 169 | output_size=1, 170 | embedding_num=SimpleEmbedding(20, 3), 171 | embedding_cat=None, 172 | num_numeric_fields=3, 173 | mlp_use_bn=False, 174 | mlp_use_skip=False, 175 | use_leaky_gate=False, 176 | ) 177 | mlp = model.mlp 178 | w1, w2 = model.mlp_weight_sum() 179 | exp_w1 = sum(l.weight.abs().sum().item() for l in mlp.main_layers[::2]) 180 | exp_w2 = sum((l.weight ** 2).sum().item() for l in mlp.main_layers[::2]) 181 | assert np.isclose(w1.item(), exp_w1) 182 | assert np.isclose(w2.item(), exp_w2) 183 | 
184 | 185 | def test_that_mlpnet_learns(): 186 | _set_seed(10101) 187 | 188 | X = torch.randint(0, 10, (100, 10)) 189 | y = torch.rand((100, 1)) * 6 - 3 190 | embedding_num = LinearEmbedding(embedding_size=3).fit(X) 191 | model = MLPNet( 192 | task="regression", 193 | output_size=1, 194 | embedding_num=embedding_num, 195 | embedding_cat=None, 196 | num_numeric_fields=10, 197 | mlp_hidden_sizes=[10, 8, 8, 6], 198 | mlp_use_bn=False, 199 | mlp_use_skip=False, 200 | use_leaky_gate=False, 201 | ) 202 | loss_func = nn.MSELoss() 203 | optimizer = torch.optim.Adam(model.parameters(), lr=1e-1) 204 | loss_vals = simple_model_train_loop(model, X, None, y, loss_func, optimizer, num_epochs=5) 205 | assert loss_vals[0] > loss_vals[-1] 206 | -------------------------------------------------------------------------------- /xynn/autoint/estimators.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scikit-learn style classes for the AutoInt model 3 | 4 | """ 5 | 6 | import textwrap 7 | from typing import Type, Union, Callable, Tuple, List, Optional 8 | 9 | import torch 10 | from torch import nn 11 | 12 | from .modules import AutoInt 13 | from ..base_classes.estimators import BaseClassifier, BaseRegressor, ESTIMATOR_INIT_DOC 14 | from ..embedding import EmbeddingBase 15 | 16 | 17 | INIT_DOC = ESTIMATOR_INIT_DOC.format( 18 | textwrap.dedent( 19 | """\ 20 | attn_embedding_size : int, optional 21 | default is 8 22 | attn_num_layers : int, optional 23 | default is 3 24 | attn_num_head : int, optional 25 | default is 2 26 | attn_activation : subclass of torch.nn.Module or None, optional 27 | applied to the transformation tensors; default is None 28 | attn_use_residual : bool, optional 29 | default is True 30 | attn_dropout : float, optional 31 | amount of dropout to use on the product of queries and keys; 32 | default is 0.1 33 | attn_normalize : bool, optional 34 | whether to normalize each attn layer output; default is True""" 35 | ) 36 | ) 37 | 38 | 39 | class AutoIntClassifier(BaseClassifier): 40 | """ 41 | Scikit-learn style classification model for the AutoInt model 42 | 43 | """ 44 | 45 | diagram = AutoInt.diagram 46 | 47 | def __init__( 48 | self, 49 | embedding_num: Optional[Union[str, EmbeddingBase]] = "auto", 50 | embedding_cat: Optional[Union[str, EmbeddingBase]] = "auto", 51 | embedding_l1_reg: float=0.0, 52 | embedding_l2_reg: float=0.0, 53 | attn_embedding_size: int = 8, 54 | attn_num_layers: int = 3, 55 | attn_num_heads: int = 2, 56 | attn_activation: Optional[Type[nn.Module]] = None, 57 | attn_use_residual: bool = True, 58 | attn_dropout: float = 0.1, 59 | attn_normalize: bool = True, 60 | attn_use_mlp: bool = True, 61 | mlp_hidden_sizes: Union[int, Tuple[int, ...], List[int]] = (512, 256, 128, 64), 62 | mlp_activation: Type[nn.Module] = nn.LeakyReLU, 63 | mlp_use_bn: bool = True, 64 | mlp_bn_momentum: float = 0.1, 65 | mlp_ghost_batch: Optional[int] = None, 66 | mlp_dropout: float = 0.0, 67 | mlp_l1_reg: float = 0.0, 68 | mlp_l2_reg: float = 0.0, 69 | mlp_use_skip: bool = True, 70 | use_leaky_gate: bool = True, 71 | weighted_sum: bool = True, 72 | loss_fn: Union[str, Callable] = "auto", 73 | seed: Union[int, None] = None, 74 | device: Union[str, torch.device] = "cpu", 75 | ): 76 | super().__init__( 77 | embedding_num=embedding_num, 78 | embedding_cat=embedding_cat, 79 | embedding_l1_reg=embedding_l1_reg, 80 | embedding_l2_reg=embedding_l2_reg, 81 | attn_embedding_size=attn_embedding_size, 82 | attn_num_layers=attn_num_layers, 83 | 
attn_num_heads=attn_num_heads, 84 | attn_activation=attn_activation, 85 | attn_use_residual=attn_use_residual, 86 | attn_dropout=attn_dropout, 87 | attn_normalize=attn_normalize, 88 | attn_use_mlp=attn_use_mlp, 89 | mlp_hidden_sizes=mlp_hidden_sizes, 90 | mlp_activation=mlp_activation, 91 | mlp_use_bn=mlp_use_bn, 92 | mlp_bn_momentum=mlp_bn_momentum, 93 | mlp_ghost_batch=mlp_ghost_batch, 94 | mlp_dropout=mlp_dropout, 95 | mlp_l1_reg=mlp_l1_reg, 96 | mlp_l2_reg=mlp_l2_reg, 97 | mlp_use_skip=mlp_use_skip, 98 | use_leaky_gate=use_leaky_gate, 99 | weighted_sum=weighted_sum, 100 | loss_fn=loss_fn, 101 | seed=seed, 102 | device=device, 103 | ) 104 | self._model_class = AutoInt 105 | self._require_numeric_embedding = True 106 | 107 | __init__.__doc__ = INIT_DOC 108 | 109 | 110 | class AutoIntRegressor(BaseRegressor): 111 | """ 112 | Scikit-learn style regression model for the AutoInt model 113 | 114 | """ 115 | 116 | diagram = AutoInt.diagram 117 | 118 | def __init__( 119 | self, 120 | embedding_num: Optional[Union[str, EmbeddingBase]] = "auto", 121 | embedding_cat: Optional[Union[str, EmbeddingBase]] = "auto", 122 | embedding_l1_reg: float=0.0, 123 | embedding_l2_reg: float=0.0, 124 | attn_embedding_size: int = 8, 125 | attn_num_layers: int = 3, 126 | attn_num_heads: int = 2, 127 | attn_activation: Optional[Type[nn.Module]] = None, 128 | attn_use_residual: bool = True, 129 | attn_dropout: float = 0.1, 130 | attn_normalize: bool = True, 131 | attn_use_mlp: bool = True, 132 | mlp_hidden_sizes: Union[int, Tuple[int, ...], List[int]] = (512, 256, 128, 64), 133 | mlp_activation: Type[nn.Module] = nn.LeakyReLU, 134 | mlp_use_bn: bool = True, 135 | mlp_bn_momentum: float = 0.1, 136 | mlp_ghost_batch: Optional[int] = None, 137 | mlp_dropout: float = 0.0, 138 | mlp_l1_reg: float = 0.0, 139 | mlp_l2_reg: float = 0.0, 140 | mlp_use_skip: bool = True, 141 | use_leaky_gate: bool = True, 142 | weighted_sum: bool = True, 143 | loss_fn: Union[str, Callable] = "auto", 144 | seed: Union[int, None] = None, 145 | device: Union[str, torch.device] = "cpu", 146 | ): 147 | super().__init__( 148 | embedding_num=embedding_num, 149 | embedding_cat=embedding_cat, 150 | embedding_l1_reg=embedding_l1_reg, 151 | embedding_l2_reg=embedding_l2_reg, 152 | attn_embedding_size=attn_embedding_size, 153 | attn_num_layers=attn_num_layers, 154 | attn_num_heads=attn_num_heads, 155 | attn_activation=attn_activation, 156 | attn_use_residual=attn_use_residual, 157 | attn_dropout=attn_dropout, 158 | attn_normalize=attn_normalize, 159 | attn_use_mlp=attn_use_mlp, 160 | mlp_hidden_sizes=mlp_hidden_sizes, 161 | mlp_activation=mlp_activation, 162 | mlp_use_bn=mlp_use_bn, 163 | mlp_bn_momentum=mlp_bn_momentum, 164 | mlp_ghost_batch=mlp_ghost_batch, 165 | mlp_dropout=mlp_dropout, 166 | mlp_l1_reg=mlp_l1_reg, 167 | mlp_l2_reg=mlp_l2_reg, 168 | mlp_use_skip=mlp_use_skip, 169 | use_leaky_gate=use_leaky_gate, 170 | weighted_sum=weighted_sum, 171 | loss_fn=loss_fn, 172 | seed=seed, 173 | device=device, 174 | ) 175 | self._model_class = AutoInt 176 | self._require_numeric_embedding = True 177 | 178 | __init__.__doc__ = INIT_DOC 179 | -------------------------------------------------------------------------------- /xynn/preprocessing.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data preprocessing 3 | 4 | """ 5 | 6 | from typing import Union, Any, Iterator, List 7 | 8 | import torch 9 | from torch import Tensor 10 | import numpy as np 11 | from sklearn.preprocessing import LabelEncoder 12 | 13 | 14 | 
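# Illustrative usage sketch for `IntegerEncoder`, defined below (not from the
# source). NaN receives its own label per column, and with the default
# unexpected="increment", values unseen during `fit` get one further label.
#
#     import numpy as np
#     from xynn.preprocessing import IntegerEncoder
#
#     X = np.array([["a", 1.0], ["b", np.nan], ["a", 2.0]], dtype=object)
#     codes = IntegerEncoder().fit_transform(X)  # 2-d torch.int64 Tensor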
UTA = Union[Tensor, np.ndarray] 15 | 16 | 17 | def _ismissing(column: UTA) -> UTA: 18 | # tensor 19 | if isinstance(column, Tensor): 20 | if column.dtype in (torch.float, torch.double, torch.half, torch.bfloat16): 21 | return torch.isnan(column) 22 | return torch.full(column.shape, False, dtype=torch.bool) 23 | 24 | # ndarray 25 | if np.issubdtype(column.dtype, np.floating): 26 | return np.isnan(column) 27 | elif column.dtype == np.dtype("O"): 28 | return np.array([isinstance(x, float) and np.isnan(x) for x in column]) 29 | return np.full(column.shape, False, dtype=np.bool) 30 | 31 | 32 | def _columns(X: UTA) -> Iterator[UTA]: 33 | """ 34 | Split 2d input into 1d columns 35 | 36 | Parameters 37 | ---------- 38 | X : NumPy array or PyTorch Tensor 39 | should be 2d 40 | 41 | Yields 42 | ------ 43 | NumPy arrays or PyTorch Tensors 44 | 45 | """ 46 | if isinstance(X, Tensor): 47 | columns = X.split(1, dim=1) 48 | else: 49 | columns = np.split(X, X.shape[1], axis=1) 50 | 51 | for column in columns: 52 | yield column.reshape((-1,)) 53 | 54 | 55 | def _isin(column: UTA, test_values: np.ndarray) -> UTA: 56 | if isinstance(column, Tensor): 57 | test_values = torch.from_numpy(test_values) 58 | return (column[..., None] == test_values).any(-1) 59 | return np.isin(column, test_values) 60 | 61 | 62 | class IntegerEncoder: 63 | """ 64 | Convert categorical inputs to integers. 65 | Input: 2d Tensor or NumPy array 66 | Output: 2d integer-valued Tensor 67 | 68 | """ 69 | 70 | def __init__(self, unexpected="increment"): 71 | """ 72 | Parameters 73 | ---------- 74 | unexpected : {"increment", "raise"}, optional 75 | when encountering unexpected values in `transform`, 76 | whether to use a new label for them ("increment") or 77 | whether to raise an error ("raise"); default is "increment" 78 | 79 | """ 80 | self.encoders: List[LabelEncoder] = [] 81 | self.classes_: List[np.ndarray] = [] 82 | self.nan_labels: List[int] = [] 83 | self.num_classes: List[int] = [] 84 | self.class_counts: List[List[int]] = [] 85 | self._isfit = False 86 | self.unexpected = unexpected 87 | 88 | def fit(self, X: UTA, y: Any = None) -> "IntegerEncoder": 89 | """ 90 | Fit encoder values from the input data 91 | 92 | Parameters 93 | ---------- 94 | X : NumPy array or PyTorch Tensor 95 | should be 2d 96 | y : any, optional 97 | not used; parameter provided to imitate Scikit-learn transformers; 98 | default is None 99 | 100 | """ 101 | for column in _columns(X): 102 | missing = _ismissing(column) 103 | encoder = LabelEncoder() 104 | encoder.fit(column[~missing]) 105 | self.encoders.append(encoder) 106 | self.classes_.append(encoder.classes_) 107 | 108 | self.num_classes.append(len(encoder.classes_)) 109 | self.class_counts.append( 110 | [(column == val).sum().item() for val in encoder.classes_] 111 | ) 112 | num_missing = missing.sum().item() 113 | if num_missing: 114 | self.nan_labels.append(len(encoder.classes_)) 115 | self.num_classes[-1] += 1 116 | self.class_counts[-1].append(int(missing.sum().item())) 117 | else: 118 | self.nan_labels.append(-1) 119 | 120 | self._isfit = True 121 | return self 122 | 123 | def _unexpected(self, column: UTA, col_idx: int) -> UTA: 124 | if not self._isfit: 125 | raise RuntimeError("encoder needs to be fit first") 126 | unexp = ~_isin(column, self.classes_[col_idx]) 127 | if self.nan_labels[col_idx] != -1: 128 | unexp[_ismissing(column)] = False 129 | return unexp 130 | 131 | def transform(self, X: UTA, y: Any = None) -> Tensor: 132 | """ 133 | Encode the input with integers 134 | 135 | Parameters 136 
| ---------- 137 | X : NumPy array or PyTorch Tensor 138 | should be 2d 139 | y : any, optional 140 | not used; parameter provided to imitate Scikit-learn transformers; 141 | default is None 142 | 143 | Returns 144 | ------- 145 | PyTorch Tensor, with each column transformed to integers, from zero 146 | up to (not including) the number of classes in that column 147 | 148 | """ 149 | if not self._isfit: 150 | raise RuntimeError("encoder needs to be fit first") 151 | if not X.shape[1] == len(self.encoders): 152 | raise ValueError( 153 | "input has the wrong shape, expected " 154 | f"{len(self.encoders)} columns, got {X.shape[1]}" 155 | ) 156 | 157 | encoded_cols = [] 158 | for col_idx, column in enumerate(_columns(X)): 159 | unxpctd = self._unexpected(column, col_idx) 160 | if unxpctd.sum() and self.unexpected == "raise": 161 | values = ", ".join(str(x) for x in column[unxpctd][:3]) 162 | if unxpctd.sum() > 3: 163 | values += ", ..." 164 | raise ValueError(f"unexpected values found in input: {values}") 165 | encoder = self.encoders[col_idx] 166 | missing = _ismissing(column) 167 | allgood = ~missing & ~unxpctd 168 | encoded = torch.empty(column.shape, dtype=torch.int64) 169 | encoded[allgood] = torch.from_numpy(encoder.transform(column[allgood])) 170 | encoded[missing] = self.nan_labels[col_idx] 171 | encoded[unxpctd] = self.num_classes[col_idx] 172 | encoded_cols.append(encoded.reshape((-1, 1))) 173 | return torch.cat(encoded_cols, dim=1) 174 | 175 | def fit_transform(self, X: UTA, y: Any = None) -> Tensor: 176 | """ 177 | Fit encoder values and encode the input 178 | 179 | Parameters 180 | ---------- 181 | X : NumPy array or PyTorch Tensor 182 | should be 2d 183 | y : any, optional 184 | not used; parameter provided to imitate Scikit-learn transformers; 185 | default is None 186 | 187 | Returns 188 | ------- 189 | PyTorch Tensor, with each column transformed to integers, from zero 190 | up to (not including) the number of classes in that column 191 | 192 | """ 193 | self.fit(X) 194 | return self.transform(X) 195 | -------------------------------------------------------------------------------- /tests/test_pnn/test_estimators.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from xynn.pnn import PNNRegressor, PNNClassifier 5 | from xynn.pnn import PNNPlusRegressor, PNNPlusClassifier 6 | from xynn.embedding import LinearEmbedding, DefaultEmbedding 7 | 8 | from ..common import check_estimator_learns, simple_data 9 | 10 | 11 | def test_that_basic_params_are_passed_to_pnn_module(): 12 | X_num, X_cat, y = simple_data(task="classification") 13 | estimator = PNNClassifier( 14 | embedding_l2_reg=0.2, 15 | mlp_l1_reg=0.1, 16 | ) 17 | estimator.fit( 18 | X_num=X_num, 19 | X_cat=X_cat, 20 | y=y, 21 | optimizer=torch.optim.Adam, 22 | opt_kwargs={"lr": 1e-1}, 23 | num_epochs=1, 24 | ) 25 | 26 | model = estimator._model 27 | 28 | assert model.task == "classification" 29 | assert model.num_epochs == 1 30 | assert isinstance(model.loss_fn, nn.CrossEntropyLoss) 31 | assert model.embedding_num is not None 32 | assert model.embedding_cat is not None 33 | assert model.embedding_l1_reg == 0.0 34 | assert model.embedding_l2_reg == 0.2 35 | assert model.mlp_l1_reg == 0.1 36 | assert model.mlp_l2_reg == 0.0 37 | assert model.optimizer is not None 38 | assert model.optimizer_info != {} 39 | assert model.scheduler == {} 40 | assert model._device == torch.device("cpu") 41 | 42 | 43 | def 
test_that_basic_params_are_passed_to_pnnplus_module(): 44 | X_num, X_cat, y = simple_data(task="classification") 45 | estimator = PNNPlusClassifier( 46 | embedding_l2_reg=0.2, 47 | mlp_l1_reg=0.1, 48 | ) 49 | estimator.fit( 50 | X_num=X_num, 51 | X_cat=X_cat, 52 | y=y, 53 | optimizer=torch.optim.Adam, 54 | opt_kwargs={"lr": 1e-1}, 55 | num_epochs=1, 56 | ) 57 | 58 | model = estimator._model 59 | 60 | assert model.task == "classification" 61 | assert model.num_epochs == 1 62 | assert isinstance(model.loss_fn, nn.CrossEntropyLoss) 63 | assert model.embedding_num is not None 64 | assert model.embedding_cat is not None 65 | assert model.embedding_l1_reg == 0.0 66 | assert model.embedding_l2_reg == 0.2 67 | assert model.mlp_l1_reg == 0.1 68 | assert model.mlp_l2_reg == 0.0 69 | assert model.optimizer is not None 70 | assert model.optimizer_info != {} 71 | assert model.scheduler == {} 72 | assert model._device == torch.device("cpu") 73 | 74 | 75 | def test_that_pnnregressor_learns(): 76 | estimator = PNNRegressor( 77 | mlp_hidden_sizes=[10, 8, 8, 6], 78 | mlp_use_bn=False, 79 | mlp_use_skip=False, 80 | use_leaky_gate=False, 81 | ) 82 | check_estimator_learns(estimator, task="regression") 83 | assert estimator.init_parameters == { 84 | "embedding_num": "auto", 85 | "embedding_cat": "auto", 86 | "embedding_l1_reg": 0.0, 87 | "embedding_l2_reg": 0.0, 88 | "pnn_product_type": "outer", 89 | "pnn_product_size": 10, 90 | "mlp_hidden_sizes": [10, 8, 8, 6], 91 | "mlp_activation": nn.LeakyReLU, 92 | "mlp_use_bn": False, 93 | "mlp_bn_momentum": 0.1, 94 | "mlp_ghost_batch": None, 95 | "mlp_dropout": 0.0, 96 | "mlp_l1_reg": 0.0, 97 | "mlp_l2_reg": 0.0, 98 | "mlp_use_skip": False, 99 | "use_leaky_gate": False, 100 | "loss_fn": "auto", 101 | "seed": None, 102 | "device": "cpu", 103 | } 104 | 105 | 106 | def test_that_pnnclassifier_learns(): 107 | estimator = PNNClassifier( 108 | pnn_product_type="inner", 109 | mlp_hidden_sizes=[10, 8, 8, 6], 110 | mlp_use_bn=False, 111 | mlp_ghost_batch=4, 112 | mlp_use_skip=False, 113 | use_leaky_gate=False, 114 | ) 115 | check_estimator_learns(estimator, task="classification") 116 | assert estimator.init_parameters == { 117 | "embedding_num": "auto", 118 | "embedding_cat": "auto", 119 | "embedding_l1_reg": 0.0, 120 | "embedding_l2_reg": 0.0, 121 | "pnn_product_type": "inner", 122 | "pnn_product_size": 10, 123 | "mlp_hidden_sizes": [10, 8, 8, 6], 124 | "mlp_activation": nn.LeakyReLU, 125 | "mlp_use_bn": False, 126 | "mlp_bn_momentum": 0.1, 127 | "mlp_ghost_batch": 4, 128 | "mlp_dropout": 0.0, 129 | "mlp_l1_reg": 0.0, 130 | "mlp_l2_reg": 0.0, 131 | "mlp_use_skip": False, 132 | "use_leaky_gate": False, 133 | "loss_fn": "auto", 134 | "seed": None, 135 | "device": "cpu", 136 | } 137 | 138 | 139 | 140 | def test_that_pnnplusregressor_learns(): 141 | estimator = PNNPlusRegressor( 142 | pnn_product_type="both", 143 | pnn_product_size=8, 144 | mlp_hidden_sizes=[10, 8, 8, 6], 145 | mlp_use_bn=False, 146 | mlp_ghost_batch=4, 147 | mlp_use_skip=False, 148 | use_leaky_gate=False, 149 | ) 150 | check_estimator_learns(estimator, task="regression") 151 | assert estimator.init_parameters == { 152 | "embedding_num": "auto", 153 | "embedding_cat": "auto", 154 | "embedding_l1_reg": 0.0, 155 | "embedding_l2_reg": 0.0, 156 | "pnn_product_type": "both", 157 | "pnn_product_size": 8, 158 | "mlp_hidden_sizes": [10, 8, 8, 6], 159 | "mlp_activation": nn.LeakyReLU, 160 | "mlp_use_bn": False, 161 | "mlp_bn_momentum": 0.1, 162 | "mlp_ghost_batch": 4, 163 | "mlp_dropout": 0.0, 164 | "mlp_l1_reg": 0.0, 165 
| "mlp_l2_reg": 0.0, 166 | "mlp_use_skip": False, 167 | "use_leaky_gate": False, 168 | "weighted_sum": True, 169 | "loss_fn": "auto", 170 | "seed": None, 171 | "device": "cpu", 172 | } 173 | 174 | 175 | def test_that_pnnplusclassifier_learns(): 176 | embed_num = LinearEmbedding(10) 177 | embed_cat = DefaultEmbedding(10) 178 | estimator = PNNPlusClassifier( 179 | embedding_num=embed_num, 180 | embedding_cat=embed_cat, 181 | mlp_hidden_sizes=[10, 8, 8, 6], 182 | mlp_use_bn=False, 183 | mlp_use_skip=False, 184 | use_leaky_gate=False, 185 | ) 186 | check_estimator_learns(estimator, task="classification") 187 | assert estimator.init_parameters == { 188 | "embedding_num": embed_num, 189 | "embedding_cat": embed_cat, 190 | "embedding_l1_reg": 0.0, 191 | "embedding_l2_reg": 0.0, 192 | "pnn_product_type": "outer", 193 | "pnn_product_size": 10, 194 | "mlp_hidden_sizes": [10, 8, 8, 6], 195 | "mlp_activation": nn.LeakyReLU, 196 | "mlp_use_bn": False, 197 | "mlp_bn_momentum": 0.1, 198 | "mlp_ghost_batch": None, 199 | "mlp_dropout": 0.0, 200 | "mlp_l1_reg": 0.0, 201 | "mlp_l2_reg": 0.0, 202 | "mlp_use_skip": False, 203 | "use_leaky_gate": False, 204 | "weighted_sum": True, 205 | "loss_fn": "auto", 206 | "seed": None, 207 | "device": "cpu", 208 | } 209 | assert repr(estimator._model.embedding_num) == "LinearEmbedding(10, 'cpu')" 210 | assert repr(estimator._model.embedding_cat) == "DefaultEmbedding(10, 20, 'cpu')" 211 | -------------------------------------------------------------------------------- /tests/common.py: -------------------------------------------------------------------------------- 1 | import json 2 | from tempfile import NamedTemporaryFile 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from torch import nn 8 | from torch.utils.data import Dataset, DataLoader 9 | 10 | from xynn.base_classes.estimators import _set_seed 11 | from xynn.base_classes.modules import BaseNN 12 | 13 | 14 | class SimpleEmbedding(nn.Module): 15 | 16 | def __init__(self, num_embeddings, embedding_dim): 17 | super().__init__() 18 | self.embedding = nn.Embedding(num_embeddings, embedding_dim) 19 | self.num_fields = 1 20 | self.embedding_size = embedding_dim 21 | self.output_size = embedding_dim 22 | 23 | def weight_sum(self): 24 | w = self.embedding.weight 25 | return w.abs().sum().item(), (w ** 2).sum().item() 26 | 27 | def forward(self, x): 28 | return self.embedding(x) 29 | 30 | 31 | def simple_train_loop(module, X, y, loss_func, optimizer, num_epochs): 32 | module.train() 33 | losses = [] 34 | for e_ in range(num_epochs): 35 | optimizer.zero_grad() 36 | y_pred = module(X) 37 | loss = loss_func(y_pred, y) 38 | loss.backward() 39 | optimizer.step() 40 | losses.append(loss.item()) 41 | return losses 42 | 43 | 44 | def simple_model_train_loop(model, X_num, X_cat, y, loss_func, optimizer, num_epochs): 45 | model.train() 46 | losses = [] 47 | for e_ in range(num_epochs): 48 | optimizer.zero_grad() 49 | y_pred = model(X_num, X_cat) 50 | print(y_pred.shape, y.shape) 51 | loss = loss_func(y_pred, y) 52 | loss.backward() 53 | optimizer.step() 54 | losses.append(loss.item()) 55 | return losses 56 | 57 | 58 | class Reshape(nn.Module): 59 | 60 | def forward(self, X): 61 | return X.reshape((X.shape[0], -1)) 62 | 63 | 64 | def example_data(): 65 | data = pd.DataFrame( 66 | { 67 | "num_a": [i / 10 for i in range(10)], 68 | "num_b": range(10, 0, -1), 69 | "cat_a": list("abcdabcaba"), 70 | "cat_b": list("abbabacbab"), 71 | "cat_c": [1, 1, 0, 0, 1, 1, 0, np.nan, 1, 1], 72 | "cat_a_num": [0, 1, 2, 3, 0, 1, 2, 
0, 1, 0], 73 | "cat_b_num": [0, 1, 1, 0, 1, 0, 2, 1, 0, 1], 74 | } 75 | ) 76 | return data 77 | 78 | 79 | class SimpleMLP(BaseNN): 80 | 81 | def __init__( 82 | self, 83 | task="regression", 84 | embedding_num=None, 85 | embedding_cat=None, 86 | embedding_l1_reg=0.0, 87 | embedding_l2_reg=0.0, 88 | input_size=11, 89 | hidden_sizes=(7,), 90 | output_size=3, 91 | mlp_l1_reg=0.0, 92 | mlp_l2_reg=0.0, 93 | loss_fn="auto", 94 | mix_value=None, 95 | device="cpu", 96 | ): 97 | super().__init__( 98 | task, 99 | embedding_num, 100 | embedding_cat, 101 | embedding_l1_reg, 102 | embedding_l2_reg, 103 | mlp_l1_reg, 104 | mlp_l2_reg, 105 | loss_fn, 106 | ) 107 | layers = [] 108 | for size in hidden_sizes: 109 | layers.append(nn.Linear(input_size, size)) 110 | input_size = size 111 | layers.append(nn.ReLU()) 112 | layers.append(nn.Linear(input_size, output_size)) 113 | self.layers = nn.Sequential(*layers) 114 | if mix_value is not None: 115 | self.mix = torch.tensor([mix_value]) 116 | else: 117 | self.mix = None 118 | self._device = "cpu" 119 | self.to(device) 120 | 121 | def mlp_weight_sum(self): 122 | w1_sum = 0.0 123 | w2_sum = 0.0 124 | for layer in self.layers: 125 | if not isinstance(layer, nn.Linear): 126 | continue 127 | w1_sum += layer.weight.abs().sum().item() 128 | w2_sum += (layer.weight ** 2).sum().item() 129 | return w1_sum, w2_sum 130 | 131 | def forward(self, X_num, X_cat): 132 | x = torch.cat([X_num, X_cat], axis=1) 133 | return self.layers(x) 134 | 135 | 136 | class SimpleDataset(Dataset): 137 | 138 | def __init__(self, X_num, X_cat, y): 139 | self.X_num = X_num 140 | self.X_cat = X_cat 141 | self.y = y 142 | 143 | def __len__(self): 144 | return len(self.y) 145 | 146 | def __getitem__(self, idx): 147 | return self.X_num[idx], self.X_cat[idx], self.y[idx] 148 | 149 | 150 | def simple_data(task="regression"): 151 | X_num = torch.randint(-2, 2, (300, 10), dtype=torch.float32) 152 | X_cat = torch.randint(0, 2, (300, 1), dtype=torch.float32) 153 | z = torch.rand(size=(300, 3), dtype=torch.float32) - 0.5 154 | y = torch.tensor( 155 | [ 156 | [ 157 | 0.1 * num[0] - 0.2 * num[1] + 0.1 * num[2] * num[3] + cat[0], 158 | - 0.3 * num[4] * num[6] * num[7] + 0.1 * num[8] - num[9], 159 | - 0.2 * num[1] - 0.3 * num[4] * num[6] * num[7] + 0.1 * cat[0], 160 | ] 161 | for num, cat in zip(X_num, X_cat) 162 | ], 163 | dtype=torch.float32 164 | ) + z 165 | if task == "classification": 166 | y_sum = y.sum(dim=1) 167 | y_cuts = torch.quantile(y_sum, q=torch.tensor([1/3, 2/3])) 168 | y = ( 169 | (y_sum > y_cuts[0]).to(dtype=torch.int) 170 | + (y_sum > y_cuts[1]).to(dtype=torch.int) 171 | ) 172 | return X_num, X_cat, y 173 | 174 | 175 | def simple_train_inputs( 176 | loss_fn="auto", 177 | mix_value=None, 178 | optimizer=torch.optim.Adam, 179 | opt_kwargs={"lr": 1e-2}, 180 | scheduler=None, 181 | sch_kwargs=None, 182 | sch_options=None, 183 | configure=True, 184 | ): 185 | X_num, X_cat, y = simple_data() 186 | X_num_train, X_num_valid = X_num[:220], X_num[220:] 187 | X_cat_train, X_cat_valid = X_cat[:220], X_cat[220:] 188 | y_train, y_valid = y[:220], y[220:] 189 | 190 | model = SimpleMLP(task="regression", loss_fn=loss_fn, mix_value=mix_value) 191 | model.set_optimizer( 192 | optimizer=optimizer, 193 | opt_kwargs=opt_kwargs, 194 | scheduler=scheduler, 195 | sch_kwargs=sch_kwargs, 196 | sch_options=sch_options, 197 | ) 198 | if configure: 199 | model.configure_optimizers() 200 | 201 | train_ds = SimpleDataset(X_num_train, X_cat_train, y_train) 202 | valid_ds = SimpleDataset(X_num_valid, X_cat_valid, y_valid) 203 
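    # standard PyTorch DataLoaders over the splits; only training data is shuffled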
| train_dl = DataLoader(train_ds, batch_size=10, shuffle=True) 204 | valid_dl = DataLoader(valid_ds, batch_size=10) 205 | 206 | return model, train_dl, valid_dl 207 | 208 | 209 | def check_estimator_learns(estimator, task, data=None, seed=10101): 210 | _set_seed(seed) 211 | 212 | if data is None: 213 | X_num, X_cat, y = simple_data(task=task) 214 | else: 215 | X_num, X_cat, y = data 216 | 217 | logfile = NamedTemporaryFile() 218 | 219 | estimator.fit( 220 | X_num=X_num, 221 | X_cat=X_cat, 222 | y=y, 223 | optimizer=torch.optim.Adam, 224 | opt_kwargs={"lr": 1e-1}, 225 | log_path=logfile.name, 226 | ) 227 | 228 | with open(logfile.name, "r") as infile: 229 | train_info = json.load(infile) 230 | 231 | loss_vals = [epoch["train_loss"] for epoch in train_info["train_info"]] 232 | assert any(loss_vals[i] < loss_vals[0] for i in range(1, len(loss_vals))) 233 | -------------------------------------------------------------------------------- /tests/test_dataset.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import torch 4 | import pytest 5 | 6 | from xynn.dataset import _validate_x, _validate_y, TabularDataLoader 7 | 8 | 9 | def test_that__validate_x_raises_error_with_wrong_type(): 10 | X = list(range(100)) 11 | y = torch.tensor(list(range(100))) 12 | with pytest.raises(TypeError, match="input X should be Tensor, NumPy array, or None"): 13 | _validate_x(X, y, "X", "cpu") 14 | 15 | 16 | def test_that__validate_x_raises_error_with_wrong_size(): 17 | X = np.array(list(range(99))) 18 | y = torch.tensor(list(range(100))) 19 | with pytest.raises( 20 | ValueError, 21 | match=r"shape mismatch; got y.shape\[0\] == 100, X.shape\[0\] == 99", 22 | ): 23 | _validate_x(X, y, "X", "cpu") 24 | 25 | 26 | def test_that__validate_x_raises_error_with_wrong_shape(): 27 | X = np.array(list(range(100))) 28 | y = torch.tensor(list(range(100))) 29 | with pytest.raises(ValueError, match=r"X should be 2-d; got shape \(100,\)"): 30 | _validate_x(X, y, "X", "cpu") 31 | 32 | X = np.array(list(range(100))).reshape((100, 1, 1)) 33 | y = torch.tensor(list(range(100))) 34 | with pytest.raises(ValueError, match=r"X should be 2-d; got shape \(100, 1, 1\)"): 35 | _validate_x(X, y, "X", "cpu") 36 | 37 | 38 | def test_that__validate_x_returns_empty_tensor_when_given_none(): 39 | y = torch.tensor(list(range(100))) 40 | X = _validate_x(None, y, "X example", "cpu") 41 | assert X.shape == (100, 0) 42 | 43 | 44 | def test__validate_x_with_numpy_input(): 45 | X = np.array(list(range(100))).reshape((100, 1)) 46 | y = torch.tensor(list(range(100))) 47 | X_out = _validate_x(X, y, "X", "cpu") 48 | assert all(X[i, 0].item() == X_out[i, 0].item() for i in range(100)) 49 | 50 | 51 | def test__validate_x_with_tensor_input(): 52 | X = torch.tensor([[i, i + 1] for i in range(100)]) 53 | y = torch.tensor(list(range(100))) 54 | X_out = _validate_x(X, y, "X", "cpu") 55 | assert X_out is X 56 | 57 | 58 | def test_that__validate_y_raises_error_with_wrong_type(): 59 | with pytest.raises(TypeError, match="y should be Tensor or NumPy array"): 60 | _validate_y(None, task="regression", device="cpu") 61 | 62 | 63 | def test_that__validate_y_raises_error_with_wrong_size(): 64 | y = torch.tensor([]) 65 | with pytest.raises( 66 | ValueError, 67 | match=r"y has a zero-sized dimension; got shape torch.Size\(\[0\]\)" 68 | ): 69 | _validate_y(y, task="classification", device="cpu") 70 | 71 | 72 | def test_that__validate_y_raises_error_with_wrong_shape(): 73 | y = 
torch.tensor(list(range(100))).reshape((25, 4)) 74 | with pytest.raises( 75 | ValueError, 76 | match="for classification y must be 1-d or 2-d with one column" 77 | ): 78 | _validate_y(y, task="classification", device="cpu") 79 | 80 | y = torch.tensor(list(range(100))).reshape((100, 1, 1)) 81 | with pytest.raises( 82 | ValueError, 83 | match=r"y has too many dimensions; got shape torch.Size\(\[100, 1, 1\]\)", 84 | ): 85 | _validate_y(y, task="regression", device="cpu") 86 | 87 | 88 | def test__validate_y_with_numpy_input(): 89 | y = np.array(list(range(100)), dtype="int").reshape((100, 1)) 90 | y_out = _validate_y(y, task="regression", device="cpu") 91 | assert all(y[i, 0].item() == y_out[i, 0].item() for i in range(100)) 92 | assert y_out.shape == (100, 1) 93 | y_out = _validate_y(y, task="classification", device="cpu") 94 | assert y_out.shape == (100,) 95 | 96 | 97 | def test__validate_y_with_tensor_input(): 98 | y = torch.tensor(list(range(100))) 99 | y_out = _validate_y(y, task="regression", device="cpu") 100 | assert all(y[i].item() == y_out[i, 0].item() for i in range(100)) 101 | assert y_out.shape == (100, 1) 102 | y_out = _validate_y(y, task="classification", device="cpu") 103 | assert y_out is y 104 | 105 | y = torch.tensor(list(range(100))).reshape((100, 1)) 106 | y_out = _validate_y(y, task="regression", device="cpu") 107 | assert y_out is y 108 | 109 | 110 | def test_that_TabularDataLoader_raises_error_when_both_Xs_are_None(): 111 | y = np.array([0, 1] * 10) 112 | with pytest.raises(TypeError, match="X_num and X_cat cannot both be None"): 113 | TabularDataLoader(task="regression", X_num=None, X_cat=None, y=y) 114 | 115 | 116 | def test_TabularDataLoader_with_numpy_input(): 117 | X_num = np.array([[i - 1, i, i + 1] for i in range(20, 0, -1)]) 118 | X_cat = np.array([[i + j for j in range(5)] for i in range(20)]) 119 | y = np.array([0, 1] * 10) 120 | 121 | loader = TabularDataLoader(task="regression", X_num=X_num, X_cat=X_cat, y=y) 122 | assert len(loader) == 1 123 | 124 | batch = next(iter(loader)) 125 | assert isinstance(batch, tuple) 126 | assert len(batch) == 3 127 | # batch size is greater than 20 128 | assert torch.all(batch[0] == torch.from_numpy(X_num)).item() 129 | assert torch.all(batch[1] == torch.from_numpy(X_cat)).item() 130 | assert torch.all(batch[2] == torch.from_numpy(y).reshape((20, 1))).item() 131 | 132 | 133 | def test_TabularDataLoader_with_shuffled_numpy_input(): 134 | X_num = np.array([[i - 1, i, i + 1] for i in range(20, 0, -1)]) 135 | X_cat = np.array([[i + j for j in range(5)] for i in range(20)]) 136 | y = np.arange(20) 137 | 138 | loader = TabularDataLoader( 139 | task="regression", X_num=X_num, X_cat=X_cat, y=y, shuffle=True 140 | ) 141 | assert len(loader) == 1 142 | 143 | batch = next(iter(loader)) 144 | assert isinstance(batch, tuple) 145 | assert len(batch) == 3 146 | # batch size is greater than 20 147 | order = [int(x.item()) for x in batch[2].reshape((20,))] 148 | assert set(order) == set(y) 149 | assert order != list(y) 150 | assert torch.all(batch[0] == torch.from_numpy(X_num[order])).item() 151 | assert torch.all(batch[1] == torch.from_numpy(X_cat[order])).item() 152 | assert torch.all(batch[2] == torch.from_numpy(y[order]).reshape((20, 1))).item() 153 | 154 | 155 | def test_TabularDataLoader_with_numpy_input_and_smaller_batches(): 156 | X_num = np.array([[i - 1, i, i + 1] for i in range(20, 0, -1)]) 157 | X_cat = np.array([[i + j for j in range(5)] for i in range(20)]) 158 | y = np.arange(20) 159 | 160 | loader = TabularDataLoader( 161 | 
task="regression", X_num=X_num, X_cat=X_cat, y=y, shuffle=True, batch_size=10 162 | ) 163 | assert len(loader) == 2 164 | 165 | batch_0, batch_1 = list(loader) 166 | assert all(len(t) == 10 for batch in (batch_0, batch_1) for t in batch) 167 | batch_cat = tuple(torch.cat([t_0, t_1], dim=0) for t_0, t_1 in zip(batch_0, batch_1)) 168 | order = [int(x.item()) for x in batch_cat[2].reshape((20,))] 169 | assert set(order) == set(y) 170 | assert order != list(y) 171 | assert torch.all(batch_cat[0] == torch.from_numpy(X_num[order])).item() 172 | assert torch.all(batch_cat[1] == torch.from_numpy(X_cat[order])).item() 173 | assert torch.all(batch_cat[2] == torch.from_numpy(y[order]).reshape((20, 1))).item() 174 | 175 | 176 | def test_TabularDataLoader_with_tensor_input(): 177 | X_num = torch.tensor([[i - 1, i, i + 1] for i in range(20, 0, -1)]) 178 | y = torch.tensor([0, 1] * 10) 179 | 180 | loader = TabularDataLoader(task="classification", X_num=X_num, X_cat=None, y=y) 181 | assert len(loader) == 1 182 | 183 | batch = next(iter(loader)) 184 | assert isinstance(batch, tuple) 185 | assert len(batch) == 3 186 | # batch size is greater than 20 187 | assert torch.all(batch[0] == X_num).item() 188 | assert batch[1].shape == (20, 0) 189 | assert torch.all(batch[2] == y).item() 190 | -------------------------------------------------------------------------------- /xynn/embedding/uniform/fast_categorical.py: -------------------------------------------------------------------------------- 1 | """ 2 | Classes for embedding categorical fields 3 | 4 | FastBasicEmbedding 5 | - requires that each field's values are integers 0, 1, ... 6 | - embed each value with single vector 7 | FastDefaultEmbedding 8 | - requires that each field's values are integers 0, 1, ... 9 | - like FastBasicEmbedding, but include a "default" vector for each field 10 | - returned vector is a weighted combination between the value's own vector 11 | and the field's "default" vector 12 | - the weighting is based on the count of value in the training set; a higher 13 | count puts more weight the value's own vector 14 | - values not seen in the training data are embedded with the default vector 15 | 16 | """ 17 | 18 | from typing import Union, List, Optional 19 | 20 | import numpy as np 21 | import torch 22 | from torch import nn, Tensor 23 | 24 | from ..common import FastBasicBase, FastDefaultBase 25 | from .base import UniformBase 26 | 27 | 28 | class FastBasicEmbedding(UniformBase, FastBasicBase): 29 | """ 30 | A basic embedding that creates an embedded vector for each field value. 
31 | 32 | """ 33 | 34 | def __init__(self, embedding_size: int = 10, device: Union[str, torch.device] = "cpu"): 35 | """ 36 | Parameters 37 | ---------- 38 | embedding_size : int, optional 39 | size of each value's embedding vector; default is 10 40 | device : string or torch.device 41 | 42 | """ 43 | super().__init__() 44 | self.num_fields = 0 45 | self.output_size = 0 46 | self.offsets: Optional[Tensor] = None 47 | self.embedding: Optional[nn.Embedding] = None 48 | self.embedding_size = embedding_size 49 | self._device = device 50 | self.to(device) 51 | self._isfit = False 52 | 53 | def __repr__(self): 54 | embed_size = self.embedding_size 55 | device = repr(self._device) 56 | return f"FastBasicEmbedding({embed_size}, {device})" 57 | 58 | def from_summary(self, num_classes: List[int]) -> "FastBasicEmbedding": 59 | """ 60 | Create the embedding from category values for each field 61 | 62 | Parameters 63 | ---------- 64 | num_classes : list of int 65 | number of category values for each field 66 | 67 | Return 68 | ------ 69 | self 70 | 71 | """ 72 | self.num_fields = len(num_classes) 73 | self.output_size = self.num_fields * self.embedding_size 74 | self.offsets = torch.tensor([[0] + list(np.cumsum(num_classes[:-1]))], device=self._device) 75 | self.embedding = nn.Embedding( 76 | sum(num_classes), self.embedding_size 77 | ).to(device=self._device) 78 | nn.init.xavier_uniform_(self.embedding.weight) 79 | 80 | self._isfit = True 81 | 82 | return self 83 | 84 | def forward(self, X: Tensor) -> Tensor: 85 | """ 86 | Produce embedding for each value in input 87 | 88 | Parameters 89 | ---------- 90 | X : torch.Tensor 91 | 92 | Return 93 | ------ 94 | torch.Tensor 95 | 96 | """ 97 | if not self._isfit: 98 | raise RuntimeError("need to call `fit` or `from_summary` first") 99 | 100 | return self.embedding(X + self.offsets) 101 | 102 | 103 | class FastDefaultEmbedding(UniformBase, FastDefaultBase): 104 | """ 105 | An embedding with a default value for each field. The default is returned for 106 | any field value not seen when the embedding was initialized (using `fit` or 107 | `from_summary`). For any value seen at initialization, a weighted average of 108 | that value's embedding and the default embedding is returned. 
The weights for 109 | the average are determined by the parameter `alpha`: 110 | 111 | weight = count / (count + alpha) 112 | final = embedding * weight + default * (1 - weight) 113 | 114 | """ 115 | 116 | def __init__( 117 | self, 118 | embedding_size: int = 10, 119 | alpha: int = 20, 120 | device: Union[str, torch.device] = "cpu", 121 | ): 122 | """ 123 | Parameters 124 | ---------- 125 | embedding_size : int, optional 126 | size of each value's embedding vector; default is 10 127 | alpha : int, optional 128 | controls the weighting of each embedding vector with the default; 129 | when `alpha`-many values are seen at initialization; the final 130 | vector is evenly weighted; the influence of the default is decreased 131 | with either higher counts or lower `alpha`; default is 20 132 | device : string or torch.device 133 | 134 | """ 135 | super().__init__() 136 | #self.offsets 137 | self.embedding_size = embedding_size 138 | self.alpha = alpha 139 | self.num_fields = 0 140 | self.output_size = 0 141 | self.max_values: Optional[Tensor] = None 142 | self.offsets: Optional[Tensor] = None 143 | self.counts: Optional[Tensor] = None 144 | self.num_cat_vals = 0 145 | self.embedding: Optional[nn.Embedding] = None 146 | self._device = device 147 | self.to(device) 148 | self._isfit = False 149 | 150 | def __repr__(self): 151 | embed_size = self.embedding_size 152 | alpha = self.alpha 153 | device = repr(self._device) 154 | return f"FastDefaultEmbedding({embed_size}, {alpha}, {device})" 155 | 156 | def from_summary(self, class_counts: List[List[int]]) -> "FastDefaultEmbedding": 157 | """ 158 | Create the embedding from known value counts for each field 159 | 160 | Parameters 161 | ---------- 162 | class_counts : list of list of int 163 | each sub-list has count of category occurrences, 164 | one sub-list for each field 165 | 166 | Return 167 | ------ 168 | self 169 | 170 | """ 171 | num_fields = len(class_counts) 172 | num_uniques = [len(counts) for counts in class_counts] 173 | max_values = [x - 1 for x in num_uniques] 174 | offsets = [0] + list(np.cumsum(num_uniques[:-1])) 175 | num_embed = sum(num_uniques) + num_fields 176 | counts_flat = [count for field in class_counts for count in field] 177 | 178 | self.num_fields = num_fields 179 | self.output_size = self.num_fields * self.embedding_size 180 | self.max_values = torch.tensor(max_values, device=self._device).reshape((1, -1)) 181 | self.offsets = torch.tensor(offsets, device=self._device) 182 | self.counts = torch.tensor(counts_flat).to(self._device) 183 | self.num_cat_vals = sum(num_uniques) 184 | self.embedding = nn.Embedding(num_embed, self.embedding_size).to(self._device) 185 | nn.init.xavier_uniform_(self.embedding.weight) 186 | 187 | self._isfit = True 188 | 189 | return self 190 | 191 | def forward(self, X: Tensor) -> Tensor: 192 | """ 193 | Produce embedding for each value in input 194 | 195 | Parameters 196 | ---------- 197 | X : torch.Tensor 198 | 199 | Return 200 | ------ 201 | torch.Tensor 202 | 203 | """ 204 | if not self._isfit: 205 | raise RuntimeError("need to call `fit` or `from_summary` first") 206 | 207 | offsets = self.offsets.expand(X.shape[0], self.offsets.shape[0]) 208 | X_offset = X + offsets 209 | 210 | unxpcted = (X > self.max_values) 211 | X_offset[unxpcted] = offsets[unxpcted] # block any unexpected categories 212 | 213 | counts = self.counts.expand(X_offset.shape[0], self.counts.shape[0]) 214 | counts = torch.gather(counts, dim=1, index=X_offset) 215 | weights = (counts / (counts + self.alpha)).unsqueeze(-1) 216 | 
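        # weight = count / (count + alpha): frequent categories rely mostly on
        # their own vector, rare ones on the per-field default; unexpected
        # categories get weight 0 below, i.e. the default vector alone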
weights[unxpcted] = 0 # block any unexpected categories 217 | primary = self.embedding(X_offset) 218 | default = self.embedding.weight[self.num_cat_vals:, :].unsqueeze(0) 219 | return weights * primary + (1 - weights) * default 220 | -------------------------------------------------------------------------------- /tests/test_base_classes/test_modules.py: -------------------------------------------------------------------------------- 1 | 2 | import pytest 3 | import torch 4 | from torch import nn 5 | from pytorch_lightning import Trainer 6 | 7 | from xynn.embedding import RaggedEmbedding, LinearEmbedding 8 | from ..common import SimpleMLP, simple_train_inputs, SimpleEmbedding 9 | 10 | 11 | def test_that_basenn_raises_error_for_bad_task_value(): 12 | with pytest.raises( 13 | ValueError, 14 | match=( 15 | "task classy-regression not recognized; " 16 | "should be 'regression' or 'classification'" 17 | ) 18 | ): 19 | SimpleMLP(task="classy-regression") 20 | 21 | 22 | def test_that_basenn_raises_error_when_configuring_optimizer_without_setting(): 23 | mlp = SimpleMLP() 24 | with pytest.raises( 25 | RuntimeError, 26 | match=( 27 | "The optimizer and learning rate info needs to first be set " 28 | "with the `set_optimizer` method" 29 | ), 30 | ): 31 | mlp.configure_optimizers() 32 | 33 | 34 | def test_num_parameters_against_known_value(): 35 | mlp = SimpleMLP() 36 | assert mlp.num_parameters() == 11 * 7 + 7 + 7 * 3 + 3 37 | mlp.embedding_num = SimpleEmbedding(20, 3) 38 | assert mlp.num_parameters() == 20 * 3 + 11 * 7 + 7 + 7 * 3 + 3 39 | 40 | 41 | def test_embedding_sum_against_known_value(): 42 | mlp = SimpleMLP() 43 | mlp.embedding_num = SimpleEmbedding(20, 3) 44 | assert mlp.embedding_sum() == mlp.embedding_num.weight_sum() 45 | 46 | mlp.embedding_num.embedding.weight = nn.Parameter( 47 | torch.tensor([[-1, 0, 1]] * 20, dtype=torch.float32) 48 | ) 49 | mlp.embedding_cat = SimpleEmbedding(10, 4) 50 | mlp.embedding_cat.embedding.weight = nn.Parameter( 51 | torch.tensor([[-1, 0, 1, 2]] * 10, dtype=torch.float32) 52 | ) 53 | assert mlp.embedding_sum() == (80, 100) 54 | 55 | 56 | def test_that_embed_raises_error_when_both_Xs_none(): 57 | mlp = SimpleMLP() 58 | with pytest.raises(ValueError, match="X_num and X_cat cannot both be None"): 59 | mlp.embed(None, None) 60 | 61 | 62 | def test_that_embed_raises_error_for_bad_num_dim(): 63 | X_num = torch.tensor([[0, 5, 15], [1, 6, 16]]) 64 | X_cat = torch.tensor([[0, 5], [1, 6]]) 65 | mlp = SimpleMLP( 66 | embedding_num=SimpleEmbedding(20, 3), 67 | embedding_cat=SimpleEmbedding(10, 3), 68 | ) 69 | 70 | with pytest.raises(ValueError, match="num_dim should be 2 or 3, got 4"): 71 | mlp.embed(X_num, X_cat, num_dim=4) 72 | 73 | with pytest.raises(ValueError, match="num_dim should be 2 or 3, got any"): 74 | mlp.embed(X_num, X_cat, num_dim="any") 75 | 76 | 77 | def test_that_embed_raises_error_for_num_dim_3_with_ragged_embedding(): 78 | X_num = torch.tensor([[0, 5, 15], [1, 6, 16]]) 79 | X_cat = torch.tensor([[0, 5], [1, 6]]) 80 | mlp = SimpleMLP( 81 | embedding_num=SimpleEmbedding(20, 3), 82 | embedding_cat=RaggedEmbedding(), 83 | ) 84 | with pytest.raises(ValueError, match="cannot use num_dim=3 with ragged embeddings"): 85 | mlp.embed(X_num, X_cat, num_dim=3) 86 | 87 | 88 | def test_embed(): 89 | X_num = torch.tensor([[0, 5, 15], [1, 6, 16]]) 90 | X_cat = torch.tensor([[0, 5], [1, 6]]) 91 | mlp = SimpleMLP( 92 | embedding_num=SimpleEmbedding(20, 3), 93 | embedding_cat=SimpleEmbedding(10, 3), 94 | ) 95 | X_num_emb, X_cat_emb = mlp.embed(X_num, X_cat, 
concat=False) 96 | assert X_num_emb.shape == (2, 3, 3) 97 | assert X_cat_emb.shape == (2, 2, 3) 98 | 99 | 100 | def test_embed_when_numeric_data_has_zero_columns(): 101 | mlp = SimpleMLP( 102 | embedding_num=SimpleEmbedding(20, 3), 103 | embedding_cat=SimpleEmbedding(10, 4), 104 | ) 105 | X_cat = torch.tensor([[0, 5], [1, 6]]) 106 | X_num = torch.empty(size=(2, 0)) 107 | 108 | X_num_emb, X_cat_emb = mlp.embed(X_num, X_cat, concat=False) 109 | assert X_num_emb.shape == (2, 0, 4) 110 | assert X_cat_emb.shape == (2, 2, 4) 111 | 112 | embedded = mlp.embed(X_num, X_cat) 113 | assert embedded.shape == (2, 2, 4), str(embedded.shape) 114 | 115 | embedded = mlp.embed(X_num, X_cat, num_dim=2) 116 | assert embedded.shape == (2, 8), str(embedded.shape) 117 | 118 | # show that None works the same as the empty X_num above 119 | assert torch.all(mlp.embed(X_num, X_cat) == mlp.embed(None, X_cat)).item() 120 | 121 | 122 | def test_embed_when_categorical_data_has_zero_columns(): 123 | mlp = SimpleMLP( 124 | embedding_num=SimpleEmbedding(20, 3), 125 | embedding_cat=SimpleEmbedding(10, 4), 126 | ) 127 | X_num = torch.tensor([[0, 5, 15], [1, 6, 16]]) 128 | X_cat = torch.empty(size=(2, 0)) 129 | 130 | X_num_emb, X_cat_emb = mlp.embed(X_num, X_cat, concat=False) 131 | assert X_num_emb.shape == (2, 3, 3) 132 | assert X_cat_emb.shape == (2, 0, 3) 133 | 134 | embedded = mlp.embed(X_num, X_cat) 135 | assert embedded.shape == (2, 3, 3), str(embedded.shape) 136 | 137 | embedded = mlp.embed(X_num, X_cat, num_dim=2) 138 | assert embedded.shape == (2, 9), str(embedded.shape) 139 | 140 | # show that None works the same as the empty X_cat above 141 | assert torch.all(mlp.embed(X_num, X_cat) == mlp.embed(X_num, None)).item() 142 | 143 | 144 | def test_embed_results_without_numeric_embeddings(): 145 | X_num = torch.tensor([[0, 5, 15], [1, 6, 16]]) 146 | X_cat = torch.tensor([[0, 5], [1, 6]]) 147 | mlp = SimpleMLP( 148 | embedding_num=None, 149 | embedding_cat=SimpleEmbedding(10, 4), 150 | ) 151 | 152 | X_num_emb, X_cat_emb = mlp.embed(X_num, X_cat, concat=False) 153 | assert X_num_emb.shape == (2, 3, 1) 154 | assert X_cat_emb.shape == (2, 2, 4) 155 | 156 | X_num_emb, X_cat_emb = mlp.embed(X_num, X_cat, num_dim=2, concat=False) 157 | assert X_num_emb.shape == (2, 3) 158 | assert X_cat_emb.shape == (2, 8) 159 | 160 | embedded = mlp.embed(X_num, X_cat, num_dim=2) 161 | assert embedded.shape == (2, 11) 162 | 163 | # cannot concat with different numbers in dim 2 164 | with pytest.raises(RuntimeError): 165 | mlp.embed(X_num, X_cat) 166 | 167 | 168 | def test_embed_results_without_categorical_embeddings(): 169 | X_num = torch.tensor([[0, 5, 15], [1, 6, 16]]) 170 | X_cat = torch.tensor([[0, 5], [1, 6]]) 171 | mlp = SimpleMLP( 172 | embedding_num=SimpleEmbedding(20, 3), 173 | embedding_cat=None, 174 | ) 175 | 176 | X_num_emb, X_cat_emb = mlp.embed(X_num, X_cat, concat=False) 177 | assert X_num_emb.shape == (2, 3, 3) 178 | assert X_cat_emb.shape == (2, 0, 3) 179 | 180 | embedded = mlp.embed(X_num, X_cat) 181 | assert embedded.shape == (2, 3, 3) 182 | 183 | embedded = mlp.embed(X_num, X_cat, num_dim=2) 184 | assert embedded.shape == (2, 9) 185 | 186 | 187 | def test_embed_results_without_any_embeddings(): 188 | X_num = torch.tensor([[0, 5, 15], [1, 6, 16]]) 189 | X_cat = torch.tensor([[0, 5], [1, 6]]) 190 | mlp = SimpleMLP( 191 | embedding_num=None, 192 | embedding_cat=None, 193 | ) 194 | 195 | X_num_emb, X_cat_emb = mlp.embed(X_num, X_cat, concat=False) 196 | assert torch.all(X_num_emb == X_num.reshape((2, 3, 1))).item() 197 | assert 
X_cat_emb.shape == (2, 0, 1) 198 | 199 | embedded = mlp.embed(X_num, X_cat) 200 | assert torch.all(embedded == X_num.reshape((2, 3, 1))).item() 201 | 202 | embedded = mlp.embed(X_num, X_cat, num_dim=2) 203 | assert torch.all(embedded == X_num).item() 204 | 205 | 206 | def test_embed_results_with_ragged_embedding(): 207 | X_num = torch.tensor([[0, 5, 15], [1, 6, 16]]) 208 | X_cat = torch.tensor([[0, 5], [1, 6]]) 209 | 210 | mlp = SimpleMLP( 211 | embedding_num=None, 212 | embedding_cat=RaggedEmbedding(embedding_size=(3, 4)).fit(X_cat), 213 | ) 214 | X_num_emb, X_cat_emb = mlp.embed(X_num, X_cat, num_dim=2, concat=False) 215 | assert X_num_emb.shape == (2, 3) 216 | assert X_cat_emb.shape == (2, 7) 217 | 218 | embedded = mlp.embed(X_num, X_cat, num_dim=2) 219 | assert embedded.shape == (2, 10) 220 | 221 | mlp = SimpleMLP( 222 | embedding_num=LinearEmbedding(embedding_size=3).fit(X_num), 223 | embedding_cat=RaggedEmbedding(embedding_size=(3, 4)).fit(X_cat), 224 | ) 225 | X_num_emb, X_cat_emb = mlp.embed(X_num, X_cat, num_dim=2, concat=False) 226 | assert X_num_emb.shape == (2, 9) 227 | assert X_cat_emb.shape == (2, 7) 228 | 229 | embedded = mlp.embed(X_num, X_cat, num_dim=2) 230 | assert embedded.shape == (2, 16) 231 | 232 | 233 | def test_that_pytorch_lightning_runs_without_error(): 234 | model, train_dl, valid_dl = simple_train_inputs(configure=False) 235 | test_dl = valid_dl # just to check that the code runs 236 | trainer = Trainer(max_epochs=5) 237 | trainer.fit(model, train_dl, valid_dl) 238 | trainer.test(model, test_dl) 239 | -------------------------------------------------------------------------------- /xynn/fibinet/estimators.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scikit-learn style classes for the FiBiNet model 3 | 4 | """ 5 | 6 | import textwrap 7 | from typing import Type, Union, Callable, List, Tuple, Optional 8 | 9 | import torch 10 | from torch import nn 11 | 12 | from .modules import FiBiNet 13 | from ..base_classes.estimators import BaseClassifier, BaseRegressor, ESTIMATOR_INIT_DOC 14 | from ..embedding import EmbeddingBase 15 | 16 | 17 | INIT_DOC = ESTIMATOR_INIT_DOC.format( 18 | textwrap.dedent( 19 | """\ 20 | fibi_reduction_ratio : int, optional 21 | used in the SENET layer; default is 3 22 | fibi_activation : subclass of torch.nn.Module, optional 23 | activation used in the SENET layer; default is nn.LeakyReLU 24 | fibi_senet_product : str, optional 25 | options: 26 | - "field-all" 27 | - "field-each" 28 | - "field-interaction" 29 | - "sym-all" 30 | - "sym-each" 31 | - "sym-interaction" 32 | - "hadamard" 33 | "field" : 34 | the original asymmetric bilinear products, with products like 35 | `linear(field_1) * field_2` 36 | where `*` is elementwise multiplication 37 | "sym" : 38 | symmetric versions of the "field" products 39 | `(linear(field_1) * field_2 + field_1 * linear(field_2)) / 2` 40 | "all" : a single product matrix is shared across all pairs of fields 41 | "each" : each field has an associated product matrix 42 | "interaction" : each pair of fields has an associated product matrix 43 | "hadamard" : elementwise multiplication of each pair of fields 44 | default is \"sym-interaction\" 45 | fibi_embed_product : str, optional 46 | options: 47 | - "shared" 48 | - "field-all" 49 | - "field-each" 50 | - "field-interaction" 51 | - "sym-all" 52 | - "sym-each" 53 | - "sym-interaction" 54 | - "hadamard" 55 | "shared" : 56 | use the same product layer (not just the same option) as the SENET 57 | product (previous 
parameter) 58 | for descriptions of other options, see notes under `fibi_senet_product`; 59 | default is \"sym-interaction\" 60 | fibi_senet_skip: bool, optional 61 | whether SENET output should also be used in both the MLP and Bilinear 62 | layer (True), or just the Bilinear layer (False); see FiBiNet.diagram(); 63 | default is True""" 64 | ) 65 | ) 66 | 67 | 68 | class FiBiNetClassifier(BaseClassifier): 69 | """ 70 | Scikit-learn style classification model for the FiBiNet model 71 | 72 | """ 73 | 74 | diagram = FiBiNet.diagram 75 | 76 | def __init__( 77 | self, 78 | embedding_num: Optional[Union[str, EmbeddingBase]] = "auto", 79 | embedding_cat: Optional[Union[str, EmbeddingBase]] = "auto", 80 | embedding_l1_reg: float = 0.0, 81 | embedding_l2_reg: float = 0.0, 82 | fibi_reduction_ratio: int = 3, 83 | fibi_activation: Type[nn.Module] = nn.LeakyReLU, 84 | fibi_senet_product: str = "sym-interaction", 85 | fibi_embed_product: str = "sym-interaction", 86 | fibi_senet_skip: bool = True, 87 | mlp_hidden_sizes: Union[int, Tuple[int, ...], List[int]] = (512, 256, 128, 64), 88 | mlp_activation: Type[nn.Module] = nn.LeakyReLU, 89 | mlp_use_bn: bool = True, 90 | mlp_bn_momentum: float = 0.1, 91 | mlp_ghost_batch: Optional[int] = None, 92 | mlp_dropout: float = 0.0, 93 | mlp_l1_reg: float = 0.0, 94 | mlp_l2_reg: float = 0.0, 95 | mlp_use_skip: bool = True, 96 | use_leaky_gate: bool = True, 97 | loss_fn: Union[str, Callable] = "auto", 98 | seed: Union[int, None] = None, 99 | device: Union[str, torch.device] = "cpu", 100 | ): 101 | super().__init__( 102 | embedding_num=embedding_num, 103 | embedding_cat=embedding_cat, 104 | embedding_l1_reg=embedding_l1_reg, 105 | embedding_l2_reg=embedding_l2_reg, 106 | fibi_reduction_ratio=fibi_reduction_ratio, 107 | fibi_activation=fibi_activation, 108 | fibi_senet_product=fibi_senet_product, 109 | fibi_embed_product=fibi_embed_product, 110 | fibi_senet_skip=fibi_senet_skip, 111 | mlp_hidden_sizes=mlp_hidden_sizes, 112 | mlp_activation=mlp_activation, 113 | mlp_use_bn=mlp_use_bn, 114 | mlp_bn_momentum=mlp_bn_momentum, 115 | mlp_ghost_batch=mlp_ghost_batch, 116 | mlp_dropout=mlp_dropout, 117 | mlp_l1_reg=mlp_l1_reg, 118 | mlp_l2_reg=mlp_l2_reg, 119 | mlp_use_skip=mlp_use_skip, 120 | use_leaky_gate=use_leaky_gate, 121 | loss_fn=loss_fn, 122 | seed=seed, 123 | device=device, 124 | ) 125 | self._model_class = FiBiNet 126 | 127 | __init__.__doc__ = INIT_DOC 128 | 129 | def _create_model(self): 130 | model_kwargs = { 131 | k: v for k, v in self.model_kwargs.items() if k != "embed_numeric_fields" 132 | } 133 | self._model = self._model_class( 134 | task="classification", 135 | output_size=len(self.classes), 136 | embedding_num=self.embedding_num, 137 | embedding_cat=self.embedding_cat, 138 | num_numeric_fields=self._num_numeric_fields, 139 | loss_fn=self.loss_fn, 140 | device=self._device, 141 | **model_kwargs, 142 | ) 143 | 144 | 145 | class FiBiNetRegressor(BaseRegressor): 146 | """ 147 | Scikit-learn style regression model for the FiBiNet model 148 | 149 | """ 150 | 151 | diagram = FiBiNet.diagram 152 | 153 | def __init__( 154 | self, 155 | embedding_num: Optional[Union[str, EmbeddingBase]] = "auto", 156 | embedding_cat: Optional[Union[str, EmbeddingBase]] = "auto", 157 | embedding_l1_reg: float = 0.0, 158 | embedding_l2_reg: float = 0.0, 159 | fibi_reduction_ratio: int = 3, 160 | fibi_activation: Type[nn.Module] = nn.LeakyReLU, 161 | fibi_senet_product: str = "sym-interaction", 162 | fibi_embed_product: str = "sym-interaction", 163 | fibi_senet_skip: bool = True, 164 
| mlp_hidden_sizes: Union[int, Tuple[int, ...], List[int]] = (512, 256, 128, 64), 165 | mlp_activation: Type[nn.Module] = nn.LeakyReLU, 166 | mlp_use_bn: bool = True, 167 | mlp_bn_momentum: float = 0.1, 168 | mlp_ghost_batch: Optional[int] = None, 169 | mlp_dropout: float = 0.0, 170 | mlp_l1_reg: float = 0.0, 171 | mlp_l2_reg: float = 0.0, 172 | mlp_use_skip: bool = True, 173 | use_leaky_gate: bool = True, 174 | loss_fn: Union[str, Callable] = "auto", 175 | seed: Union[int, None] = None, 176 | device: Union[str, torch.device] = "cpu", 177 | ): 178 | super().__init__( 179 | embedding_num=embedding_num, 180 | embedding_cat=embedding_cat, 181 | embedding_l1_reg=embedding_l1_reg, 182 | embedding_l2_reg=embedding_l2_reg, 183 | fibi_reduction_ratio=fibi_reduction_ratio, 184 | fibi_activation=fibi_activation, 185 | fibi_senet_product=fibi_senet_product, 186 | fibi_embed_product=fibi_embed_product, 187 | fibi_senet_skip=fibi_senet_skip, 188 | mlp_hidden_sizes=mlp_hidden_sizes, 189 | mlp_activation=mlp_activation, 190 | mlp_use_bn=mlp_use_bn, 191 | mlp_bn_momentum=mlp_bn_momentum, 192 | mlp_ghost_batch=mlp_ghost_batch, 193 | mlp_dropout=mlp_dropout, 194 | mlp_l1_reg=mlp_l1_reg, 195 | mlp_l2_reg=mlp_l2_reg, 196 | mlp_use_skip=mlp_use_skip, 197 | use_leaky_gate=use_leaky_gate, 198 | loss_fn=loss_fn, 199 | seed=seed, 200 | device=device, 201 | ) 202 | self._model_class = FiBiNet 203 | 204 | __init__.__doc__ = INIT_DOC 205 | 206 | def _create_model(self): 207 | model_kwargs = { 208 | k: v for k, v in self.model_kwargs.items() if k != "embed_numeric_fields" 209 | } 210 | self._model = self._model_class( 211 | task="regression", 212 | output_size=self.num_targets, 213 | embedding_num=self.embedding_num, 214 | embedding_cat=self.embedding_cat, 215 | num_numeric_fields=self._num_numeric_fields, 216 | loss_fn=self.loss_fn, 217 | device=self._device, 218 | **model_kwargs, 219 | ) 220 | -------------------------------------------------------------------------------- /tests/test_embedding/test_uniform/test_numeric.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import torch 3 | from torch import nn 4 | from torch.utils.data import DataLoader 5 | import pytest 6 | 7 | from xynn.embedding import LinearEmbedding, DenseEmbedding 8 | from ...common import simple_train_loop 9 | from ..utils import example_data, Reshape, SimpleDataset 10 | 11 | 12 | def test_that_linearembedding_must_be_fit(): 13 | embedding = LinearEmbedding(embedding_size=2) 14 | data_test = pd.DataFrame( 15 | { 16 | "num_a": [1, 0, 0.5, 0, -1], 17 | "num_b": [1, 0.5, 0, 0, -1], 18 | } 19 | ) 20 | msg = "need to call `fit` or `from_summary` first" 21 | with pytest.raises(RuntimeError, match=msg): 22 | embedding(data_test.values) 23 | 24 | 25 | def test_linearembedding_repr(): 26 | embedding = LinearEmbedding(embedding_size=2) 27 | assert repr(embedding) == "LinearEmbedding(2, 'cpu')" 28 | embedding = LinearEmbedding() 29 | assert repr(embedding) == "LinearEmbedding(10, 'cpu')" 30 | 31 | 32 | def test_linearembedding_with_pandas_example(): 33 | data_num = example_data()[["num_a", "num_b"]] 34 | embedding = LinearEmbedding(embedding_size=3).fit(data_num) 35 | data_test = pd.DataFrame( 36 | { 37 | "num_a": [1, 0, 0.5, 0, -1], 38 | "num_b": [1, 0.5, 0, 0, -1], 39 | } 40 | ) 41 | weight = embedding.embedding.weight 42 | output = embedding(torch.from_numpy(data_test.values)).to(dtype=weight.dtype) 43 | assert weight.shape == (2, 3) 44 | assert output.shape == (5, 2, 3) 45 | # test returned 
vectors vs weight matrix 46 | assert torch.all(output[0] == weight).item() 47 | assert torch.all(output[1, 0] == 0).item() 48 | assert torch.allclose(output[1, 1], 0.5 * weight[1]) 49 | assert torch.allclose(output[2, 0], 0.5 * weight[0]) 50 | assert torch.all(output[2, 1] == 0).item() 51 | assert torch.all(output[3] == 0).item() 52 | assert torch.all(output[4] == -weight).item() 53 | 54 | 55 | def test_linearembedding_with_tensor_example(): 56 | data_num = torch.from_numpy(example_data()[["num_a", "num_b"]].values) 57 | embedding = LinearEmbedding(embedding_size=3).fit(data_num) 58 | data_test = pd.DataFrame( 59 | { 60 | "num_a": [1, 0, 0.5, 0, -1], 61 | "num_b": [1, 0.5, 0, 0, -1], 62 | } 63 | ) 64 | weight = embedding.embedding.weight 65 | output = embedding(torch.from_numpy(data_test.values)).to(dtype=weight.dtype) 66 | assert weight.shape == (2, 3) 67 | assert output.shape == (5, 2, 3) 68 | # test returned vectors vs weight matrix 69 | assert torch.all(output[0] == weight).item() 70 | assert torch.all(output[1, 0] == 0).item() 71 | assert torch.allclose(output[1, 1], 0.5 * weight[1]) 72 | assert torch.allclose(output[2, 0], 0.5 * weight[0]) 73 | assert torch.all(output[2, 1] == 0).item() 74 | assert torch.all(output[3] == 0).item() 75 | assert torch.all(output[4] == -weight).item() 76 | 77 | 78 | def test_linearembedding_with_dataloader(): 79 | data_num = torch.from_numpy(example_data()[["num_a", "num_b"]].values) 80 | dataset = SimpleDataset(data_num) 81 | dataloader = DataLoader(dataset, batch_size=5) 82 | embedding = LinearEmbedding(embedding_size=3).fit(dataloader) 83 | data_test = pd.DataFrame( 84 | { 85 | "num_a": [1, 0, 0.5, 0, -1], 86 | "num_b": [1, 0.5, 0, 0, -1], 87 | } 88 | ) 89 | weight = embedding.embedding.weight 90 | output = embedding(torch.from_numpy(data_test.values)).to(dtype=weight.dtype) 91 | assert weight.shape == (2, 3) 92 | assert output.shape == (5, 2, 3) 93 | # test returned vectors vs weight matrix 94 | assert torch.all(output[0] == weight).item() 95 | assert torch.all(output[1, 0] == 0).item() 96 | assert torch.allclose(output[1, 1], 0.5 * weight[1]) 97 | assert torch.allclose(output[2, 0], 0.5 * weight[0]) 98 | assert torch.all(output[2, 1] == 0).item() 99 | assert torch.all(output[3] == 0).item() 100 | assert torch.all(output[4] == -weight).item() 101 | 102 | 103 | def test_that_linearembedding_learns(): 104 | X = torch.rand((100, 10)) 105 | y = torch.rand((100, 3)) 106 | embedding = LinearEmbedding(embedding_size=3).fit(X) 107 | model = nn.Sequential(embedding, Reshape(), nn.Linear(30, 3)) 108 | loss_func = nn.MSELoss() 109 | optimizer = torch.optim.Adam(embedding.parameters(), lr=1e-1) # not model.parameters() 110 | wt_before = torch.clone(embedding.embedding.weight) 111 | loss_vals = simple_train_loop(model, X, y, loss_func, optimizer, num_epochs=5) 112 | assert torch.all(embedding.embedding.weight != wt_before).item() 113 | assert loss_vals[0] > loss_vals[-1] 114 | 115 | 116 | def test_that_denseembedding_must_be_fit(): 117 | embedding = DenseEmbedding(embedding_size=(2, 2)) 118 | data_test = pd.DataFrame( 119 | { 120 | "num_a": [1, 0, 0.5, 0, -1], 121 | "num_b": [1, 0.5, 0, 0, -1], 122 | } 123 | ) 124 | msg = "need to call `fit` or `from_summary` first" 125 | with pytest.raises(RuntimeError, match=msg): 126 | embedding(data_test.values) 127 | 128 | 129 | def test_denseembedding_repr(): 130 | embedding = DenseEmbedding(embedding_size=(2, 2)) 131 | assert repr(embedding) == "DenseEmbedding((2, 2), LeakyReLU, 'cpu')" 132 | embedding = 
DenseEmbedding(activation=nn.ReLU) 133 | assert repr(embedding) == "DenseEmbedding((1, 10), ReLU, 'cpu')" 134 | 135 | 136 | def test_denseembedding_with_pandas_example(): 137 | data_num = example_data()[["num_a", "num_b"]] 138 | embedding = DenseEmbedding(embedding_size=3, activation=nn.ReLU).fit(data_num) 139 | data_test = pd.DataFrame( 140 | { 141 | "num_a": [1, 0, 0.5, 0.0, -1], 142 | "num_b": [0, 1, 0.0, 0.5, -1], 143 | } 144 | ) 145 | emb_w = embedding.embedding_w 146 | emb_b = embedding.embedding_b 147 | output = embedding(torch.from_numpy(data_test.values)).to(dtype=emb_w.dtype) 148 | assert emb_w.shape == (2, 1, 3) 149 | assert emb_b.shape == (1, 3) 150 | assert output.shape == (5, 1, 3) 151 | ## test returned vectors vs weight matrix 152 | identity_relu = emb_b + torch.where( 153 | emb_w > 0, emb_w, torch.zeros(emb_w.shape, dtype=emb_w.dtype) 154 | ) 155 | assert torch.allclose(output[:2], identity_relu) 156 | 157 | 158 | def test_denseembedding_with_tensor_example(): 159 | data_num = torch.from_numpy(example_data()[["num_a", "num_b"]].values) 160 | embedding = DenseEmbedding(embedding_size=(2, 3), activation=nn.ReLU).fit(data_num) 161 | data_test = pd.DataFrame( 162 | { 163 | "num_a": [1, 0, 0.5, 0.0, -1], 164 | "num_b": [0, 1, 0.0, 0.5, -1], 165 | } 166 | ) 167 | emb_w = embedding.embedding_w 168 | emb_b = embedding.embedding_b 169 | output = embedding(torch.from_numpy(data_test.values)).to(dtype=emb_w.dtype) 170 | assert emb_w.shape == (2, 2, 3) 171 | assert emb_b.shape == (2, 3) 172 | assert output.shape == (5, 2, 3) 173 | ## test returned vectors vs weight matrix 174 | identity_relu = emb_b + torch.where( 175 | emb_w > 0, emb_w, torch.zeros(emb_w.shape, dtype=emb_w.dtype) 176 | ) 177 | assert torch.allclose(output[:2], identity_relu) 178 | 179 | 180 | def test_denseembedding_with_dataloader(): 181 | data_num = torch.from_numpy(example_data()[["num_a", "num_b"]].values) 182 | dataset = SimpleDataset(data_num) 183 | dataloader = DataLoader(dataset, batch_size=5) 184 | embedding = DenseEmbedding(embedding_size=(1, 3), activation=nn.ReLU).fit(dataloader) 185 | data_test = pd.DataFrame( 186 | { 187 | "num_a": [1, 0, 0.5, 0.0, -1], 188 | "num_b": [0, 1, 0.0, 0.5, -1], 189 | } 190 | ) 191 | emb_w = embedding.embedding_w 192 | emb_b = embedding.embedding_b 193 | output = embedding(torch.from_numpy(data_test.values)).to(dtype=emb_w.dtype) 194 | assert emb_w.shape == (2, 1, 3) 195 | assert emb_b.shape == (1, 3) 196 | assert output.shape == (5, 1, 3) 197 | ## test returned vectors vs weight matrix 198 | identity_relu = emb_b + torch.where( 199 | emb_w > 0, emb_w, torch.zeros(emb_w.shape, dtype=emb_w.dtype) 200 | ) 201 | assert torch.allclose(output[:2], identity_relu) 202 | 203 | 204 | def test_that_denseembedding_learns(): 205 | X = torch.rand((100, 10)) 206 | y = torch.rand((100, 3)) 207 | embedding = DenseEmbedding(embedding_size=(10, 3)).fit(X) 208 | model = nn.Sequential(embedding, Reshape(), nn.Linear(30, 3)) 209 | loss_func = nn.MSELoss() 210 | optimizer = torch.optim.Adam(embedding.parameters(), lr=1e-1) # not model.parameters() 211 | wt_before = torch.clone(embedding.embedding_w) 212 | loss_vals = simple_train_loop(model, X, y, loss_func, optimizer, num_epochs=5) 213 | assert torch.all(embedding.embedding_w != wt_before).item() 214 | assert loss_vals[0] > loss_vals[-1] 215 | -------------------------------------------------------------------------------- /xynn/mlp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for MLP 
(multi-layer perceptron) and related modules 3 | 4 | """ 5 | 6 | from math import ceil 7 | from typing import Union, Tuple, List, Type, Optional 8 | 9 | import torch 10 | from torch import Tensor 11 | from torch import nn 12 | 13 | from .ghost_norm import GhostBatchNorm 14 | 15 | 16 | class LeakyGate(nn.Module): 17 | """ 18 | This performs an element-wise linear transformation followed by a chosen 19 | activation; the default activation is nn.LeakyReLU. Fields may be 20 | represented by individual values or vectors of values (i.e., embedded). 21 | 22 | Input needs to be shaped like (num_rows, num_fields) or 23 | (num_rows, num_fields, embedding_size) 24 | 25 | """ 26 | 27 | def __init__( 28 | self, 29 | input_size: int, 30 | bias: bool = True, 31 | activation: Type[nn.Module] = nn.LeakyReLU, 32 | device: Union[str, torch.device] = "cpu", 33 | ): 34 | """ 35 | Parameters 36 | ---------- 37 | input_size : int 38 | bias : boolean, optional 39 | whether to include an additive bias; default is True 40 | activation : torch.nn.Module, optional 41 | default is nn.LeakyReLU 42 | device : string or torch.device, optional 43 | default is "cpu" 44 | 45 | """ 46 | super().__init__() 47 | self.weight = nn.Parameter(torch.normal(mean=0, std=1.0, size=(1, input_size))) 48 | self.bias = nn.Parameter(torch.zeros(size=(1, input_size)), requires_grad=bias) 49 | self.activation = activation() 50 | self.to(device) 51 | 52 | def forward(self, X: Tensor) -> Tensor: 53 | """ 54 | Transform the input tensor 55 | 56 | Parameters 57 | ---------- 58 | X : torch.Tensor 59 | 60 | Return 61 | ------ 62 | torch.Tensor 63 | 64 | """ 65 | out = X 66 | if len(X.shape) > 2: 67 | out = out.reshape((X.shape[0], -1)) 68 | out = out * self.weight + self.bias 69 | if len(X.shape) > 2: 70 | out = out.reshape(X.shape) 71 | out = self.activation(out) 72 | return out 73 | 74 | 75 | class MLP(nn.Module): 76 | """ 77 | A "multi-layer perceptron". This forms layers of fully-connected linear 78 | transformations, with optional batch norm, dropout, and an initial 79 | "leaky gate".
80 | 81 | Input should be shaped like (num_rows, num_fields) 82 | 83 | """ 84 | 85 | def __init__( 86 | self, 87 | task: str, 88 | input_size: int, 89 | hidden_sizes: Union[int, Tuple[int, ...], List[int]], 90 | output_size: int, 91 | activation: Type[nn.Module] = nn.LeakyReLU, 92 | dropout: Union[float, Tuple[float, ...], List[float]] = 0.0, 93 | dropout_first: bool = False, 94 | use_bn: bool = True, 95 | bn_momentum: float = 0.1, 96 | ghost_batch: Optional[int] = None, 97 | leaky_gate: bool = True, 98 | use_skip: bool = True, 99 | weighted_sum: bool = True, 100 | device: Union[str, torch.device] = "cpu", 101 | ): 102 | """ 103 | Parameters 104 | ---------- 105 | task : {"regression", "classification"} 106 | input_size : int 107 | the number of inputs into the first layer 108 | hidden_sizes : iterable of int 109 | intermediate sizes between `input_size` and `output_size` 110 | output_size : int 111 | the number of outputs from the last layer 112 | activation : subclass of torch.nn.Module (uninitialized), optional 113 | default is nn.LeakyReLU 114 | dropout : float or iterable of float 115 | should be between 0.0 and 1.0; if iterable of float, there 116 | should be one value for each hidden size, plus an additional 117 | value if `dropout_first` is True 118 | dropout_first : boolean, optional 119 | whether to include dropout before the first fully-connected 120 | linear layer (and after "leaky_gate", if using); 121 | default is False 122 | use_bn : boolean, optional 123 | whether to use batch normalization; default is True 124 | bn_momentum : float, optional 125 | default is 0.1 126 | ghost_batch : int or None, optional 127 | only used if `use_bn` is True; size of batch in "ghost batch norm"; 128 | if None, normal batch norm is used; default is None 129 | leaky_gate : boolean, optional 130 | whether to include a LeakyGate layer before the linear layers; 131 | default is True 132 | use_skip : boolean, optional 133 | use a side path containing just the optional leaky gate plus 134 | a single linear layer; default is True 135 | weighted_sum : boolean, optional 136 | only used with use_skip; when combining the main MLP output with the side 137 | "skip" output, use a weighted sum with a learnable weight; default is True 138 | device : string or torch.device, optional 139 | default is "cpu" 140 | 141 | """ 142 | super().__init__() 143 | 144 | if isinstance(hidden_sizes, int): 145 | hidden_sizes = [hidden_sizes] 146 | 147 | dropout_len = len(hidden_sizes) + (1 if dropout_first else 0) 148 | 149 | if isinstance(dropout, float): 150 | dropout = [dropout] * dropout_len 151 | elif not len(dropout) == dropout_len: 152 | raise ValueError( 153 | f"expected a single dropout value or {dropout_len} values " 154 | f"({'one more than' if dropout_first else 'same as'} hidden_sizes)" 155 | ) 156 | 157 | main_layers: List[nn.Module] = [] 158 | 159 | if leaky_gate: 160 | main_layers.append(LeakyGate(input_size)) 161 | 162 | if dropout_first and dropout[0] > 0: 163 | main_layers.append(nn.Dropout(dropout[0])) 164 | dropout = dropout[1:] 165 | 166 | input_size_i = input_size 167 | for hidden_size_i, dropout_i in zip(hidden_sizes, dropout): 168 | main_layers.append(nn.Linear(input_size_i, hidden_size_i, bias=(not use_bn))) 169 | if use_bn: 170 | if ghost_batch is None: 171 | bnlayer = nn.BatchNorm1d(hidden_size_i, momentum=bn_momentum) 172 | else: 173 | bnlayer = GhostBatchNorm( 174 | hidden_size_i, ghost_batch, momentum=bn_momentum 175 | ) 176 | main_layers.append(bnlayer) 177 | main_layers.append(activation()) 178 | if dropout_i > 0:
179 | main_layers.append(nn.Dropout(dropout_i)) 180 | input_size_i = hidden_size_i 181 | 182 | main_layers.append( 183 | nn.Linear(input_size_i, output_size, bias=(task != "classification")) 184 | ) 185 | 186 | self.main_layers = nn.Sequential(*main_layers) 187 | 188 | self.use_skip = use_skip 189 | if use_skip: 190 | skip_linear = nn.Linear(input_size, output_size, bias=(task != "classification")) 191 | if leaky_gate: 192 | self.skip_layers = nn.Sequential(LeakyGate(input_size), skip_linear) 193 | else: 194 | self.skip_layers = skip_linear 195 | if weighted_sum: 196 | self.mix = nn.Parameter(torch.tensor([0.0])) 197 | else: 198 | self.mix = torch.tensor([0.0], device=device) 199 | else: 200 | self.skip_layers = None 201 | self.mix = None 202 | 203 | self.to(device) 204 | 205 | def weight_sum(self) -> Tuple[Tensor, Tensor]: 206 | """ 207 | Sum of absolute value and squared weights, for regularization 208 | 209 | Return 210 | ------ 211 | w1 : torch.Tensor 212 | sum of absolute value of weights 213 | w2 : torch.Tensor 214 | sum of squared weights 215 | 216 | """ 217 | w1_sum = 0.0 218 | w2_sum = 0.0 219 | for layer_group in (self.main_layers, self.skip_layers): 220 | if layer_group is None: 221 | continue 222 | for layer in (layer_group if isinstance(layer_group, nn.Sequential) else [layer_group]): 223 | if not isinstance(layer, nn.Linear): 224 | continue 225 | w1_sum += layer.weight.abs().sum() 226 | w2_sum += (layer.weight ** 2).sum() 227 | return w1_sum, w2_sum 228 | 229 | def forward(self, X: Tensor) -> Tensor: 230 | """ 231 | Transform the input tensor 232 | 233 | Parameters 234 | ---------- 235 | X : torch.Tensor 236 | 237 | Return 238 | ------ 239 | torch.Tensor 240 | 241 | """ 242 | out = self.main_layers(X) 243 | if self.use_skip: 244 | mix = torch.sigmoid(self.mix) 245 | skip_out = self.skip_layers(X) 246 | out = mix * skip_out + (1 - mix) * out 247 | return out 248 | -------------------------------------------------------------------------------- /tests/test_autoint/test_modules.py: -------------------------------------------------------------------------------- 1 | 2 | import random 3 | 4 | import torch 5 | from torch import nn 6 | import numpy as np 7 | import pytest 8 | 9 | from xynn.base_classes.estimators import _set_seed 10 | from xynn.base_classes.modules import BaseNN 11 | from xynn.autoint.modules import AttnInteractionLayer, AttnInteractionBlock 12 | from xynn.autoint import AutoInt 13 | from xynn.embedding import LinearEmbedding, BasicEmbedding 14 | from xynn.mlp import LeakyGate, GhostBatchNorm 15 | 16 | from ..common import simple_train_inputs, simple_model_train_loop 17 | 18 | 19 | def test_attnlayer_basic_initialization(): 20 | attn = AttnInteractionLayer( 21 | field_input_size=5, 22 | use_residual=False, 23 | dropout=0.0, 24 | normalize=False, 25 | ) 26 | assert attn.W_q.shape == (5, 8, 2) 27 | assert attn.W_k.shape == (5, 8, 2) 28 | assert attn.W_v.shape == (5, 8, 2) 29 | assert attn.W_r is None 30 | assert isinstance(attn.w_act, nn.Identity) 31 | assert isinstance(attn.dropout, nn.Identity) 32 | assert isinstance(attn.layer_norm, nn.Identity) 33 | 34 | 35 | def test_attnlayer_intitialization_with_more_options(): 36 | attn = AttnInteractionLayer( 37 | field_input_size=5, 38 | field_output_size=10, 39 | activation=nn.ReLU, 40 | ) 41 | assert attn.W_q.shape == (5, 10, 2) 42 | assert attn.W_k.shape == (5, 10, 2) 43 | assert attn.W_v.shape == (5, 10, 2) 44 | assert attn.W_r.shape == (5, 20) 45 | assert isinstance(attn.w_act, nn.ReLU) 46 | assert isinstance(attn.dropout, nn.Dropout) 47 | assert isinstance(attn.layer_norm,
nn.LayerNorm) 48 | 49 | 50 | def test_attnlayer_output_shape(): 51 | x = torch.tensor([[[1, 0]]], dtype=torch.float) 52 | attn = AttnInteractionLayer(field_input_size=2, field_output_size=3) 53 | out = attn(x) 54 | assert out.shape == (1, 1, 6) 55 | 56 | 57 | def test_that_autoint_module_subclasses_basenn(): 58 | assert issubclass(AutoInt, BaseNN) 59 | 60 | 61 | def test_that_autoint_uses_basenn_init(): 62 | X = torch.randint(0, 10, (100, 10)) 63 | embedding_num = LinearEmbedding(embedding_size=3).fit(X) 64 | 65 | model = AutoInt( 66 | task="classification", 67 | output_size=3, 68 | embedding_num=embedding_num, 69 | embedding_cat=None, 70 | embedding_l1_reg=0.1, 71 | mlp_l2_reg=0.2, 72 | ) 73 | 74 | assert model.task == "classification" 75 | assert model.num_epochs == 0 76 | assert isinstance(model.loss_fn, nn.CrossEntropyLoss) 77 | assert model.embedding_num is embedding_num 78 | assert model.embedding_cat is None 79 | assert model.embedding_l1_reg == 0.1 80 | assert model.embedding_l2_reg == 0.0 81 | assert model.mlp_l1_reg == 0.0 82 | assert model.mlp_l2_reg == 0.2 83 | assert model.optimizer is None 84 | assert model.optimizer_info == {} 85 | assert model.scheduler == {} 86 | assert model._device == "cpu" 87 | 88 | 89 | def test_that_autoint_parameters_are_passed_to_submodules(): 90 | X = torch.randint(0, 10, (100, 10)) 91 | embedding_cat = BasicEmbedding(embedding_size=3).fit(X) 92 | model = AutoInt( 93 | task="classification", 94 | output_size=3, 95 | embedding_num=None, 96 | embedding_cat=embedding_cat, 97 | mlp_activation=nn.ReLU, 98 | mlp_hidden_sizes=(512, 128, 32), 99 | mlp_use_bn=False, 100 | mlp_use_skip=False, 101 | use_leaky_gate=False, 102 | ) 103 | 104 | expected_classes = [ 105 | nn.Linear, 106 | nn.ReLU, 107 | nn.Linear, 108 | nn.ReLU, 109 | nn.Linear, 110 | nn.ReLU, 111 | nn.Linear, 112 | ] 113 | for mlp in (model.mlp, model.attn_final): 114 | assert len(mlp.main_layers) == len(expected_classes) 115 | for layer, expected_class in zip(mlp.main_layers, expected_classes): 116 | assert isinstance(layer, expected_class) 117 | assert mlp.skip_layers is None 118 | 119 | assert isinstance(model.attn_interact, AttnInteractionBlock) 120 | assert len(model.attn_interact.layers) == 3 121 | assert model.mix.requires_grad 122 | 123 | 124 | def test_that_autoint_parameters_are_passed_to_submodules_other_params(): 125 | X = torch.randint(0, 10, (100, 10)) 126 | embedding_cat = BasicEmbedding(embedding_size=3).fit(X) 127 | model = AutoInt( 128 | task="classification", 129 | output_size=3, 130 | embedding_num=None, 131 | embedding_cat=embedding_cat, 132 | attn_num_layers=2, 133 | mlp_ghost_batch=8, 134 | mlp_use_skip=True, 135 | ) 136 | 137 | expected_classes = [ 138 | LeakyGate, 139 | nn.Linear, 140 | GhostBatchNorm, 141 | nn.LeakyReLU, 142 | nn.Linear, 143 | GhostBatchNorm, 144 | nn.LeakyReLU, 145 | nn.Linear, 146 | GhostBatchNorm, 147 | nn.LeakyReLU, 148 | nn.Linear, 149 | GhostBatchNorm, 150 | nn.LeakyReLU, 151 | nn.Linear, 152 | ] 153 | for mlp in (model.mlp, model.attn_final): 154 | assert len(mlp.main_layers) == len(expected_classes) 155 | for layer, expected_class in zip(mlp.main_layers, expected_classes): 156 | assert isinstance(layer, expected_class) 157 | assert mlp.skip_layers is not None 158 | 159 | assert isinstance(model.attn_interact, AttnInteractionBlock) 160 | assert len(model.attn_interact.layers) == 2 161 | assert model.mix.requires_grad 162 | 163 | 164 | def test_that_autoint_diagram_exists_and_prints_something(capsys): 165 | AutoInt.diagram() 166 | captured = 
capsys.readouterr() 167 | assert len(captured.out.split("\n")) > 5 168 | 169 | 170 | def test_autoint_mlp_weight(): 171 | X = torch.randint(0, 10, (100, 10)) 172 | embedding_num = LinearEmbedding(embedding_size=3).fit(X) 173 | 174 | # without Linear after CIN 175 | model = AutoInt( 176 | task="regression", 177 | output_size=1, 178 | embedding_num=embedding_num, 179 | embedding_cat=None, 180 | attn_use_mlp=False, 181 | mlp_use_bn=False, 182 | mlp_use_skip=False, 183 | use_leaky_gate=False, 184 | ) 185 | 186 | exp_w1 = 0 187 | exp_w2 = 0 188 | for mlp in (model.mlp, model.attn_final): 189 | exp_w1 += sum(l.weight.abs().sum().item() for l in mlp.main_layers[::2]) 190 | exp_w2 += sum((l.weight ** 2).sum().item() for l in mlp.main_layers[::2]) 191 | 192 | w1, w2 = model.mlp_weight_sum() 193 | assert np.isclose(w1.item(), exp_w1) 194 | assert np.isclose(w2.item(), exp_w2) 195 | 196 | # with MLP after CIN 197 | model = AutoInt( 198 | task="regression", 199 | output_size=1, 200 | embedding_num=embedding_num, 201 | embedding_cat=None, 202 | attn_use_mlp=True, 203 | mlp_use_bn=False, 204 | mlp_use_skip=False, 205 | use_leaky_gate=False, 206 | ) 207 | 208 | exp_w1 = 0 209 | exp_w2 = 0 210 | for mlp in (model.mlp, model.attn_final): 211 | exp_w1 += sum(l.weight.abs().sum().item() for l in mlp.main_layers[::2]) 212 | exp_w2 += sum((l.weight ** 2).sum().item() for l in mlp.main_layers[::2]) 213 | 214 | w1, w2 = model.mlp_weight_sum() 215 | assert np.isclose(w1.item(), exp_w1) 216 | assert np.isclose(w2.item(), exp_w2) 217 | 218 | 219 | def test_that_autoint_learns(): 220 | _set_seed(10101) 221 | 222 | X_num = torch.randint(0, 10, (100, 10)) 223 | X_cat = torch.randint(0, 5, (100, 1)) 224 | y = ( 225 | (X_cat - 2) + X_num[:, ::2].sum(dim=1) - X_num[:, 1::2].sum(dim=1) 226 | ).to(dtype=torch.float) 227 | 228 | model = AutoInt( 229 | task="regression", 230 | output_size=1, 231 | embedding_num=LinearEmbedding(embedding_size=3).fit(X_num), 232 | embedding_cat=BasicEmbedding(embedding_size=3).fit(X_cat), 233 | mlp_hidden_sizes=[10, 8, 6], 234 | ) 235 | 236 | loss_func = nn.MSELoss() 237 | optimizer = torch.optim.Adam(model.parameters(), lr=1e-1) 238 | loss_vals = simple_model_train_loop( 239 | model, X_num, X_cat, y, loss_func, optimizer, num_epochs=5 240 | ) 241 | 242 | assert loss_vals[0] > loss_vals[-1] 243 | 244 | 245 | def test_that_autoint_learns_with_other_params(): 246 | _set_seed(10101) 247 | 248 | X_num = torch.randint(0, 10, (100, 10)) 249 | X_cat = torch.randint(0, 5, (100, 1)) 250 | y = ( 251 | (X_cat - 2) + X_num[:, ::2].sum(dim=1) - X_num[:, 1::2].sum(dim=1) 252 | ).to(dtype=torch.float) 253 | 254 | model = AutoInt( 255 | task="regression", 256 | output_size=1, 257 | embedding_num=LinearEmbedding(embedding_size=3).fit(X_num), 258 | embedding_cat=BasicEmbedding(embedding_size=3).fit(X_cat), 259 | mlp_hidden_sizes=[], 260 | ) 261 | 262 | loss_func = nn.MSELoss() 263 | optimizer = torch.optim.Adam(model.parameters(), lr=1e-1) 264 | loss_vals = simple_model_train_loop( 265 | model, X_num, X_cat, y, loss_func, optimizer, num_epochs=5 266 | ) 267 | 268 | assert loss_vals[0] > loss_vals[-1] 269 | -------------------------------------------------------------------------------- /xynn/pnn/estimators.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scikit-learn style classes for the PNN and PNNPlus models 3 | 4 | """ 5 | 6 | import textwrap 7 | from typing import Union, Callable, Optional, Type, List, Tuple 8 | 9 | import torch 10 | from torch import nn 11 | 
12 | from ..base_classes.estimators import BaseClassifier, BaseRegressor, ESTIMATOR_INIT_DOC 13 | from ..embedding import EmbeddingBase 14 | from .modules import PNN, PNNPlus 15 | 16 | 17 | INIT_DOC = ESTIMATOR_INIT_DOC.format( 18 | textwrap.dedent( 19 | """\ 20 | pnn_product_type : {"inner", "outer", "both"}, optional 21 | default is "outer" 22 | pnn_product_size : int, optional 23 | size of overall product output after transformation; i.e., after 24 | transformation, the batch size is num_rows x product_output_size; 25 | default is 10""" 26 | ) 27 | ) 28 | 29 | 30 | class PNNClassifier(BaseClassifier): 31 | """ 32 | Scikit-learn style classification model for the PNN model 33 | 34 | """ 35 | 36 | diagram = PNN.diagram 37 | 38 | def __init__( 39 | self, 40 | embedding_num: Optional[Union[str, EmbeddingBase]] = "auto", 41 | embedding_cat: Optional[Union[str, EmbeddingBase]] = "auto", 42 | embedding_l1_reg: float = 0.0, 43 | embedding_l2_reg: float = 0.0, 44 | pnn_product_type: str = "outer", 45 | pnn_product_size: int = 10, 46 | mlp_hidden_sizes: Union[int, Tuple[int, ...], List[int]] = (512, 256, 128, 64), 47 | mlp_activation: Type[nn.Module] = nn.LeakyReLU, 48 | mlp_use_bn: bool = True, 49 | mlp_bn_momentum: float = 0.1, 50 | mlp_ghost_batch: Optional[int] = None, 51 | mlp_dropout: float = 0.0, 52 | mlp_l1_reg: float = 0.0, 53 | mlp_l2_reg: float = 0.0, 54 | mlp_use_skip: bool = True, 55 | use_leaky_gate: bool = True, 56 | loss_fn: Union[str, Callable] = "auto", 57 | seed: Union[int, None] = None, 58 | device: Union[str, torch.device] = "cpu", 59 | ): 60 | super().__init__( 61 | embedding_num=embedding_num, 62 | embedding_cat=embedding_cat, 63 | embedding_l1_reg=embedding_l1_reg, 64 | embedding_l2_reg=embedding_l2_reg, 65 | pnn_product_type=pnn_product_type, 66 | pnn_product_size=pnn_product_size, 67 | mlp_hidden_sizes=mlp_hidden_sizes, 68 | mlp_activation=mlp_activation, 69 | mlp_use_bn=mlp_use_bn, 70 | mlp_bn_momentum=mlp_bn_momentum, 71 | mlp_ghost_batch=mlp_ghost_batch, 72 | mlp_dropout=mlp_dropout, 73 | mlp_l1_reg=mlp_l1_reg, 74 | mlp_l2_reg=mlp_l2_reg, 75 | mlp_use_skip=mlp_use_skip, 76 | use_leaky_gate=use_leaky_gate, 77 | loss_fn=loss_fn, 78 | seed=seed, 79 | device=device, 80 | ) 81 | self._model_class = PNN 82 | self._require_numeric_embedding = True 83 | 84 | __init__.__doc__ = INIT_DOC 85 | 86 | 87 | class PNNRegressor(BaseRegressor): 88 | """ 89 | Scikit-learn style regression model for the PNN model 90 | 91 | """ 92 | 93 | diagram = PNN.diagram 94 | 95 | def __init__( 96 | self, 97 | embedding_num: Optional[Union[str, EmbeddingBase]] = "auto", 98 | embedding_cat: Optional[Union[str, EmbeddingBase]] = "auto", 99 | embedding_l1_reg: float = 0.0, 100 | embedding_l2_reg: float = 0.0, 101 | pnn_product_type: str = "outer", 102 | pnn_product_size: int = 10, 103 | mlp_hidden_sizes: Union[int, Tuple[int, ...], List[int]] = (512, 256, 128, 64), 104 | mlp_activation: Type[nn.Module] = nn.LeakyReLU, 105 | mlp_use_bn: bool = True, 106 | mlp_bn_momentum: float = 0.1, 107 | mlp_ghost_batch: Optional[int] = None, 108 | mlp_dropout: float = 0.0, 109 | mlp_l1_reg: float = 0.0, 110 | mlp_l2_reg: float = 0.0, 111 | mlp_use_skip: bool = True, 112 | use_leaky_gate: bool = True, 113 | loss_fn: Union[str, Callable] = "auto", 114 | seed: Optional[int] = None, 115 | device: Union[str, torch.device] = "cpu", 116 | ): 117 | super().__init__( 118 | embedding_num=embedding_num, 119 | embedding_cat=embedding_cat, 120 | embedding_l1_reg=embedding_l1_reg, 121 | embedding_l2_reg=embedding_l2_reg, 122 | 
pnn_product_type=pnn_product_type, 123 | pnn_product_size=pnn_product_size, 124 | mlp_hidden_sizes=mlp_hidden_sizes, 125 | mlp_activation=mlp_activation, 126 | mlp_use_bn=mlp_use_bn, 127 | mlp_bn_momentum=mlp_bn_momentum, 128 | mlp_ghost_batch=mlp_ghost_batch, 129 | mlp_dropout=mlp_dropout, 130 | mlp_l1_reg=mlp_l1_reg, 131 | mlp_l2_reg=mlp_l2_reg, 132 | mlp_use_skip=mlp_use_skip, 133 | use_leaky_gate=use_leaky_gate, 134 | loss_fn=loss_fn, 135 | seed=seed, 136 | device=device, 137 | ) 138 | self._model_class = PNN 139 | self._require_numeric_embedding = True 140 | 141 | __init__.__doc__ = INIT_DOC 142 | 143 | 144 | class PNNPlusClassifier(BaseClassifier): 145 | """ 146 | Scikit-learn style classification model for the PNN-plus-MLP model 147 | 148 | """ 149 | 150 | diagram = PNNPlus.diagram 151 | 152 | def __init__( 153 | self, 154 | embedding_num: Optional[Union[str, EmbeddingBase]] = "auto", 155 | embedding_cat: Optional[Union[str, EmbeddingBase]] = "auto", 156 | embedding_l1_reg: float = 0.0, 157 | embedding_l2_reg: float = 0.0, 158 | pnn_product_type: str = "outer", 159 | pnn_product_size: int = 10, 160 | mlp_hidden_sizes: Union[int, Tuple[int, ...], List[int]] = (512, 256, 128, 64), 161 | mlp_activation: Type[nn.Module] = nn.LeakyReLU, 162 | mlp_use_bn: bool = True, 163 | mlp_bn_momentum: float = 0.1, 164 | mlp_ghost_batch: Optional[int] = None, 165 | mlp_dropout: float = 0.0, 166 | mlp_l1_reg: float = 0.0, 167 | mlp_l2_reg: float = 0.0, 168 | mlp_use_skip: bool = True, 169 | use_leaky_gate: bool = True, 170 | weighted_sum: bool = True, 171 | loss_fn: Union[str, Callable] = "auto", 172 | seed: Union[int, None] = None, 173 | device: Union[str, torch.device] = "cpu", 174 | ): 175 | super().__init__( 176 | embedding_num=embedding_num, 177 | embedding_cat=embedding_cat, 178 | embedding_l1_reg=embedding_l1_reg, 179 | embedding_l2_reg=embedding_l2_reg, 180 | pnn_product_type=pnn_product_type, 181 | pnn_product_size=pnn_product_size, 182 | mlp_hidden_sizes=mlp_hidden_sizes, 183 | mlp_activation=mlp_activation, 184 | mlp_use_bn=mlp_use_bn, 185 | mlp_bn_momentum=mlp_bn_momentum, 186 | mlp_ghost_batch=mlp_ghost_batch, 187 | mlp_dropout=mlp_dropout, 188 | mlp_l1_reg=mlp_l1_reg, 189 | mlp_l2_reg=mlp_l2_reg, 190 | mlp_use_skip=mlp_use_skip, 191 | use_leaky_gate=use_leaky_gate, 192 | weighted_sum=weighted_sum, 193 | loss_fn=loss_fn, 194 | seed=seed, 195 | device=device, 196 | ) 197 | self._model_class = PNNPlus 198 | self._require_numeric_embedding = True 199 | 200 | __init__.__doc__ = INIT_DOC 201 | 202 | 203 | class PNNPlusRegressor(BaseRegressor): 204 | """ 205 | Scikit-learn style regression model for the PNN-plus-MLP model 206 | 207 | """ 208 | 209 | diagram = PNNPlus.diagram 210 | 211 | def __init__( 212 | self, 213 | embedding_num: Optional[Union[str, EmbeddingBase]] = "auto", 214 | embedding_cat: Optional[Union[str, EmbeddingBase]] = "auto", 215 | embedding_l1_reg: float = 0.0, 216 | embedding_l2_reg: float = 0.0, 217 | pnn_product_type: str = "outer", 218 | pnn_product_size: int = 10, 219 | mlp_hidden_sizes: Union[int, Tuple[int, ...], List[int]] = (512, 256, 128, 64), 220 | mlp_activation: Type[nn.Module] = nn.LeakyReLU, 221 | mlp_use_bn: bool = True, 222 | mlp_bn_momentum: float = 0.1, 223 | mlp_ghost_batch: Optional[int] = None, 224 | mlp_dropout: float = 0.0, 225 | mlp_l1_reg: float = 0.0, 226 | mlp_l2_reg: float = 0.0, 227 | mlp_use_skip: bool = True, 228 | use_leaky_gate: bool = True, 229 | weighted_sum: bool = True, 230 | loss_fn: Union[str, Callable] = "auto", 231 | seed: 
Optional[int] = None, 232 | device: Union[str, torch.device] = "cpu", 233 | ): 234 | super().__init__( 235 | embedding_num=embedding_num, 236 | embedding_cat=embedding_cat, 237 | embedding_l1_reg=embedding_l1_reg, 238 | embedding_l2_reg=embedding_l2_reg, 239 | pnn_product_type=pnn_product_type, 240 | pnn_product_size=pnn_product_size, 241 | mlp_hidden_sizes=mlp_hidden_sizes, 242 | mlp_activation=mlp_activation, 243 | mlp_use_bn=mlp_use_bn, 244 | mlp_bn_momentum=mlp_bn_momentum, 245 | mlp_ghost_batch=mlp_ghost_batch, 246 | mlp_dropout=mlp_dropout, 247 | mlp_l1_reg=mlp_l1_reg, 248 | mlp_l2_reg=mlp_l2_reg, 249 | mlp_use_skip=mlp_use_skip, 250 | use_leaky_gate=use_leaky_gate, 251 | weighted_sum=weighted_sum, 252 | loss_fn=loss_fn, 253 | seed=seed, 254 | device=device, 255 | ) 256 | self._model_class = PNNPlus 257 | self._require_numeric_embedding = True 258 | 259 | __init__.__doc__ = INIT_DOC 260 | -------------------------------------------------------------------------------- /xynn/embedding/common.py: -------------------------------------------------------------------------------- 1 | """ 2 | The base classes and common functions for embeddings 3 | 4 | """ 5 | 6 | from abc import ABCMeta, abstractmethod 7 | from collections import defaultdict 8 | from typing import Any, Union, List, Dict, Iterable, Tuple 9 | 10 | import numpy as np 11 | import pandas as pd 12 | import torch 13 | from torch import Tensor 14 | from torch import nn 15 | from torch.utils.data import DataLoader 16 | 17 | from ..preprocessing import IntegerEncoder 18 | 19 | 20 | def _isnan(value: Any) -> bool: 21 | return isinstance(value, float) and np.isnan(value) 22 | 23 | 24 | def _isnan_index(series: pd.Series) -> np.ndarray: 25 | return np.array([_isnan(value) for value in series.index]) 26 | 27 | 28 | def _linear_agg(num_fields, empty_param, batch): 29 | if num_fields != 0: 30 | return num_fields, empty_param 31 | return batch.shape[1], empty_param 32 | 33 | 34 | def _unique( 35 | X: Union[Tensor, np.ndarray, pd.DataFrame] 36 | ) -> Tuple[List[Iterable], List[bool]]: 37 | if isinstance(X, pd.DataFrame): 38 | uniques = [X[col].unique() for col in X.columns] 39 | elif isinstance(X, np.ndarray): 40 | uniques = [pd.unique(X[:, i]) for i in range(X.shape[1])] 41 | elif isinstance(X, Tensor): 42 | uniques = [torch.unique(X[:, i]).numpy() for i in range(X.shape[1])] 43 | else: 44 | raise TypeError( 45 | "input should be Pandas DataFrame, NumPy array, or PyTorch Tensor" 46 | ) 47 | nan_chk = [np.array([_isnan(value) for value in group]) for group in uniques] 48 | has_nan = [np.any(check) for check in nan_chk] 49 | uniques = [group[~check] for group, check in zip(uniques, nan_chk)] 50 | return uniques, has_nan 51 | 52 | 53 | def _unique_agg(uniques, has_nan, batch): 54 | for row in batch: 55 | for colnum, value in enumerate(row): 56 | value = value.item() 57 | if colnum >= len(uniques): 58 | uniques.append(set()) 59 | has_nan.append(False) 60 | if _isnan(value): 61 | has_nan[colnum] = True 62 | else: 63 | uniques[colnum].add(value) 64 | return uniques, has_nan 65 | 66 | 67 | def _value_counts( 68 | X: Union[Tensor, np.ndarray, pd.DataFrame] 69 | ) -> Tuple[List[Dict[Any, int]], List[int]]: 70 | if isinstance(X, (np.ndarray, pd.DataFrame)): 71 | if isinstance(X, pd.DataFrame): 72 | counts = [ 73 | X[col].value_counts(dropna=False, ascending=True) for col in X.columns 74 | ] 75 | else: 76 | counts = [ 77 | pd.value_counts(X[:, i], dropna=False, ascending=True) 78 | for i in range(X.shape[1]) 79 | ] 80 | nan_check = 
[_isnan_index(count) for count in counts] 81 | nan_counts = [sum(count.loc[isnan]) for count, isnan in zip(counts, nan_check)] 82 | unique_counts = [ 83 | count.loc[~isnan].to_dict() for count, isnan in zip(counts, nan_check) 84 | ] 85 | elif isinstance(X, Tensor): 86 | counts = [ 87 | [values.numpy() for values in torch.unique(X[:, i], return_counts=True)] 88 | for i in range(X.shape[1]) 89 | ] 90 | nan_check = [np.array([_isnan(val) for val in values]) for values, _ in counts] 91 | nan_counts = [np.sum(check) for check in nan_check] 92 | unique_counts = [ 93 | dict(zip(vals[~check], counts[~check])) 94 | for (vals, counts), check in zip(counts, nan_check) 95 | ] 96 | else: 97 | raise TypeError( 98 | "input should be Pandas DataFrame, NumPy array, or PyTorch Tensor" 99 | ) 100 | return unique_counts, nan_counts 101 | 102 | 103 | def _value_counts_agg(unique_counts, nan_counts, batch): 104 | for row in batch: 105 | for colnum, value in enumerate(row): 106 | value = value.item() 107 | if colnum >= len(unique_counts): 108 | unique_counts.append(defaultdict(int)) 109 | nan_counts.append(0) 110 | if _isnan(value): 111 | nan_counts[colnum] += 1 112 | else: 113 | unique_counts[colnum][value] += 1 114 | return unique_counts, nan_counts 115 | 116 | 117 | def _flatten_counts(unique_counts: List[Dict[int, int]]) -> List[int]: 118 | counts = [ 119 | [count.get(i, 0) for i in range(max(count) + 1)] 120 | for count in unique_counts 121 | ] 122 | return counts 123 | 124 | 125 | class EmbeddingBase(nn.Module, metaclass=ABCMeta): 126 | """ 127 | Base class for embeddings 128 | 129 | """ 130 | 131 | def __init__(self): 132 | super().__init__() 133 | self._isfit = False 134 | 135 | @abstractmethod 136 | def _fit_array(self, X): 137 | return 138 | 139 | @abstractmethod 140 | def _fit_iterable(self, X): 141 | return 142 | 143 | def fit(self, X) -> "EmbeddingBase": 144 | """ 145 | Create the embedding from training data 146 | 147 | Parameters 148 | ---------- 149 | X : array-like or iterable of array-like 150 | should be a PyTorch Tensor, NumPy array, Pandas DataFrame 151 | or iterable of arrays/tensors (i.e., batches) 152 | 153 | Return 154 | ------ 155 | self 156 | 157 | """ 158 | if isinstance(X, (np.ndarray, Tensor, pd.DataFrame)): 159 | self._fit_array(X) 160 | elif isinstance(X, DataLoader): 161 | self._fit_iterable(X) 162 | else: 163 | raise TypeError( 164 | "input X must be a PyTorch Tensor, PyTorch DataLoader, " 165 | "NumPy array, or Pandas DataFrame" 166 | ) 167 | 168 | self._isfit = True 169 | 170 | return self 171 | 172 | 173 | class BasicBase(EmbeddingBase): 174 | """Base class for embeddings that do not have defaults""" 175 | 176 | @abstractmethod 177 | def from_summary(self, uniques, has_nan) -> "BasicBase": 178 | return self 179 | 180 | def _fit_array(self, X): 181 | uniques, has_nan = _unique(X) 182 | self.from_summary(uniques, has_nan) 183 | 184 | def _fit_iterable(self, X): 185 | uniques = [] 186 | has_nan = [] 187 | for batch in X: 188 | _unique_agg(uniques, has_nan, batch) 189 | self.from_summary(uniques, has_nan) 190 | 191 | 192 | class DefaultBase(EmbeddingBase): 193 | """Base class for embeddings that have a default embedding for each field""" 194 | 195 | @abstractmethod 196 | def from_summary(self, unique_counts, nan_counts) -> "DefaultBase": 197 | return self 198 | 199 | def _fit_array(self, X): 200 | unique_counts, nan_counts = _value_counts(X) 201 | self.from_summary(unique_counts, nan_counts) 202 | 203 | def _fit_iterable(self, X): 204 | unique_counts = [] 205 | nan_counts = [] 
206 | for batch in X: 207 | _value_counts_agg(unique_counts, nan_counts, batch) 208 | self.from_summary(unique_counts, nan_counts) 209 | 210 | 211 | class FastBasicBase(EmbeddingBase): 212 | """Base class for embeddings that do not have defaults""" 213 | 214 | @abstractmethod 215 | def from_summary(self, num_classes: List[int]) -> "FastBasicBase": 216 | return self 217 | 218 | def from_encoder(self, encoder: IntegerEncoder) -> "FastBasicBase": 219 | """ 220 | Initialize from a fit IntegerEncoder 221 | 222 | Parameters 223 | ---------- 224 | encoder : IntegerEncoder 225 | 226 | Return 227 | ------ 228 | self 229 | 230 | """ 231 | if not isinstance(encoder, IntegerEncoder): 232 | raise TypeError("encoder needs to be a fit IntegerEncoder") 233 | if not encoder._isfit: 234 | raise ValueError("encoder needs to be fit") 235 | return self.from_summary(encoder.num_classes) 236 | 237 | def _fit_array(self, X): 238 | uniques, has_nan = _unique(X) 239 | if any(has_nan): 240 | raise ValueError("NaN found in categorical data") 241 | self.from_summary([max(col_uniques) + 1 for col_uniques in uniques]) 242 | 243 | def _fit_iterable(self, X): 244 | uniques = [] 245 | has_nan = [] 246 | for batch in X: 247 | _unique_agg(uniques, has_nan, batch) 248 | if any(has_nan): 249 | raise ValueError("NaN found in categorical data") 250 | self.from_summary([max(col_uniques) + 1 for col_uniques in uniques]) 251 | 252 | 253 | class FastDefaultBase(EmbeddingBase): 254 | """Base class for embeddings that have a default embedding for each field""" 255 | 256 | @abstractmethod 257 | def from_summary(self, class_counts: List[List[int]]) -> "FastDefaultBase": 258 | return self 259 | 260 | def from_encoder(self, encoder: IntegerEncoder) -> "FastDefaultBase": 261 | """ 262 | Initialize from a fit IntegerEncoder 263 | 264 | Parameters 265 | ---------- 266 | encoder : IntegerEncoder 267 | 268 | Return 269 | ------ 270 | self 271 | 272 | """ 273 | if not isinstance(encoder, IntegerEncoder): 274 | raise TypeError("encoder needs to be a fit IntegerEncoder") 275 | if not encoder._isfit: 276 | raise ValueError("encoder needs to be fit") 277 | return self.from_summary(encoder.class_counts) 278 | 279 | def _fit_array(self, X): 280 | unique_counts, nan_counts = _value_counts(X) 281 | if any(nan_counts): 282 | raise ValueError("NaN found in categorical data") 283 | counts = _flatten_counts(unique_counts) 284 | self.from_summary(counts) 285 | 286 | def _fit_iterable(self, X): 287 | unique_counts = [] 288 | nan_counts = [] 289 | for batch in X: 290 | _value_counts_agg(unique_counts, nan_counts, batch) 291 | if any(nan_counts): 292 | raise ValueError("NaN found in categorical data") 293 | counts = _flatten_counts(unique_counts) 294 | self.from_summary(counts) 295 | -------------------------------------------------------------------------------- /xynn/embedding/uniform/categorical.py: -------------------------------------------------------------------------------- 1 | """ 2 | Classes for embedding categorical fields 3 | 4 | BasicEmbedding 5 | - embed each value with single vector 6 | DefaultEmbedding 7 | - like BasicEmbedding, but include a "default" vector for each field 8 | - returned vector is a weighted combination between the value's own vector 9 | and the field's "default" vector 10 | - the weighting is based on the count of value in the training set; a higher 11 | count puts more weight the value's own vector 12 | - values not seen in the training data are embedded with the default vector 13 | 14 | """ 15 | 16 | from typing import Any, 
Union, List, Dict, Optional, Tuple 17 | 18 | import numpy as np 19 | import torch 20 | from torch import Tensor 21 | from torch import nn 22 | 23 | from ..common import _isnan, BasicBase, DefaultBase 24 | from .base import UniformBase 25 | 26 | 27 | class BasicEmbedding(UniformBase, BasicBase): 28 | """ 29 | A basic embedding that creates an embedded vector for each field value. 30 | 31 | """ 32 | 33 | def __init__(self, embedding_size: int = 10, device: Union[str, torch.device] = "cpu"): 34 | """ 35 | Parameters 36 | ---------- 37 | embedding_size : int, optional 38 | size of each value's embedding vector; default is 10 39 | device : string or torch.device 40 | 41 | """ 42 | super().__init__() 43 | self.num_fields = 0 44 | self.output_size = 0 45 | self.lookup: Dict[Tuple[int, Any], int] = {} 46 | self.lookup_nan: Dict[int, int] = {} 47 | self.num_values = 0 48 | self.embedding: Optional[nn.Embedding] = None 49 | self.embedding_size = embedding_size 50 | self._device = device 51 | self.to(device) 52 | self._isfit = False 53 | 54 | def __repr__(self): 55 | return f"BasicEmbedding({repr(self.embedding_size)}, {repr(self._device)})" 56 | 57 | def from_summary( 58 | self, uniques: List[Union[List, Tensor, np.ndarray]], has_nan: List[bool] 59 | ): 60 | """ 61 | Create the embedding from category values for each field 62 | 63 | Parameters 64 | ---------- 65 | uniques : list of array-like 66 | all possible category values for each field 67 | has_nan : list of boolean 68 | whether each field can have NaN 69 | 70 | Return 71 | ------ 72 | self 73 | 74 | """ 75 | if not len(uniques) == len(has_nan): 76 | raise ValueError( 77 | "length of uniques and has_nan should be equal, " 78 | f"got {len(uniques)}, {len(has_nan)}" 79 | ) 80 | 81 | lookup = {} 82 | lookup_nan = {} 83 | num_values = 0 84 | for fieldnum, (field, use_nan) in enumerate(zip(uniques, has_nan)): 85 | for value in field: 86 | if (fieldnum, value) in lookup: 87 | # extra defense against repeated values 88 | continue 89 | lookup[(fieldnum, value)] = num_values 90 | num_values += 1 91 | if use_nan: 92 | lookup_nan[fieldnum] = num_values 93 | num_values += 1 94 | 95 | self.num_fields = len(uniques) 96 | self.output_size = self.num_fields * self.embedding_size 97 | self.lookup = lookup 98 | self.lookup_nan = lookup_nan 99 | self.num_values = num_values 100 | self.embedding = nn.Embedding(num_values, self.embedding_size).to(device=self._device) 101 | nn.init.xavier_uniform_(self.embedding.weight) 102 | 103 | self._isfit = True 104 | 105 | return self 106 | 107 | def forward(self, X: Tensor) -> Tensor: 108 | """ 109 | Produce embedding for each value in input 110 | 111 | Parameters 112 | ---------- 113 | X : torch.Tensor 114 | 115 | Return 116 | ------ 117 | torch.Tensor 118 | 119 | """ 120 | if not self._isfit: 121 | raise RuntimeError("need to call `fit` or `from_summary` first") 122 | 123 | idxs: List[List[int]] = [] 124 | for row in X: 125 | idxs.append([]) 126 | for col, val in enumerate(row): 127 | val = val.item() 128 | if _isnan(val): 129 | idx = self.lookup_nan[col] 130 | else: 131 | idx = self.lookup[(col, val)] 132 | idxs[-1].append(idx) 133 | 134 | return self.embedding(torch.tensor(idxs, dtype=torch.int64, device=self._device)) 135 | 136 | 137 | class DefaultEmbedding(UniformBase, DefaultBase): 138 | """ 139 | An embedding with a default value for each field. The default is returned for 140 | any field value not seen when the embedding was initialized (using `fit` or 141 | `from_summary`). 
For any value seen at initialization, a weighted average of 142 | that value's embedding and the default embedding is returned. The weights for 143 | the average are determined by the parameter `alpha`: 144 | 145 | weight = count / (count + alpha) 146 | final = embedding * weight + default * (1 - weight) 147 | 148 | """ 149 | 150 | def __init__( 151 | self, 152 | embedding_size: int = 10, 153 | alpha: int = 20, 154 | device: Union[str, torch.device] = "cpu", 155 | ): 156 | """ 157 | Parameters 158 | ---------- 159 | embedding_size : int, optional 160 | size of each value's embedding vector; default is 10 161 | alpha : int, optional 162 | controls the weighting of each embedding vector with the default; 163 | when `alpha`-many values are seen at initialization; the final 164 | vector is evenly weighted; the influence of the default is decreased 165 | with either higher counts or lower `alpha`; default is 20 166 | device : string or torch.device 167 | 168 | """ 169 | super().__init__() 170 | self.num_fields = 0 171 | self.output_size = 0 172 | self.alpha = alpha 173 | self.lookup: Dict[Tuple[int, Any], Tuple[int, int]] = {} 174 | self.lookup_nan: Dict[int, Tuple[int, int]] = {} 175 | self.lookup_default: Dict[int, Tuple[int, int]] = {} 176 | self.num_values = 0 177 | self.embedding: Optional[nn.Embedding] = None 178 | self.embedding_size = embedding_size 179 | self._device = device 180 | self.to(device) 181 | self._isfit = False 182 | 183 | def __repr__(self): 184 | embed_size = self.embedding_size 185 | alpha = self.alpha 186 | device = repr(self._device) 187 | return f"DefaultEmbedding({embed_size}, {alpha}, {device})" 188 | 189 | def from_summary(self, unique_counts: List[Dict[Any, int]], nan_counts: List[int]): 190 | """ 191 | Create the embedding from known value counts for each field 192 | 193 | Parameters 194 | ---------- 195 | unique_counts : list of dicts 196 | each dict is a mapping from Python object to count of occurrences, 197 | one dict for each field 198 | nan_counts : list of int 199 | count of NaN occurrences for each field 200 | 201 | Return 202 | ------ 203 | self 204 | 205 | """ 206 | if not len(unique_counts) == len(nan_counts): 207 | raise ValueError( 208 | "length of unique_counts and nan_counts should be equal, " 209 | f"got {len(unique_counts)}, {len(nan_counts)}" 210 | ) 211 | 212 | lookup = {} 213 | lookup_nan = {} 214 | lookup_default = {} 215 | num_values = 0 216 | for fieldnum, (counts, nan_count) in enumerate(zip(unique_counts, nan_counts)): 217 | lookup_default[fieldnum] = (num_values, 0) 218 | num_values += 1 219 | for value, count in counts.items(): 220 | lookup[(fieldnum, value)] = (num_values, count) 221 | num_values += 1 222 | if nan_count: 223 | lookup_nan[fieldnum] = (num_values, nan_count) 224 | num_values += 1 225 | 226 | self.num_fields = len(unique_counts) 227 | self.output_size = self.num_fields * self.embedding_size 228 | self.lookup = lookup 229 | self.lookup_nan = lookup_nan 230 | self.lookup_default = lookup_default 231 | self.num_values = num_values 232 | self.embedding = nn.Embedding(num_values, self.embedding_size).to(device=self._device) 233 | nn.init.xavier_uniform_(self.embedding.weight) 234 | 235 | self._isfit = True 236 | 237 | return self 238 | 239 | def forward(self, X: Tensor) -> Tensor: 240 | """ 241 | Produce embedding for each value in input 242 | 243 | Parameters 244 | ---------- 245 | X : torch.Tensor 246 | 247 | Return 248 | ------ 249 | torch.Tensor 250 | 251 | """ 252 | if not self._isfit: 253 | raise RuntimeError("need to 
call `fit` or `from_summary` first") 254 | 255 | list_weights: List[List[List[float]]] = [] 256 | idxs_primary: List[List[int]] = [] 257 | idxs_default: List[List[int]] = [] 258 | for row in X: 259 | list_weights.append([]) 260 | idxs_primary.append([]) 261 | idxs_default.append([]) 262 | for col, val in enumerate(row): 263 | val = val.item() 264 | default = self.lookup_default[col] 265 | if _isnan(val): 266 | idx, count = self.lookup_nan.get(col, default) 267 | else: 268 | idx, count = self.lookup.get((col, val), default) 269 | list_weights[-1].append([count / (count + self.alpha)]) 270 | idxs_primary[-1].append(idx) 271 | idxs_default[-1].append(default[0]) 272 | tsr_weights = torch.tensor(list_weights, dtype=torch.float32, device=self._device) 273 | emb_primary = self.embedding( 274 | torch.tensor(idxs_primary, dtype=torch.int64, device=self._device) 275 | ) 276 | emb_default = self.embedding( 277 | torch.tensor(idxs_default, dtype=torch.int64, device=self._device) 278 | ) 279 | return tsr_weights * emb_primary + (1 - tsr_weights) * emb_default 280 | -------------------------------------------------------------------------------- /xynn/embedding/ragged/fast_ragged.py: -------------------------------------------------------------------------------- 1 | """ 2 | Embeddings that allow embedding of multiple fields together and 3 | allow a different vector size for each field 4 | 5 | FastRaggedEmbedding 6 | - requires that each field's values are integers 0, 1, ... 7 | - embeds each value with a single vector 8 | - allows a different vector size for each field 9 | FastRaggedDefaultEmbedding 10 | - requires that each field's values are integers 0, 1, ... 11 | - like RaggedEmbedding, but includes a "default" vector for each field 12 | - returned vector is a weighted combination between the value's own vector 13 | and the field's "default" vector 14 | - the weighting is based on the count of the value in the training set; a higher 15 | count puts more weight on the value's own vector 16 | - values not seen in the training data are embedded with the default vector 17 | - allows a different vector size for each field 18 | 19 | """ 20 | 21 | from typing import Union, List, Optional, Iterable 22 | 23 | import torch 24 | from torch import Tensor 25 | from torch import nn 26 | 27 | from ..common import FastBasicBase, FastDefaultBase 28 | from .common import RaggedBase, _check_embedding_size, _parse_embedding_size 29 | 30 | 31 | class FastRaggedEmbedding(RaggedBase, FastBasicBase): 32 | """ 33 | Creates an embedded vector for each field value, with each field allowed 34 | a different size of embedding 35 | 36 | """ 37 | 38 | def __init__( 39 | self, 40 | embedding_size: Union[str, int, Iterable[int]] = "sqrt", 41 | max_size: int = 100, 42 | device: Union[str, torch.device] = "cpu", 43 | ): 44 | """ 45 | Parameters 46 | ---------- 47 | embedding_size : {"sqrt", "log", "fastai"} or iterable of int; optional 48 | - "sqrt": square root of number of classes in each field, rounded up 49 | - "log": log of number of classes in each field, rounded up 50 | - "fastai": `round(1.6 * num_classes**0.56)` 51 | if iterable of int, the number of values must match the number of 52 | fields when calling `fit`; the embedding size can also be 53 | passed in later with `fit` or `from_summary`; default is "sqrt" 54 | max_size : int, optional 55 | maximum embedding size if using "sqrt", "log", or "fastai"; 56 | default is 100 57 | device : string or torch.device, optional 58 | 59 | """ 60 | super().__init__() 61 | embedding_size =
_check_embedding_size(embedding_size) 62 | self.num_fields = 0 63 | self.output_size = 0 64 | self.num_classes: List[int] = [] 65 | self.embedding: Optional[nn.ModuleList] = None 66 | self.embedding_size_orig = embedding_size 67 | self.embedding_size = embedding_size 68 | self.max_size = max_size 69 | self._device = device 70 | self.to(device) 71 | self._isfit = False 72 | 73 | def __repr__(self): 74 | embed_size = repr(self.embedding_size_orig) 75 | max_size = self.max_size 76 | device = repr(self._device) 77 | return f"FastRaggedEmbedding({embed_size}, {max_size}, {device})" 78 | 79 | def from_summary(self, num_classes: List[int]) -> "FastRaggedEmbedding": 80 | """ 81 | Create the embedding from category values for each field 82 | 83 | Parameters 84 | ---------- 85 | num_classes : list of int 86 | number of category values for each field 87 | 88 | Return 89 | ------ 90 | self 91 | 92 | """ 93 | embedding_size = _parse_embedding_size( 94 | self.embedding_size, self.max_size, num_classes 95 | ) 96 | 97 | self.embedding = nn.ModuleList([]) 98 | for num_cats, size in zip(num_classes, embedding_size): 99 | embedding = nn.Embedding(num_cats, size).to(device=self._device) 100 | nn.init.xavier_uniform_(embedding.weight) 101 | self.embedding.append(embedding) 102 | 103 | self.num_fields = len(num_classes) 104 | self.output_size = sum(embedding_size) 105 | self.num_classes = num_classes 106 | self.embedding_size = embedding_size 107 | 108 | self._isfit = True 109 | 110 | return self 111 | 112 | def forward(self, X: Tensor) -> Tensor: 113 | """ 114 | Produce embedding for each value in input 115 | 116 | Parameters 117 | ---------- 118 | X : torch.Tensor 119 | 120 | Return 121 | ------ 122 | torch.Tensor 123 | 124 | """ 125 | if not self._isfit: 126 | raise RuntimeError("need to call `fit` or `from_summary` first") 127 | 128 | embedded = [ 129 | embedding(column).reshape((X.shape[0], -1)) 130 | for embedding, column in zip(self.embedding, X.split(1, dim=1)) 131 | ] 132 | 133 | return torch.cat(embedded, dim=1) 134 | 135 | 136 | class FastRaggedDefaultEmbedding(RaggedBase, FastDefaultBase): 137 | """ 138 | An embedding with a default value for each field and which allows a different 139 | embedding size for each field. The default is returned for any field value 140 | not seen when the embedding was initialized (using `fit` or `from_summary`). 141 | For any value seen at initialization, a weighted average of that value's 142 | embedding and the default embedding is returned. 
The weights for the average 143 | are determined by the parameter `alpha`: 144 | 145 | weight = count / (count + alpha) 146 | final = embedding * weight + default * (1 - weight) 147 | 148 | """ 149 | 150 | def __init__( 151 | self, 152 | embedding_size: Union[str, Iterable[int]] = "sqrt", 153 | max_size: int = 100, 154 | alpha: int = 20, 155 | device: Union[str, torch.device] = "cpu", 156 | ): 157 | """ 158 | Parameters 159 | ---------- 160 | embedding_size : {"sqrt", "log", "fastai"} or iterable of int; optional 161 | - "sqrt": square root of number of classes in each field, rounded up 162 | - "log": log of number of classes in each field, rounded up 163 | - "fastai": `round(1.6 * num_classes**0.56))` 164 | if iterable of int, the number of values must match the number of 165 | fields when calling `fit`; the embedding size can also be 166 | passed in later with `fit` or `from_summary`; default is "sqrt" 167 | max_size : int, optional 168 | maximum embedding size if using "sqrt", "log", or "fastai"; 169 | default is 100 170 | alpha : int, optional 171 | controls the weighting of each embedding vector with the default; 172 | when `alpha`-many values are seen at initialization; the final 173 | vector is evenly weighted; the influence of the default is decreased 174 | with either higher counts or lower `alpha`; default is 20 175 | device : string or torch.device 176 | 177 | """ 178 | super().__init__() 179 | embedding_size = _check_embedding_size(embedding_size) 180 | self.num_fields = 0 181 | self.output_size = 0 182 | self.alpha = alpha 183 | self.embedding: Optional[nn.ModuleList] = None 184 | self.embedding_size_orig = embedding_size 185 | self.embedding_size = embedding_size 186 | self.max_size = max_size 187 | self.num_classes: List[int] = [] 188 | self.max_values: Optional[Tensor] = None 189 | self.counts: List[Tensor] = [] 190 | self._device = device 191 | self.to(device) 192 | self._isfit = False 193 | 194 | def __repr__(self): 195 | embed_size = repr(self.embedding_size_orig) 196 | max_size = self.max_size 197 | alpha = self.alpha 198 | device = repr(self._device) 199 | return f"FastRaggedDefaultEmbedding({embed_size}, {max_size}, {alpha}, {device})" 200 | 201 | def from_summary(self, class_counts: List[List[int]]) -> "FastRaggedDefaultEmbedding": 202 | """ 203 | Create the embedding from known value counts for each field 204 | 205 | Parameters 206 | ---------- 207 | class_counts : list of list of int 208 | each sub-list has count of category occurrences, 209 | one sub-list for each field 210 | 211 | Return 212 | ------ 213 | self 214 | 215 | """ 216 | num_classes = [len(counts) for counts in class_counts] 217 | 218 | embedding_size = _parse_embedding_size( 219 | self.embedding_size, self.max_size, num_classes 220 | ) 221 | 222 | self.embedding = nn.ModuleList([]) 223 | for num_cls, size in zip(num_classes, embedding_size): 224 | embedding = nn.Embedding(num_cls + 1, size).to(device=self._device) 225 | nn.init.xavier_uniform_(embedding.weight) 226 | self.embedding.append(embedding) 227 | 228 | self.num_fields = len(class_counts) 229 | self.output_size = sum(embedding_size) 230 | self.num_classes = num_classes 231 | self.embedding_size = embedding_size 232 | self.max_values = torch.tensor([[x - 1 for x in num_classes]], device=self._device) 233 | self.counts = [ 234 | torch.tensor(counts, device=self._device) for counts in class_counts 235 | ] 236 | 237 | self._isfit = True 238 | 239 | return self 240 | 241 | def forward(self, X: Tensor) -> Tensor: 242 | """ 243 | Produce embedding 
for each value in input 244 | 245 | Parameters 246 | ---------- 247 | X : torch.Tensor 248 | 249 | Return 250 | ------ 251 | torch.Tensor 252 | 253 | """ 254 | if not self._isfit: 255 | raise RuntimeError("need to call `fit` or `from_summary` first") 256 | 257 | embedded = [] 258 | for embedding, counts, num_cls, X_col in zip( 259 | self.embedding, self.counts, self.num_classes, X.split(1, dim=1) 260 | ): 261 | unxpctd = (X_col >= num_cls) 262 | idx = torch.clone(X_col) 263 | idx[unxpctd] = 0 # block any unexpected categories 264 | 265 | counts = counts.expand(idx.shape[0], num_cls) 266 | counts = torch.gather(counts, dim=1, index=idx) 267 | weights = (counts / (counts + self.alpha)).unsqueeze(-1) 268 | weights[unxpctd] = 0 # block any unexpected categories 269 | 270 | primary = embedding(X_col) 271 | default = embedding.weight[num_cls:, :].unsqueeze(0) 272 | output = (weights * primary + (1 - weights) * default).reshape(X_col.shape[0], -1) 273 | embedded.append(output) 274 | 275 | return torch.cat(embedded, dim=1) 276 | -------------------------------------------------------------------------------- /tests/test_xdeepfm/test_modules.py: -------------------------------------------------------------------------------- 1 | 2 | import random 3 | 4 | import torch 5 | from torch import nn 6 | import numpy as np 7 | import pytest 8 | 9 | from xynn.base_classes.estimators import _set_seed 10 | from xynn.base_classes.modules import BaseNN 11 | from xynn.xdeepfm.modules import CIN 12 | from xynn.xdeepfm import XDeepFM 13 | from xynn.embedding import LinearEmbedding, BasicEmbedding 14 | from xynn.mlp import LeakyGate, GhostBatchNorm 15 | 16 | from ..common import simple_train_inputs, simple_model_train_loop 17 | 18 | 19 | def test_that_cin_raises_error_for_bad_layer_sizes_when_not_full_agg(): 20 | with pytest.raises( 21 | ValueError, 22 | match="when using full_agg=False, all but the last layer size must be even" 23 | ): 24 | CIN(num_fields=5, layer_sizes=(127, 127), full_agg=False) 25 | 26 | 27 | def test_cin_layers_without_activation_and_bn(): 28 | cin = CIN(num_fields=5, use_bn=False) 29 | assert len(cin.convs) == 2 30 | assert len(cin.actns) == 2 31 | assert len(cin.norms) == 2 32 | assert all(isinstance(conv, nn.Conv1d) for conv in cin.convs) 33 | assert all(isinstance(actn, nn.Identity) for actn in cin.actns) 34 | assert all(isinstance(norm, nn.Identity) for norm in cin.norms) 35 | 36 | 37 | def test_cin_layers_with_activation_and_bn(): 38 | cin = CIN(num_fields=5, activation=nn.ReLU, use_bn=True) 39 | assert len(cin.convs) == 2 40 | assert len(cin.actns) == 2 41 | assert len(cin.norms) == 2 42 | assert all(isinstance(conv, nn.Conv1d) for conv in cin.convs) 43 | assert all(isinstance(actn, nn.ReLU) for actn in cin.actns) 44 | assert all(isinstance(norm, nn.BatchNorm1d) for norm in cin.norms) 45 | 46 | 47 | def test_cin_shape_of_output(): 48 | x = torch.rand((20, 5, 8)) 49 | 50 | cin = CIN(num_fields=5, layer_sizes=(10,)) 51 | out = cin(x) 52 | assert out.shape == (20, 10) 53 | 54 | cin = CIN(num_fields=5, layer_sizes=(10, 10)) 55 | out = cin(x) 56 | assert out.shape == (20, 15) 57 | 58 | cin = CIN(num_fields=5, layer_sizes=(10, 10), full_agg=True) 59 | out = cin(x) 60 | assert out.shape == (20, 20) 61 | 62 | 63 | def test_that_xdeepfm_module_subclasses_basenn(): 64 | assert issubclass(XDeepFM, BaseNN) 65 | 66 | 67 | def test_that_xdeepfm_uses_basenn_init(): 68 | X = torch.randint(0, 10, (100, 10)) 69 | embedding_num = LinearEmbedding(embedding_size=3).fit(X) 70 | 71 | model = XDeepFM( 72 | 
task="classification", 73 | output_size=3, 74 | embedding_num=embedding_num, 75 | embedding_cat=None, 76 | embedding_l2_reg=0.2, 77 | mlp_l1_reg=0.1 78 | ) 79 | 80 | assert model.task == "classification" 81 | assert model.num_epochs == 0 82 | assert isinstance(model.loss_fn, nn.CrossEntropyLoss) 83 | assert model.embedding_num is embedding_num 84 | assert model.embedding_cat is None 85 | assert model.embedding_l1_reg == 0.0 86 | assert model.embedding_l2_reg == 0.2 87 | assert model.mlp_l1_reg == 0.1 88 | assert model.mlp_l2_reg == 0.0 89 | assert model.optimizer is None 90 | assert model.optimizer_info == {} 91 | assert model.scheduler == {} 92 | assert model._device == "cpu" 93 | 94 | 95 | def test_that_xdeepfm_parameters_are_passed_to_submodules(): 96 | X = torch.randint(0, 10, (100, 10)) 97 | embedding_cat = BasicEmbedding(embedding_size=3).fit(X) 98 | model = XDeepFM( 99 | task="classification", 100 | output_size=3, 101 | embedding_num=None, 102 | embedding_cat=embedding_cat, 103 | mlp_activation=nn.ReLU, 104 | mlp_hidden_sizes=(512, 128, 32), 105 | mlp_use_bn=False, 106 | mlp_use_skip=False, 107 | use_leaky_gate=False, 108 | ) 109 | 110 | expected_classes = [ 111 | nn.Linear, 112 | nn.ReLU, 113 | nn.Linear, 114 | nn.ReLU, 115 | nn.Linear, 116 | nn.ReLU, 117 | nn.Linear, 118 | ] 119 | for mlp in (model.mlp, model.cin_final): 120 | assert len(mlp.main_layers) == len(expected_classes) 121 | for layer, expected_class in zip(mlp.main_layers, expected_classes): 122 | assert isinstance(layer, expected_class) 123 | assert mlp.skip_layers is None 124 | 125 | assert model.use_residual 126 | assert isinstance(model.cin, CIN) and len(model.cin.convs) == 2 127 | assert model.mix.requires_grad 128 | 129 | 130 | def test_that_xdeepfm_parameters_are_passed_to_submodules_other_params(): 131 | X = torch.randint(0, 10, (100, 10)) 132 | embedding_cat = BasicEmbedding(embedding_size=3).fit(X) 133 | model = XDeepFM( 134 | task="classification", 135 | output_size=3, 136 | embedding_num=None, 137 | embedding_cat=embedding_cat, 138 | cin_layer_sizes=(20, 20, 20), 139 | cin_use_residual=False, 140 | mlp_use_skip=True, 141 | ) 142 | 143 | expected_classes = [ 144 | LeakyGate, 145 | nn.Linear, 146 | nn.BatchNorm1d, 147 | nn.LeakyReLU, 148 | nn.Linear, 149 | nn.BatchNorm1d, 150 | nn.LeakyReLU, 151 | nn.Linear, 152 | nn.BatchNorm1d, 153 | nn.LeakyReLU, 154 | nn.Linear, 155 | nn.BatchNorm1d, 156 | nn.LeakyReLU, 157 | nn.Linear, 158 | ] 159 | for mlp in (model.mlp, model.cin_final): 160 | assert len(mlp.main_layers) == len(expected_classes) 161 | for layer, expected_class in zip(mlp.main_layers, expected_classes): 162 | assert isinstance(layer, expected_class) 163 | assert mlp.skip_layers is not None 164 | 165 | assert not model.use_residual 166 | assert isinstance(model.cin, CIN) and len(model.cin.convs) == 3 167 | assert model.mix.requires_grad 168 | 169 | 170 | def test_xdeepfm_parameters_with_ghost_batch(): 171 | X = torch.randint(0, 10, (100, 10)) 172 | embedding_cat = BasicEmbedding(embedding_size=3).fit(X) 173 | model = XDeepFM( 174 | task="classification", 175 | output_size=3, 176 | embedding_num=None, 177 | embedding_cat=embedding_cat, 178 | cin_layer_sizes=(20, 20, 20), 179 | cin_use_residual=False, 180 | mlp_hidden_sizes=(128, 128), 181 | mlp_ghost_batch=12, 182 | mlp_use_skip=False, 183 | ) 184 | 185 | expected_classes = [ 186 | LeakyGate, 187 | nn.Linear, 188 | GhostBatchNorm, 189 | nn.LeakyReLU, 190 | nn.Linear, 191 | GhostBatchNorm, 192 | nn.LeakyReLU, 193 | nn.Linear, 194 | ] 195 | for mlp in 
(model.mlp, model.cin_final): 196 | assert len(mlp.main_layers) == len(expected_classes) 197 | for layer, expected_class in zip(mlp.main_layers, expected_classes): 198 | assert isinstance(layer, expected_class) 199 | assert mlp.skip_layers is None 200 | 201 | assert not model.use_residual 202 | assert isinstance(model.cin, CIN) and len(model.cin.convs) == 3 203 | assert model.mix.requires_grad 204 | 205 | 206 | def test_that_xdeepfm_diagram_exists_and_prints_something(capsys): 207 | XDeepFM.diagram() 208 | captured = capsys.readouterr() 209 | assert len(captured.out.split("\n")) > 5 210 | 211 | 212 | def test_xdeepfm_mlp_weight(): 213 | X = torch.randint(0, 10, (100, 10)) 214 | embedding_num = LinearEmbedding(embedding_size=3).fit(X) 215 | 216 | # without Linear after CIN 217 | model = XDeepFM( 218 | task="regression", 219 | output_size=1, 220 | embedding_num=embedding_num, 221 | embedding_cat=None, 222 | cin_use_mlp=False, 223 | mlp_use_bn=False, 224 | mlp_use_skip=False, 225 | use_leaky_gate=False, 226 | ) 227 | 228 | exp_w1 = 0 229 | exp_w2 = 0 230 | for mlp in (model.mlp, model.cin_final): 231 | exp_w1 += sum(l.weight.abs().sum().item() for l in mlp.main_layers[::2]) 232 | exp_w2 += sum((l.weight ** 2).sum().item() for l in mlp.main_layers[::2]) 233 | 234 | w1, w2 = model.mlp_weight_sum() 235 | assert np.isclose(w1.item(), exp_w1) 236 | assert np.isclose(w2.item(), exp_w2) 237 | 238 | # with MLP after CIN 239 | model = XDeepFM( 240 | task="regression", 241 | output_size=1, 242 | embedding_num=embedding_num, 243 | embedding_cat=None, 244 | cin_use_mlp=True, 245 | mlp_use_bn=False, 246 | mlp_use_skip=False, 247 | use_leaky_gate=False, 248 | ) 249 | 250 | exp_w1 = 0 251 | exp_w2 = 0 252 | for mlp in (model.mlp, model.cin_final): 253 | exp_w1 += sum(l.weight.abs().sum().item() for l in mlp.main_layers[::2]) 254 | exp_w2 += sum((l.weight ** 2).sum().item() for l in mlp.main_layers[::2]) 255 | 256 | w1, w2 = model.mlp_weight_sum() 257 | assert np.isclose(w1.item(), exp_w1) 258 | assert np.isclose(w2.item(), exp_w2) 259 | 260 | 261 | def test_that_xdeepfm_learns(): 262 | _set_seed(10101) 263 | 264 | X_num = torch.randint(0, 10, (100, 10)) 265 | X_cat = torch.randint(0, 5, (100, 1)) 266 | y = ( 267 | (X_cat - 2) + X_num[:, ::2].sum(dim=1) - X_num[:, 1::2].sum(dim=1) 268 | ).to(dtype=torch.float) 269 | 270 | model = XDeepFM( 271 | task="regression", 272 | output_size=1, 273 | embedding_num=LinearEmbedding(embedding_size=3).fit(X_num), 274 | embedding_cat=BasicEmbedding(embedding_size=3).fit(X_cat), 275 | mlp_hidden_sizes=[10, 8, 6], 276 | mlp_use_bn=False, 277 | mlp_use_skip=False, 278 | use_leaky_gate=False, 279 | ) 280 | 281 | loss_func = nn.MSELoss() 282 | optimizer = torch.optim.Adam(model.parameters(), lr=1e-1) 283 | loss_vals = simple_model_train_loop( 284 | model, X_num, X_cat, y, loss_func, optimizer, num_epochs=5 285 | ) 286 | 287 | assert loss_vals[0] > loss_vals[-1] 288 | 289 | 290 | def test_that_xdeepfm_learns_with_other_params(): 291 | _set_seed(10101) 292 | 293 | X_num = torch.randint(0, 10, (100, 10)) 294 | X_cat = torch.randint(0, 5, (100, 1)) 295 | y = ( 296 | (X_cat - 2) + X_num[:, ::2].sum(dim=1) - X_num[:, 1::2].sum(dim=1) 297 | ).to(dtype=torch.float) 298 | 299 | model = XDeepFM( 300 | task="regression", 301 | output_size=1, 302 | embedding_num=LinearEmbedding(embedding_size=3).fit(X_num), 303 | embedding_cat=BasicEmbedding(embedding_size=3).fit(X_cat), 304 | mlp_hidden_sizes=[], 305 | mlp_use_bn=False, 306 | mlp_use_skip=False, 307 | use_leaky_gate=False, 308 | ) 309 | 
310 | loss_func = nn.MSELoss() 311 | optimizer = torch.optim.Adam(model.parameters(), lr=1e-1) 312 | loss_vals = simple_model_train_loop( 313 | model, X_num, X_cat, y, loss_func, optimizer, num_epochs=5 314 | ) 315 | 316 | assert loss_vals[0] > loss_vals[-1] 317 | --------------------------------------------------------------------------------
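
A short usage sketch of the embedding classes dumped above, pieced together from the signatures and docstrings in xynn/embedding/uniform/categorical.py and xynn/embedding/ragged/fast_ragged.py. The field layouts, counts, and expected shapes are illustrative assumptions, and the classes are imported by their file paths (the tests above import BasicEmbedding from xynn.embedding, so top-level imports may also work); the weighting arithmetic simply restates the documented formula weight = count / (count + alpha).

import torch

# Imports follow the file locations shown above; the tests also import
# BasicEmbedding directly from xynn.embedding.
from xynn.embedding.uniform.categorical import BasicEmbedding, DefaultEmbedding
from xynn.embedding.ragged.fast_ragged import FastRaggedEmbedding

# BasicEmbedding: one vector per (field, value) pair.  The summary is the list
# of unique values per field plus whether each field can contain NaN.
basic = BasicEmbedding(embedding_size=4)
basic.from_summary(uniques=[[0, 1, 2], [0, 1]], has_nan=[False, False])
X = torch.tensor([[0, 1], [2, 0]])
print(basic(X).shape)  # (rows, fields, embedding_size) -> torch.Size([2, 2, 4])

# DefaultEmbedding: each field also gets a "default" vector, and the output
# blends the value's own vector with the default based on training counts.
# With alpha=20: a value seen 20 times gets weight 20 / (20 + 20) = 0.5, a
# value seen 180 times gets 180 / 200 = 0.9, and an unseen value gets weight 0,
# i.e. the pure default vector.
default_emb = DefaultEmbedding(embedding_size=4, alpha=20)
default_emb.from_summary(unique_counts=[{0: 180, 1: 20}], nan_counts=[0])
X = torch.tensor([[0], [1], [7]])  # 7 was never seen, so it falls back to the default
print(default_emb(X).shape)        # torch.Size([3, 1, 4])

# FastRaggedEmbedding: expects integer-encoded input (0, 1, ...) and gives each
# field its own embedding width.  With embedding_size="sqrt" and max_size=8,
# fields with 12, 3, and 500 classes should get widths of about ceil(sqrt(12)) = 4,
# ceil(sqrt(3)) = 2, and 8 (capped at max_size), per the docstring.
ragged = FastRaggedEmbedding(embedding_size="sqrt", max_size=8)
ragged.from_summary(num_classes=[12, 3, 500])
X = torch.tensor([[0, 2, 499], [11, 0, 3]])
print(ragged(X).shape)  # (rows, ragged.output_size): one embedded slice per field, concatenated

The other embeddings in the dump appear to follow the same pattern: construct the class, initialize it once with fit, from_summary, or from_encoder, then call it like any other torch module, for example when passing it as embedding_num or embedding_cat to the estimators.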