├── tests ├── __init__.py ├── test_pnn │ ├── __init__.py │ └── test_estimators.py ├── test_autoint │ ├── __init__.py │ ├── test_estimators.py │ └── test_modules.py ├── test_embedding │ ├── __init__.py │ ├── test_ragged │ │ ├── __init__.py │ │ └── test_common.py │ ├── test_uniform │ │ ├── __init__.py │ │ └── test_numeric.py │ ├── utils.py │ └── test_common.py ├── test_fibinet │ ├── __init__.py │ └── test_estimators.py ├── test_mlpnet │ ├── __init__.py │ ├── test_estimators.py │ └── test_modules.py ├── test_xdeepfm │ ├── __init__.py │ ├── test_estimators.py │ └── test_modules.py ├── test_base_classes │ ├── __init__.py │ └── test_modules.py ├── conftest.py ├── test_ghost_norm.py ├── common.py └── test_dataset.py ├── xynn ├── __init__.py ├── base_classes │ └── __init__.py ├── mlpnet │ ├── __init__.py │ ├── modules.py │ └── estimators.py ├── autoint │ ├── __init__.py │ └── estimators.py ├── fibinet │ ├── __init__.py │ └── estimators.py ├── xdeepfm │ ├── __init__.py │ └── estimators.py ├── pnn │ ├── __init__.py │ └── estimators.py ├── embedding │ ├── ragged │ │ ├── __init__.py │ │ ├── common.py │ │ └── fast_ragged.py │ ├── uniform │ │ ├── __init__.py │ │ ├── base.py │ │ ├── numeric.py │ │ ├── fast_categorical.py │ │ └── categorical.py │ ├── __init__.py │ ├── utils.py │ └── common.py ├── ghost_norm.py ├── dataset.py ├── preprocessing.py └── mlp.py ├── requirements ├── test.txt └── examples.txt ├── requirements.txt ├── pyproject.toml ├── .gitignore ├── README.md └── setup.py /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /xynn/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_pnn/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements/test.txt: -------------------------------------------------------------------------------- 1 | pytest>=6.0 -------------------------------------------------------------------------------- /tests/test_autoint/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_embedding/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_fibinet/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_mlpnet/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_xdeepfm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /xynn/base_classes/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements/examples.txt: 
-------------------------------------------------------------------------------- 1 | pandas>=1.2.3 -------------------------------------------------------------------------------- /tests/test_base_classes/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_embedding/test_ragged/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_embedding/test_uniform/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.20.2 2 | scikit-learn>=0.24.1 3 | torch>=1.8.1 4 | tqdm>=4.59.0 -------------------------------------------------------------------------------- /xynn/mlpnet/__init__.py: -------------------------------------------------------------------------------- 1 | from .modules import MLPNet 2 | from .estimators import MLPClassifier, MLPRegressor 3 | -------------------------------------------------------------------------------- /xynn/autoint/__init__.py: -------------------------------------------------------------------------------- 1 | from .modules import AutoInt 2 | from .estimators import AutoIntClassifier, AutoIntRegressor 3 | -------------------------------------------------------------------------------- /xynn/fibinet/__init__.py: -------------------------------------------------------------------------------- 1 | from .modules import FiBiNet 2 | from .estimators import FiBiNetRegressor, FiBiNetClassifier 3 | -------------------------------------------------------------------------------- /xynn/xdeepfm/__init__.py: -------------------------------------------------------------------------------- 1 | from .modules import XDeepFM 2 | from .estimators import XDeepFMClassifier, XDeepFMRegressor 3 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools", 4 | "wheel", 5 | ] 6 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | *.pyc 3 | .coverage 4 | __pycache__/ 5 | build/ 6 | lightning_logs/ 7 | dist/ 8 | shim.egg-info/ 9 | .vscode/ 10 | -------------------------------------------------------------------------------- /xynn/pnn/__init__.py: -------------------------------------------------------------------------------- 1 | from .modules import PNN, PNNPlus 2 | from .estimators import PNNClassifier, PNNPlusClassifier 3 | from .estimators import PNNRegressor, PNNPlusRegressor 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # XyNN: Experimental code for tabular neural networks 2 | 3 | This repo implements models from the paper [Simple Modifications to Improve Tabular Neural Networks](https://arxiv.org/abs/2108.03214). 
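An illustrative quick-start, adapted from the estimator tests in this repo (the synthetic data and hyperparameters below are placeholders, not a recommended configuration):

```python
import torch
from xynn.mlpnet import MLPRegressor

# synthetic example: 100 rows, 10 numeric fields, no categorical fields
X_num = torch.rand((100, 10)) - 0.5
y = X_num[:, 0] - X_num[:, 1] + 2 * X_num[:, 6]

model = MLPRegressor(mlp_hidden_sizes=[10, 8, 8, 6])
model.fit(
    X_num=X_num,
    X_cat=None,            # categorical input is optional
    y=y,
    optimizer=torch.optim.Adam,
    opt_kwargs={"lr": 1e-1},
    num_epochs=5,
)
```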
4 | -------------------------------------------------------------------------------- /xynn/embedding/ragged/__init__.py: -------------------------------------------------------------------------------- 1 | from .common import RaggedBase 2 | from .ragged import RaggedEmbedding, RaggedDefaultEmbedding 3 | from .fast_ragged import FastRaggedEmbedding, FastRaggedDefaultEmbedding 4 | -------------------------------------------------------------------------------- /xynn/embedding/uniform/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import UniformBase 2 | from .numeric import LinearEmbedding, DenseEmbedding 3 | from .categorical import BasicEmbedding, DefaultEmbedding 4 | from .fast_categorical import FastBasicEmbedding, FastDefaultEmbedding 5 | -------------------------------------------------------------------------------- /xynn/embedding/__init__.py: -------------------------------------------------------------------------------- 1 | from .common import EmbeddingBase 2 | from .uniform import LinearEmbedding, DenseEmbedding 3 | from .uniform import BasicEmbedding, DefaultEmbedding 4 | from .uniform import FastBasicEmbedding, FastDefaultEmbedding 5 | from .ragged import RaggedEmbedding, RaggedDefaultEmbedding 6 | from .ragged import FastRaggedEmbedding, FastRaggedDefaultEmbedding 7 | from .utils import fit_embeddings, check_embeddings, check_uniform_embeddings 8 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | def pytest_addoption(parser): 2 | parser.addoption("--device", default="cpu") 3 | 4 | 5 | def pytest_generate_tests(metafunc): 6 | # This is called for every test. Only get/set command line arguments 7 | # if the argument is specified in the list of test "fixturenames". 
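    # For example, running `pytest --device cuda` makes any test that declares a
    # "device" fixture run with "cuda"; without the flag the default is "cpu".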
8 | option_value = metafunc.config.option.device 9 | if "device" in metafunc.fixturenames and option_value is not None: 10 | metafunc.parametrize("device", [option_value]) 11 | -------------------------------------------------------------------------------- /xynn/embedding/uniform/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base class for uniform embeddings, with fixed vector size across fields 3 | 4 | """ 5 | 6 | from typing import Tuple 7 | 8 | from torch import Tensor 9 | 10 | from ..common import EmbeddingBase 11 | 12 | 13 | class UniformBase(EmbeddingBase): 14 | """Base class for embeddings that have a single vector size for all fields""" 15 | 16 | def weight_sum(self) -> Tuple[Tensor, Tensor]: 17 | """ 18 | Sum of absolute value and square of embedding weights 19 | 20 | Return 21 | ------ 22 | e1_sum : sum of absolute value of embedding values 23 | e2_sum : sum of squared embedding values 24 | """ 25 | if not self._isfit: 26 | return 0.0, 0.0 27 | e1_sum = self.embedding.weight.abs().sum() 28 | e2_sum = (self.embedding.weight ** 2).sum() 29 | return e1_sum, e2_sum 30 | -------------------------------------------------------------------------------- /tests/test_embedding/utils.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import pandas as pd 4 | from torch import nn 5 | from torch.utils.data import Dataset 6 | 7 | 8 | class Reshape(nn.Module): 9 | def forward(self, X): 10 | return X.reshape((X.shape[0], -1)) 11 | 12 | 13 | def example_data(): 14 | data = pd.DataFrame( 15 | { 16 | "num_a": [i / 10 for i in range(10)], 17 | "num_b": range(10, 0, -1), 18 | "cat_a": [0, 1, 2, 3, 0, 1, 2, 0, 1, 0], 19 | "cat_b": [0, 1, 1, 0, 1, 0, 2, 1, 0, 1], 20 | "cat_c": [1, 1, 0, 0, 1, 1, 0, np.nan, 1, 1], 21 | } 22 | ) 23 | return data 24 | 25 | 26 | class SimpleDataset(Dataset): 27 | 28 | def __init__(self, data): 29 | self.data = data 30 | 31 | def __len__(self): 32 | return len(self.data) 33 | 34 | def __getitem__(self, idx): 35 | return self.data[idx] 36 | -------------------------------------------------------------------------------- /tests/test_ghost_norm.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from torch import nn 4 | 5 | from xynn.ghost_norm import GhostBatchNorm 6 | 7 | 8 | def test_ghostbatchnorm(): 9 | gbn = GhostBatchNorm(3, 4, 0.2) 10 | assert gbn.inner_norm.num_features == 3 11 | assert gbn.inner_norm.momentum == 0.2 12 | assert gbn.virtual_batch_size == 4 13 | 14 | x = torch.tensor( 15 | [ 16 | [-1, 0, 3], 17 | [ 0, 2, -3], 18 | [ 1, -2, 0], 19 | [ 0, 0, 0], 20 | [-2, 0, 1], 21 | [ 0, 3, -1], 22 | [ 2, -3, 0], 23 | [ 0, 0, 0], 24 | [-3, 0, 2], 25 | [ 0, 1, -2], 26 | [ 3, -1, 0], 27 | [ 0, 0, 0], 28 | ], 29 | dtype=torch.float, 30 | ) 31 | out = gbn(x) 32 | expected = torch.tensor( 33 | [ 34 | [-1.4142, 0.0000, 1.4142], 35 | [ 0.0000, 1.4142, -1.4142], 36 | [ 1.4142, -1.4142, 0.0000], 37 | [ 0.0000, 0.0000, 0.0000], 38 | [-1.4142, 0.0000, 1.4142], 39 | [ 0.0000, 1.4142, -1.4142], 40 | [ 1.4142, -1.4142, 0.0000], 41 | [ 0.0000, 0.0000, 0.0000], 42 | [-1.4142, 0.0000, 1.4142], 43 | [ 0.0000, 1.4142, -1.4142], 44 | [ 1.4142, -1.4142, 0.0000], 45 | [ 0.0000, 0.0000, 0.0000], 46 | ] 47 | ) 48 | assert torch.allclose(out, expected) 49 | -------------------------------------------------------------------------------- /setup.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from typing import List 4 | 5 | from setuptools import find_packages, setup 6 | 7 | # copied from pytorch-lightning 8 | _PATH_ROOT = os.path.dirname(__file__) 9 | _PATH_REQUIRE = os.path.join(_PATH_ROOT, 'requirements') 10 | 11 | 12 | def _load_requirements(path_dir: str, file_name: str = 'requirements.txt', comment_char: str = '#') -> List[str]: 13 | """Load requirements from a file 14 | >>> _load_requirements(_PROJECT_ROOT) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE 15 | ['numpy...', 'torch...', ...] 16 | """ 17 | with open(os.path.join(path_dir, file_name), 'r') as file: 18 | lines = [ln.strip() for ln in file.readlines()] 19 | reqs = [] 20 | for ln in lines: 21 | # filer all comments 22 | if comment_char in ln: 23 | ln = ln[:ln.index(comment_char)].strip() 24 | # skip directly installed dependencies 25 | if ln.startswith('http'): 26 | continue 27 | if ln: # if requirement is not empty 28 | reqs.append(ln) 29 | return reqs 30 | 31 | # https://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-extras 32 | # Define package extras. These are only installed if you specify them. 33 | # From remote, use like `pip install pytorch-lightning[dev, docs]` 34 | # From local copy of repo, use like `pip install ".[dev, docs]"` 35 | extras = { 36 | 'examples': _load_requirements(path_dir=_PATH_REQUIRE, file_name='examples.txt'), 37 | 'test': _load_requirements(path_dir=_PATH_REQUIRE, file_name='test.txt') 38 | } 39 | extras['dev'] = extras['examples'] + extras['test'] 40 | extras['all'] = extras['dev'] 41 | 42 | 43 | setup( 44 | name='xynn', 45 | version='0.1', 46 | description='A collection of Tabular NN models with a Scikit-learn API', 47 | url='https://github.com/jrfiedler/xynn', 48 | author='James Fiedler', 49 | author_email='jrfiedler@gmail.com', 50 | license='MIT', 51 | python_requires=">=3.7", 52 | packages=find_packages(exclude=['tests','tests/*',]), 53 | zip_safe=False, 54 | keywords=['deep learning', 'pytorch', 'AI'], 55 | setup_requires=[], 56 | install_requires=_load_requirements(_PATH_ROOT), 57 | extras_require=extras, 58 | ) -------------------------------------------------------------------------------- /tests/test_embedding/test_ragged/test_common.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import torch 4 | from torch import nn 5 | import pytest 6 | 7 | from xynn.embedding.ragged.common import _check_embedding_size, _parse_embedding_size 8 | 9 | 10 | def test__check_embedding_size_raises_error_for_bad_embedding_size(): 11 | # bad name 12 | with pytest.raises( 13 | ValueError, 14 | match=( 15 | "str embedding_size value must be one of {'sqrt', 'log', 'fastai'}; " 16 | "got 'fourth_rt'" 17 | ), 18 | ): 19 | _check_embedding_size("fourth_rt") 20 | 21 | # single int not allowed 22 | with pytest.raises( 23 | TypeError, 24 | match="embedding_size 5 not understood", 25 | ): 26 | _check_embedding_size(5) 27 | 28 | # float values not allowed 29 | with pytest.raises( 30 | TypeError, 31 | match="embedding_size \[5, 10, 15.0\] not understood", 32 | ): 33 | _check_embedding_size([5, 10, 15.0]) 34 | 35 | # wrong number of ints 36 | with pytest.raises( 37 | ValueError, 38 | match="number of embeddings must match number of fields, got 3 sizes and 4 fields", 39 | ): 40 | _check_embedding_size([5, 10, 15], [10, 20, 30, 40]) 41 | 42 | 43 | def test__check_embedding_size_with_uppercase(): 44 
| assert _check_embedding_size("SQRT") == "sqrt" 45 | assert _check_embedding_size("Log") == "log" 46 | assert _check_embedding_size("FastAI") == "fastai" 47 | 48 | 49 | def test__check_embedding_size_with_ints(): 50 | assert _check_embedding_size([5, 10, 15]) == [5, 10, 15] 51 | assert _check_embedding_size((5, 10, 15)) == (5, 10, 15) 52 | 53 | 54 | def test__parse_embedding_size_with_sqrt(): 55 | output = _parse_embedding_size("sqrt", 20, [4, 25, 64, 196, 400, 625, 1600]) 56 | assert output == [2, 5, 8, 14, 20, 20, 20] 57 | 58 | 59 | def test__parse_embedding_size_with_log(): 60 | output = _parse_embedding_size("log", 7, [4, 25, 64, 196, 400, 625, 1600]) 61 | assert output == [2, 4, 5, 6, 6, 7, 7] 62 | 63 | 64 | def test__parse_embedding_size_with_fastai(): 65 | output = _parse_embedding_size("fastai", 50, [4, 25, 64, 196, 400, 625, 1600]) 66 | assert output == [3, 10, 16, 31, 46, 50, 50] 67 | 68 | 69 | def test__parse_embedding_size_with_ints(): 70 | output = _parse_embedding_size( 71 | [5, 5, 5, 5, 5, 5, 5], 20, [4, 25, 64, 196, 400, 625, 1600] 72 | ) 73 | assert output == [5] * 7 74 | -------------------------------------------------------------------------------- /xynn/ghost_norm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for Ghost Batch Norm and variations. 3 | Ghost Batch Norm: https://arxiv.org/pdf/1705.08741.pdf 4 | 5 | """ 6 | 7 | from math import ceil 8 | from typing import Union 9 | 10 | import torch 11 | from torch import Tensor 12 | from torch import nn 13 | 14 | 15 | class GhostNorm(nn.Module): 16 | """ 17 | Ghost Normalization 18 | https://arxiv.org/pdf/1705.08741.pdf 19 | 20 | """ 21 | 22 | def __init__( 23 | self, 24 | inner_norm: nn.Module, 25 | virtual_batch_size: int, 26 | device: Union[str, torch.device] = "cpu", 27 | ): 28 | """ 29 | Parameters 30 | ---------- 31 | inner_norm : torch.nn.Module (initialiezd) 32 | examples: `nn.BatchNorm1d`, `nn.LayerNorm` 33 | virtual_batch_size : int 34 | device : string or torch.device, optional 35 | default is "cpu" 36 | 37 | """ 38 | super().__init__() 39 | self.virtual_batch_size = virtual_batch_size 40 | self.inner_norm = inner_norm 41 | self.to(device) 42 | 43 | def forward(self, x: Tensor) -> Tensor: 44 | """ 45 | Transform the input tensor 46 | 47 | Parameters 48 | ---------- 49 | x : torch.Tensor 50 | 51 | Return 52 | ------ 53 | torch.Tensor 54 | 55 | """ 56 | chunk_size = int(ceil(x.shape[0] / self.virtual_batch_size)) 57 | chunk_norm = [self.inner_norm(chunk) for chunk in x.chunk(chunk_size, dim=0)] 58 | return torch.cat(chunk_norm, dim=0) 59 | 60 | 61 | class GhostBatchNorm(GhostNorm): 62 | """ 63 | Ghost Normalization, using BatchNorm1d as inner normalization 64 | https://arxiv.org/pdf/1705.08741.pdf 65 | 66 | """ 67 | 68 | def __init__( 69 | self, 70 | num_features: int, 71 | virtual_batch_size: int = 64, 72 | momentum: float = 0.1, 73 | device: Union[str, torch.device] = "cpu", 74 | ): 75 | """ 76 | Parameters 77 | ---------- 78 | num_features : int 79 | virtual_batch_size : int, optional 80 | default is 64 81 | momentum : float, optional 82 | default is 0.1 83 | device : string or torch.device, optional 84 | default is "cpu" 85 | 86 | """ 87 | super().__init__( 88 | inner_norm=nn.BatchNorm1d(num_features, momentum=momentum), 89 | virtual_batch_size=virtual_batch_size, 90 | ) 91 | -------------------------------------------------------------------------------- /tests/test_xdeepfm/test_estimators.py: 
-------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from xynn.xdeepfm import XDeepFMRegressor, XDeepFMClassifier 4 | from xynn.embedding import LinearEmbedding, DefaultEmbedding 5 | 6 | from ..common import check_estimator_learns 7 | 8 | 9 | def test_that_xdeepfmregressor_learns(): 10 | estimator = XDeepFMRegressor( 11 | mlp_hidden_sizes=[10, 8, 8, 6], 12 | mlp_use_bn=False, 13 | ) 14 | check_estimator_learns(estimator, task="regression") 15 | assert estimator.init_parameters == { 16 | "embedding_num": "auto", 17 | "embedding_cat": "auto", 18 | "embedding_l1_reg": 0.0, 19 | "embedding_l2_reg": 0.0, 20 | "cin_layer_sizes": (128, 128), 21 | "cin_activation": nn.Identity, 22 | "cin_full_agg": False, 23 | "cin_use_bn": True, 24 | "cin_bn_momentum": 0.1, 25 | "cin_use_residual": True, 26 | "cin_use_mlp": True, 27 | "mlp_hidden_sizes": [10, 8, 8, 6], 28 | "mlp_activation": nn.LeakyReLU, 29 | "mlp_use_bn": False, 30 | "mlp_bn_momentum": 0.1, 31 | "mlp_ghost_batch": None, 32 | "mlp_dropout": 0.0, 33 | "mlp_l1_reg": 0.0, 34 | "mlp_l2_reg": 0.0, 35 | "mlp_use_skip": True, 36 | "use_leaky_gate": True, 37 | "loss_fn": "auto", 38 | "seed": None, 39 | "device": "cpu", 40 | } 41 | 42 | 43 | def test_that_xdeepfmclassifier_learns(): 44 | estimator = XDeepFMClassifier( 45 | cin_layer_sizes=[64, 64], 46 | cin_activation=nn.ReLU, 47 | cin_full_agg=True, 48 | cin_use_bn=False, 49 | mlp_hidden_sizes=[10, 8, 8, 6], 50 | mlp_use_bn=False, 51 | mlp_use_skip=False, 52 | use_leaky_gate=False, 53 | ) 54 | assert estimator 55 | check_estimator_learns(estimator, task="classification") 56 | assert estimator.init_parameters == { 57 | "embedding_num": "auto", 58 | "embedding_cat": "auto", 59 | "embedding_l1_reg": 0.0, 60 | "embedding_l2_reg": 0.0, 61 | "cin_layer_sizes": [64, 64], 62 | "cin_activation": nn.ReLU, 63 | "cin_full_agg": True, 64 | "cin_use_bn": False, 65 | "cin_bn_momentum": 0.1, 66 | "cin_use_residual": True, 67 | "cin_use_mlp": True, 68 | "mlp_hidden_sizes": [10, 8, 8, 6], 69 | "mlp_activation": nn.LeakyReLU, 70 | "mlp_use_bn": False, 71 | "mlp_bn_momentum": 0.1, 72 | "mlp_ghost_batch": None, 73 | "mlp_dropout": 0.0, 74 | "mlp_l1_reg": 0.0, 75 | "mlp_l2_reg": 0.0, 76 | "mlp_use_skip": False, 77 | "use_leaky_gate": False, 78 | "loss_fn": "auto", 79 | "seed": None, 80 | "device": "cpu", 81 | } 82 | -------------------------------------------------------------------------------- /tests/test_autoint/test_estimators.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from xynn.autoint import AutoIntRegressor, AutoIntClassifier 4 | from xynn.embedding import LinearEmbedding, DefaultEmbedding 5 | 6 | from ..common import check_estimator_learns 7 | 8 | 9 | def test_that_autointregressor_learns(): 10 | estimator = AutoIntRegressor( 11 | mlp_hidden_sizes=[10, 8, 8, 6], 12 | mlp_use_bn=False, 13 | ) 14 | check_estimator_learns(estimator, task="regression") 15 | assert estimator.init_parameters == { 16 | "embedding_num": "auto", 17 | "embedding_cat": "auto", 18 | "embedding_l1_reg": 0.0, 19 | "embedding_l2_reg": 0.0, 20 | "attn_embedding_size": 8, 21 | "attn_num_layers": 3, 22 | "attn_num_heads": 2, 23 | "attn_activation": None, 24 | "attn_use_residual": True, 25 | "attn_dropout": 0.1, 26 | "attn_normalize": True, 27 | "attn_use_mlp": True, 28 | "mlp_hidden_sizes": [10, 8, 8, 6], 29 | "mlp_activation": nn.LeakyReLU, 30 | "mlp_use_bn": False, 31 | "mlp_bn_momentum": 0.1, 32 | 
"mlp_ghost_batch": None, 33 | "mlp_dropout": 0.0, 34 | "mlp_l1_reg": 0.0, 35 | "mlp_l2_reg": 0.0, 36 | "mlp_use_skip": True, 37 | "use_leaky_gate": True, 38 | "weighted_sum": True, 39 | "loss_fn": "auto", 40 | "seed": None, 41 | "device": "cpu", 42 | } 43 | 44 | 45 | def test_that_autointclassifier_learns(): 46 | estimator = AutoIntClassifier( 47 | attn_embedding_size=12, 48 | attn_activation=nn.ReLU, 49 | attn_dropout=0.0, 50 | attn_use_mlp=False, 51 | mlp_hidden_sizes=[10, 8, 8, 6], 52 | mlp_use_bn=False, 53 | mlp_use_skip=False, 54 | use_leaky_gate=False, 55 | ) 56 | assert estimator 57 | check_estimator_learns(estimator, task="classification") 58 | assert estimator.init_parameters == { 59 | "embedding_num": "auto", 60 | "embedding_cat": "auto", 61 | "embedding_l1_reg": 0.0, 62 | "embedding_l2_reg": 0.0, 63 | "attn_embedding_size": 12, 64 | "attn_num_layers": 3, 65 | "attn_num_heads": 2, 66 | "attn_activation": nn.ReLU, 67 | "attn_use_residual": True, 68 | "attn_dropout": 0.0, 69 | "attn_normalize": True, 70 | "attn_use_mlp": False, 71 | "mlp_hidden_sizes": [10, 8, 8, 6], 72 | "mlp_activation": nn.LeakyReLU, 73 | "mlp_use_bn": False, 74 | "mlp_bn_momentum": 0.1, 75 | "mlp_ghost_batch": None, 76 | "mlp_dropout": 0.0, 77 | "mlp_l1_reg": 0.0, 78 | "mlp_l2_reg": 0.0, 79 | "mlp_use_skip": False, 80 | "use_leaky_gate": False, 81 | "weighted_sum": True, 82 | "loss_fn": "auto", 83 | "seed": None, 84 | "device": "cpu", 85 | } 86 | -------------------------------------------------------------------------------- /xynn/embedding/ragged/common.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base class for ragged embeddings and helpers for parsing embedding size 3 | 4 | FastAI embedding size: arXiv preprint arXiv:2002.04688 5 | 6 | """ 7 | 8 | from collections.abc import Iterable as IterableClass 9 | from typing import List, Optional, Tuple 10 | 11 | import numpy as np 12 | from torch import nn, Tensor 13 | 14 | from ..common import EmbeddingBase 15 | 16 | 17 | def _check_embedding_size(embedding_size, num_categories=None): 18 | """Check that given `embedding_size` makes sense for ragged embeddings""" 19 | if isinstance(embedding_size, str): 20 | embedding_size = embedding_size.lower() 21 | if embedding_size not in ("sqrt", "log", "fastai"): 22 | raise ValueError( 23 | "str embedding_size value must be one of {'sqrt', 'log', 'fastai'}; " 24 | f"got '{embedding_size}'" 25 | ) 26 | elif not isinstance(embedding_size, IterableClass) or not all( 27 | isinstance(size, int) for size in embedding_size 28 | ): 29 | raise TypeError(f"embedding_size {repr(embedding_size)} not understood") 30 | elif num_categories is not None and len(embedding_size) != len(num_categories): 31 | raise ValueError( 32 | "number of embeddings must match number of fields, got " 33 | f"{len(embedding_size)} sizes and {len(num_categories)} fields" 34 | ) 35 | return embedding_size 36 | 37 | 38 | def _parse_embedding_size(embedding_size, max_size, num_categories) -> List[int]: 39 | """ 40 | Parse given `embedding_size` into a list of individual sizes, 41 | for ragged embeddings 42 | """ 43 | _check_embedding_size(embedding_size, num_categories) 44 | # calculate the individual values if "sqrt" or "log" 45 | if isinstance(embedding_size, str): 46 | num_categories = np.array(num_categories) 47 | if embedding_size == "sqrt": 48 | base_size = np.ceil(np.sqrt(num_categories)) 49 | elif embedding_size == "log": 50 | base_size = np.ceil(np.log(num_categories)) 51 | else: # embedding_size == 
"fastai": 52 | base_size = (1.6 * num_categories ** 0.56).round() 53 | clipped_size = np.clip(1, max_size, base_size).astype("int") 54 | embedding_size = list(clipped_size) 55 | else: # iterable of int 56 | pass 57 | return embedding_size 58 | 59 | 60 | class RaggedBase(EmbeddingBase): 61 | """Base class for embeddings that allow a different vector size for each field""" 62 | 63 | def __init__(self): 64 | super().__init__() 65 | self.embedding: Optional[nn.ModuleList] = None 66 | 67 | def weight_sum(self) -> Tuple[Tensor, Tensor]: 68 | """ 69 | Sum of absolute value and square of embedding weights 70 | 71 | Return 72 | ------ 73 | e1_sum : sum of absolute value of embedding values 74 | e2_sum : sum of squared embedding values 75 | """ 76 | if not self._isfit: 77 | return 0.0, 0.0 78 | e1_sum = 0.0 79 | e2_sum = 0.0 80 | for embedding in self.embedding: 81 | e1_sum += embedding.weight.abs().sum() 82 | e2_sum += (embedding.weight ** 2).sum() 83 | return e1_sum, e2_sum 84 | -------------------------------------------------------------------------------- /tests/test_mlpnet/test_estimators.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | from tempfile import NamedTemporaryFile 4 | 5 | import torch 6 | from torch import nn 7 | import numpy as np 8 | 9 | from xynn.base_classes.estimators import _set_seed 10 | from xynn.embedding import DenseEmbedding, RaggedEmbedding 11 | from xynn.mlpnet import MLPRegressor, MLPClassifier 12 | 13 | from ..common import check_estimator_learns 14 | 15 | 16 | def test_that_basic_params_are_passed_to_mlpnet_module(): 17 | X = torch.rand((100, 10)) - 0.5 18 | y = X[:, 0] - X[:, 1] + X[:, 2] - X[:, 4] + 2 * X[:, 6] - 2 * X[:, 8] 19 | estimator = MLPRegressor( 20 | embedding_cat=None, 21 | embedding_l1_reg=0.1, 22 | mlp_l2_reg=0.2, 23 | ) 24 | estimator.fit( 25 | X_num=X, 26 | X_cat=None, 27 | y=y, 28 | optimizer=torch.optim.Adam, 29 | opt_kwargs={"lr": 1e-1}, 30 | num_epochs=1, 31 | ) 32 | 33 | model = estimator._model 34 | 35 | assert model.task == "regression" 36 | assert model.num_epochs == 1 37 | assert isinstance(model.loss_fn, nn.MSELoss) 38 | assert model.embedding_num is not None 39 | assert model.embedding_cat is None 40 | assert model.embedding_l1_reg == 0.1 41 | assert model.embedding_l2_reg == 0.0 42 | assert model.mlp_l1_reg == 0.0 43 | assert model.mlp_l2_reg == 0.2 44 | assert model.optimizer is not None 45 | assert model.optimizer_info != {} 46 | assert model.scheduler == {} 47 | assert model._device == torch.device("cpu") 48 | 49 | 50 | def test_that_mlpregressor_learns(): 51 | _set_seed(10101) 52 | X = torch.rand((100, 10)) - 0.5 53 | y = X[:, 0] - X[:, 1] + X[:, 2] - X[:, 4] + 2 * X[:, 6] - 2 * X[:, 8] 54 | estimator = MLPRegressor( 55 | mlp_hidden_sizes=[10, 8, 8, 6], 56 | mlp_use_bn=False, 57 | mlp_use_skip=False, 58 | use_leaky_gate=False, 59 | ) 60 | check_estimator_learns(estimator, task="regression", data=(X, None, y)) 61 | 62 | 63 | def test_that_mlpclassifier_learns(): 64 | _set_seed(10101) 65 | X = torch.rand((100, 10)) - 0.5 66 | y_cont = X[:, 0] - X[:, 1] + X[:, 2] - X[:, 4] + 2 * X[:, 6] - 2 * X[:, 8] 67 | y = (y_cont > 0) 68 | estimator = MLPClassifier( 69 | mlp_hidden_sizes=[10, 8, 8, 6], 70 | mlp_use_bn=False, 71 | mlp_use_skip=False, 72 | use_leaky_gate=False, 73 | ) 74 | check_estimator_learns(estimator, task="regression", data=(X, None, y)) 75 | 76 | 77 | def test_that_mlpregressor_allows_dense_and_ragged_embeddings(): 78 | _set_seed(10101) 79 | estimator = 
MLPClassifier( 80 | embedding_num=DenseEmbedding(), 81 | embedding_cat=RaggedEmbedding(), 82 | mlp_hidden_sizes=[10, 8, 8, 6], 83 | mlp_use_bn=False, 84 | mlp_use_skip=False, 85 | use_leaky_gate=False, 86 | ) 87 | check_estimator_learns(estimator, task="regression") 88 | 89 | 90 | def test_that_mlpclassifier_doesnt_require_numeric_embedding(): 91 | _set_seed(10101) 92 | estimator = MLPClassifier( 93 | embedding_num=None, 94 | mlp_hidden_sizes=[10, 8, 8, 6], 95 | mlp_use_bn=False, 96 | mlp_use_skip=False, 97 | use_leaky_gate=False, 98 | ) 99 | check_estimator_learns(estimator, task="regression") 100 | -------------------------------------------------------------------------------- /tests/test_fibinet/test_estimators.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from xynn.fibinet import FiBiNetRegressor, FiBiNetClassifier 4 | from xynn.embedding import LinearEmbedding, DefaultEmbedding 5 | 6 | from ..common import check_estimator_learns 7 | 8 | 9 | def test_that_fibinetregressor_learns(): 10 | estimator = FiBiNetRegressor( 11 | mlp_hidden_sizes=[10, 8, 8, 6], 12 | mlp_use_bn=False, 13 | mlp_use_skip=False, 14 | use_leaky_gate=False, 15 | ) 16 | check_estimator_learns(estimator, task="regression") 17 | assert estimator.init_parameters == { 18 | "embedding_num": "auto", 19 | "embedding_cat": "auto", 20 | "embedding_l1_reg": 0.0, 21 | "embedding_l2_reg": 0.0, 22 | "fibi_reduction_ratio": 3, 23 | "fibi_activation": nn.LeakyReLU, 24 | "fibi_senet_product": "sym-interaction", 25 | "fibi_embed_product": "sym-interaction", 26 | "fibi_senet_skip": True, 27 | "mlp_hidden_sizes": [10, 8, 8, 6], 28 | "mlp_activation": nn.LeakyReLU, 29 | "mlp_use_bn": False, 30 | "mlp_bn_momentum": 0.1, 31 | "mlp_ghost_batch": None, 32 | "mlp_dropout": 0.0, 33 | "mlp_l1_reg": 0.0, 34 | "mlp_l2_reg": 0.0, 35 | "mlp_use_skip": False, 36 | "use_leaky_gate": False, 37 | "loss_fn": "auto", 38 | "seed": None, 39 | "device": "cpu", 40 | } 41 | 42 | 43 | def test_that_fibinetclassifier_learns(): 44 | estimator = FiBiNetClassifier( 45 | fibi_reduction_ratio=4, 46 | fibi_activation=nn.ReLU, 47 | fibi_senet_product="field-each", 48 | fibi_embed_product="shared", 49 | mlp_hidden_sizes=[10, 8, 8, 6], 50 | mlp_use_bn=False, 51 | mlp_use_skip=False, 52 | use_leaky_gate=False, 53 | ) 54 | assert estimator 55 | check_estimator_learns(estimator, task="classification") 56 | assert estimator.init_parameters == { 57 | "embedding_num": "auto", 58 | "embedding_cat": "auto", 59 | "embedding_l1_reg": 0.0, 60 | "embedding_l2_reg": 0.0, 61 | "fibi_reduction_ratio": 4, 62 | "fibi_activation": nn.ReLU, 63 | "fibi_senet_product": "field-each", 64 | "fibi_embed_product": "shared", 65 | "fibi_senet_skip": True, 66 | "mlp_hidden_sizes": [10, 8, 8, 6], 67 | "mlp_activation": nn.LeakyReLU, 68 | "mlp_use_bn": False, 69 | "mlp_bn_momentum": 0.1, 70 | "mlp_ghost_batch": None, 71 | "mlp_dropout": 0.0, 72 | "mlp_l1_reg": 0.0, 73 | "mlp_l2_reg": 0.0, 74 | "mlp_use_skip": False, 75 | "use_leaky_gate": False, 76 | "loss_fn": "auto", 77 | "seed": None, 78 | "device": "cpu", 79 | } 80 | 81 | 82 | def test_that_fibinetclassifier_learns_with_hadamard_products(): 83 | estimator = FiBiNetClassifier( 84 | fibi_reduction_ratio=4, 85 | fibi_activation=nn.ReLU, 86 | fibi_senet_product="hadamard", 87 | fibi_embed_product="shared", 88 | mlp_hidden_sizes=[10, 8, 8, 6], 89 | mlp_use_bn=False, 90 | mlp_use_skip=False, 91 | use_leaky_gate=False, 92 | ) 93 | check_estimator_learns(estimator, 
task="classification") 94 | assert estimator.init_parameters == { 95 | "embedding_num": "auto", 96 | "embedding_cat": "auto", 97 | "embedding_l1_reg": 0.0, 98 | "embedding_l2_reg": 0.0, 99 | "fibi_reduction_ratio": 4, 100 | "fibi_activation": nn.ReLU, 101 | "fibi_senet_product": "hadamard", 102 | "fibi_embed_product": "shared", 103 | "fibi_senet_skip": True, 104 | "mlp_hidden_sizes": [10, 8, 8, 6], 105 | "mlp_activation": nn.LeakyReLU, 106 | "mlp_use_bn": False, 107 | "mlp_bn_momentum": 0.1, 108 | "mlp_ghost_batch": None, 109 | "mlp_dropout": 0.0, 110 | "mlp_l1_reg": 0.0, 111 | "mlp_l2_reg": 0.0, 112 | "mlp_use_skip": False, 113 | "use_leaky_gate": False, 114 | "loss_fn": "auto", 115 | "seed": None, 116 | "device": "cpu", 117 | } 118 | -------------------------------------------------------------------------------- /xynn/mlpnet/modules.py: -------------------------------------------------------------------------------- 1 | """ 2 | PyTorch module for the MLP model 3 | 4 | """ 5 | 6 | import textwrap 7 | from typing import Union, Tuple, Callable, Optional, Type, List 8 | 9 | import torch 10 | from torch import Tensor 11 | from torch import nn 12 | 13 | from ..base_classes.modules import BaseNN, MODULE_INIT_DOC 14 | from ..mlp import MLP 15 | from ..embedding import check_embeddings 16 | from ..embedding.common import EmbeddingBase 17 | 18 | 19 | INIT_DOC = MODULE_INIT_DOC.format( 20 | textwrap.dedent( 21 | """\ 22 | num_numeric_fields : int or "auto", optional 23 | an integer must be specified when embedding_num is None; 24 | default is \"auto\"""" 25 | ) 26 | ) 27 | 28 | 29 | class MLPNet(BaseNN): 30 | """ A model consisting of just an MLP """ 31 | 32 | def __init__( 33 | self, 34 | task: str, 35 | output_size: int, 36 | embedding_num: Optional[EmbeddingBase], 37 | embedding_cat: Optional[EmbeddingBase], 38 | embedding_l1_reg: float = 0.0, 39 | embedding_l2_reg: float = 0.0, 40 | num_numeric_fields: Union[int, str] = "auto", 41 | mlp_hidden_sizes: Union[int, Tuple[int, ...], List[int]] = (512, 256, 128, 64), 42 | mlp_activation: Type[nn.Module] = nn.LeakyReLU, 43 | mlp_use_bn: bool = True, 44 | mlp_bn_momentum: float = 0.1, 45 | mlp_ghost_batch: Optional[int] = None, 46 | mlp_dropout: float = 0.0, 47 | mlp_use_skip: bool = True, 48 | mlp_l1_reg: float = 0.0, 49 | mlp_l2_reg: float = 0.0, 50 | use_leaky_gate: bool = True, 51 | weighted_sum: bool = True, 52 | loss_fn: Union[str, Callable] = "auto", 53 | device: Union[str, torch.device] = "cpu", 54 | ): 55 | super().__init__( 56 | task, 57 | embedding_num, 58 | embedding_cat, 59 | embedding_l1_reg, 60 | embedding_l2_reg, 61 | mlp_l1_reg, 62 | mlp_l2_reg, 63 | loss_fn, 64 | device, 65 | ) 66 | 67 | embed_info = check_embeddings(embedding_num, embedding_cat) 68 | 69 | if embedding_num is not None: 70 | input_size = embed_info.output_size 71 | elif not isinstance(num_numeric_fields, int): 72 | raise TypeError( 73 | "when embedding_num is None, num_numeric_fields must be an integer" 74 | ) 75 | else: 76 | input_size = embed_info.output_size + num_numeric_fields 77 | 78 | self.mlp = MLP( 79 | task, 80 | input_size=input_size, 81 | hidden_sizes=mlp_hidden_sizes, 82 | output_size=output_size, 83 | activation=mlp_activation, 84 | dropout=mlp_dropout, 85 | dropout_first=True, 86 | use_bn=mlp_use_bn, 87 | bn_momentum=mlp_bn_momentum, 88 | ghost_batch=mlp_ghost_batch, 89 | leaky_gate=use_leaky_gate, 90 | use_skip=mlp_use_skip, 91 | weighted_sum=weighted_sum, 92 | device=device, 93 | ) 94 | 95 | self.mix = self.mlp.mix 96 | #self.to(device) 97 | 98 | 
__init__.__doc__ = INIT_DOC 99 | 100 | @staticmethod 101 | def diagram(): 102 | """ Print a text diagram of this model """ 103 | gram = """\ 104 | 105 | if mlp_use_skip=True (default) 106 | ------------------------------ 107 | X_num ─ Num. embedding? ┐ ┌─── MLP ──┐ 108 | ├─┤ w+ ── output 109 | X_cat ─ Cat. embedding ─┘ └─ Linear ─┘ 110 | 111 | if mlp_use_skip=False 112 | --------------------- 113 | X_num ─ Num. embedding? ┐ 114 | ├─── MLP ── output 115 | X_cat ─ Cat. embedding ─┘ 116 | 117 | splits are copies and joins are concatenations; 118 | 'w+' is weighted element-wise addition; 119 | the numeric embedding is optional 120 | """ 121 | print("\n" + textwrap.dedent(gram)) 122 | 123 | def mlp_weight_sum(self) -> Tuple[Tensor, Tensor]: 124 | """ 125 | Sum of absolute value and square of weights in MLP layers 126 | 127 | Return 128 | ------ 129 | w1 : sum of absolute value of MLP weights 130 | w2 : sum of squared MLP weights 131 | 132 | """ 133 | return self.mlp.weight_sum() 134 | 135 | def forward(self, X_num: Tensor, X_cat: Tensor) -> Tensor: 136 | """ 137 | Transform the input tensor 138 | 139 | Parameters 140 | ---------- 141 | X_num : torch.Tensor 142 | numeric fields 143 | X_cat : torch.Tensor 144 | categorical fields 145 | 146 | Return 147 | ------ 148 | torch.Tensor 149 | 150 | """ 151 | embedded = self.embed(X_num, X_cat, num_dim=2) 152 | return self.mlp(embedded) 153 | -------------------------------------------------------------------------------- /xynn/dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple DataLoader-like class for tabular X_num, X_cat, y 3 | 4 | """ 5 | 6 | from typing import Union, Tuple, Optional 7 | 8 | import numpy as np 9 | import torch 10 | from torch import Tensor 11 | from torch.utils.data import Dataset 12 | 13 | 14 | def _validate_x(X, y, X_name, device): 15 | if isinstance(X, (Tensor, np.ndarray)): 16 | if not X.shape[0] == y.shape[0]: 17 | raise ValueError( 18 | f"shape mismatch; got y.shape[0] == {y.shape[0]}, " 19 | f"{X_name}.shape[0] == {X.shape[0]}" 20 | ) 21 | if len(X.shape) != 2: 22 | raise ValueError( 23 | f"{X_name} should be 2-d; got shape {X.shape}" 24 | ) 25 | if isinstance(X, np.ndarray): 26 | X = torch.from_numpy(X).to(dtype=torch.float32) 27 | elif X is None: 28 | X = torch.empty((y.shape[0], 0)) 29 | else: 30 | raise TypeError(f"input {X_name} should be Tensor, NumPy array, or None") 31 | return X 32 | 33 | 34 | def _validate_y(y, task, device): 35 | if isinstance(y, (Tensor, np.ndarray)): 36 | if any(size == 0 for size in y.shape): 37 | raise ValueError(f"y has a zero-sized dimension; got shape {y.shape}") 38 | 39 | if task == "regression" and len(y.shape) == 1: 40 | y = y.reshape((-1, 1)) 41 | elif task == "classification" and len(y.shape) == 2: 42 | if y.shape[1] != 1: 43 | raise ValueError("for classification y must be 1-d or 2-d with one column") 44 | y = y.reshape((-1,)) 45 | elif len(y.shape) > 2: 46 | raise ValueError(f"y has too many dimensions; got shape {y.shape}") 47 | 48 | if isinstance(y, np.ndarray): 49 | y = torch.from_numpy(y).to(dtype=torch.float32) 50 | else: 51 | raise TypeError("y should be Tensor or NumPy array") 52 | return y 53 | 54 | 55 | class TabularDataLoader: 56 | """ 57 | A DataLoader-like class that aims to be faster for tabular data. 
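    An illustrative sketch of the intended use (the tensor names here are
    placeholders, not objects defined in this module):

        loader = TabularDataLoader("regression", X_num, X_cat, y, batch_size=64, shuffle=True)
        for num_batch, cat_batch, y_batch in loader:
            ...  # each batch is an (X_num, X_cat, y) tuple of tensors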
58 | 59 | Based on `FastTensorDataLoader` by Jesse Mu 60 | https://discuss.pytorch.org/t/dataloader-much-slower-than-manual-batching/27014/6 61 | 62 | """ 63 | def __init__( 64 | self, 65 | task: str, 66 | X_num: Optional[Union[np.ndarray, Tensor]], 67 | X_cat: Optional[Union[np.ndarray, Tensor]], 68 | y: Union[np.ndarray, Tensor], 69 | batch_size: int = 32, 70 | shuffle: bool = False, 71 | device: Union[str, torch.device] = "cpu", 72 | ): 73 | """ 74 | Parameters 75 | ---------- 76 | task : {"regression", "classification"} 77 | X_num : PyTorch Tensor, NumPy array, or None 78 | numeric input fields 79 | X_cat : PyTorch Tensor, NumPy array, or None 80 | categorical input fields (represented as numeric values) 81 | y : PyTorch Tensor, NumPy array, or None 82 | target field 83 | batch_size : int, optional 84 | default is 32 85 | shuffle : bool, optional 86 | default is False 87 | device : string or torch.device, optional 88 | default is "cpu" 89 | 90 | """ 91 | if X_num is None and X_cat is None: 92 | raise TypeError("X_num and X_cat cannot both be None") 93 | 94 | self.y = _validate_y(y, task, device) 95 | self.X_num = _validate_x(X_num, self.y, "X_num", device) 96 | self.X_cat = _validate_x(X_cat, self.y, "X_cat", device) 97 | self.dataset_len = y.shape[0] 98 | self.batch_size = batch_size 99 | self.shuffle = shuffle 100 | self.device = device 101 | 102 | # Calculate # batches 103 | n_batches, remainder = divmod(self.dataset_len, self.batch_size) 104 | if remainder > 0: 105 | n_batches += 1 106 | self.n_batches = n_batches 107 | 108 | def __iter__(self): 109 | if self.shuffle: 110 | self.indices = torch.randperm(self.dataset_len) 111 | else: 112 | self.indices = None 113 | self.i = 0 114 | return self 115 | 116 | def __next__(self): 117 | if self.i >= self.dataset_len: 118 | raise StopIteration 119 | if self.indices is not None: 120 | indices = self.indices[self.i:self.i+self.batch_size] 121 | batch = ( 122 | torch.index_select(self.X_num, 0, indices).to(device=self.device), 123 | torch.index_select(self.X_cat, 0, indices).to(device=self.device), 124 | torch.index_select(self.y, 0, indices).to(device=self.device), 125 | ) 126 | else: 127 | batch = ( 128 | self.X_num[self.i:self.i+self.batch_size].to(device=self.device), 129 | self.X_cat[self.i:self.i+self.batch_size].to(device=self.device), 130 | self.y[self.i:self.i+self.batch_size].to(device=self.device), 131 | ) 132 | self.i += self.batch_size 133 | return batch 134 | 135 | def __len__(self): 136 | return self.n_batches 137 | -------------------------------------------------------------------------------- /xynn/mlpnet/estimators.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scikit-learn style classes for the MLP model 3 | 4 | """ 5 | 6 | from typing import Union, Callable, Optional, Type, List, Tuple 7 | 8 | import torch 9 | from torch import nn 10 | 11 | from ..base_classes.estimators import BaseClassifier, BaseRegressor, ESTIMATOR_INIT_DOC 12 | from ..embedding import EmbeddingBase 13 | from .modules import MLPNet 14 | 15 | 16 | INIT_DOC = ESTIMATOR_INIT_DOC.format("") 17 | 18 | 19 | class MLPClassifier(BaseClassifier): 20 | """ 21 | Scikit-learn style classification model for the MLP model 22 | 23 | """ 24 | 25 | diagram = MLPNet.diagram 26 | 27 | def __init__( 28 | self, 29 | embedding_num: Optional[Union[str, EmbeddingBase]] = "auto", 30 | embedding_cat: Optional[Union[str, EmbeddingBase]] = "auto", 31 | embedding_l1_reg: float = 0.0, 32 | embedding_l2_reg: float = 0.0, 33 | 
mlp_hidden_sizes: Union[int, Tuple[int, ...], List[int]] = (512, 256, 128, 64), 34 | mlp_activation: Type[nn.Module] = nn.LeakyReLU, 35 | mlp_use_bn: bool = True, 36 | mlp_bn_momentum: float = 0.1, 37 | mlp_ghost_batch: Optional[int] = None, 38 | mlp_dropout: float = 0.0, 39 | mlp_l1_reg: float = 0.0, 40 | mlp_l2_reg: float = 0.0, 41 | mlp_use_skip: bool = True, 42 | use_leaky_gate: bool = True, 43 | weighted_sum: bool = True, 44 | loss_fn: Union[str, Callable] = "auto", 45 | seed: Union[int, None] = None, 46 | device: Union[str, torch.device] = "cpu", 47 | ): 48 | super().__init__( 49 | embedding_num=embedding_num, 50 | embedding_cat=embedding_cat, 51 | embedding_l1_reg=embedding_l1_reg, 52 | embedding_l2_reg=embedding_l2_reg, 53 | mlp_hidden_sizes=mlp_hidden_sizes, 54 | mlp_activation=mlp_activation, 55 | mlp_use_bn=mlp_use_bn, 56 | mlp_bn_momentum=mlp_bn_momentum, 57 | mlp_ghost_batch=mlp_ghost_batch, 58 | mlp_dropout=mlp_dropout, 59 | mlp_l1_reg=mlp_l1_reg, 60 | mlp_l2_reg=mlp_l2_reg, 61 | mlp_use_skip=mlp_use_skip, 62 | use_leaky_gate=use_leaky_gate, 63 | weighted_sum=weighted_sum, 64 | loss_fn=loss_fn, 65 | seed=seed, 66 | device=device, 67 | ) 68 | self._model_class = MLPNet 69 | self._require_numeric_embedding = False 70 | 71 | __init__.__doc__ = INIT_DOC 72 | 73 | def _create_model(self): 74 | self._model = self._model_class( 75 | task="classification", 76 | output_size=len(self.classes), 77 | embedding_num=self.embedding_num, 78 | embedding_cat=self.embedding_cat, 79 | num_numeric_fields=self._num_numeric_fields, 80 | loss_fn=self.loss_fn, 81 | device=self._device, 82 | **self.model_kwargs 83 | ) 84 | 85 | 86 | class MLPRegressor(BaseRegressor): 87 | """ 88 | Scikit-learn style regression model for the MLP model 89 | 90 | """ 91 | 92 | diagram = MLPNet.diagram 93 | 94 | def __init__( 95 | self, 96 | embedding_num: Optional[Union[str, EmbeddingBase]] = "auto", 97 | embedding_cat: Optional[Union[str, EmbeddingBase]] = "auto", 98 | embedding_l1_reg: float = 0.0, 99 | embedding_l2_reg: float = 0.0, 100 | mlp_hidden_sizes: Union[int, Tuple[int, ...], List[int]] = (512, 256, 128, 64), 101 | mlp_activation: Type[nn.Module] = nn.LeakyReLU, 102 | mlp_use_bn: bool = True, 103 | mlp_bn_momentum: float = 0.1, 104 | mlp_ghost_batch: Optional[int] = None, 105 | mlp_dropout: float = 0.0, 106 | mlp_l1_reg: float = 0.0, 107 | mlp_l2_reg: float = 0.0, 108 | mlp_use_skip: bool = True, 109 | use_leaky_gate: bool = True, 110 | weighted_sum: bool = True, 111 | loss_fn: Union[str, Callable] = "auto", 112 | seed: Optional[int] = None, 113 | device: Union[str, torch.device] = "cpu", 114 | ): 115 | super().__init__( 116 | embedding_num=embedding_num, 117 | embedding_cat=embedding_cat, 118 | embedding_l1_reg=embedding_l1_reg, 119 | embedding_l2_reg=embedding_l2_reg, 120 | mlp_hidden_sizes=mlp_hidden_sizes, 121 | mlp_activation=mlp_activation, 122 | mlp_use_bn=mlp_use_bn, 123 | mlp_bn_momentum=mlp_bn_momentum, 124 | mlp_ghost_batch=mlp_ghost_batch, 125 | mlp_dropout=mlp_dropout, 126 | mlp_l1_reg=mlp_l1_reg, 127 | mlp_l2_reg=mlp_l2_reg, 128 | mlp_use_skip=mlp_use_skip, 129 | use_leaky_gate=use_leaky_gate, 130 | weighted_sum=weighted_sum, 131 | loss_fn=loss_fn, 132 | seed=seed, 133 | device=device, 134 | ) 135 | self._model_class = MLPNet 136 | self._require_numeric_embedding = False 137 | 138 | __init__.__doc__ = INIT_DOC 139 | 140 | def _create_model(self): 141 | self._model = self._model_class( 142 | task="regression", 143 | output_size=self.num_targets, 144 | embedding_num=self.embedding_num, 145 | 
embedding_cat=self.embedding_cat, 146 | num_numeric_fields=self._num_numeric_fields, 147 | loss_fn=self.loss_fn, 148 | device=self._device, 149 | **self.model_kwargs, 150 | ) 151 | -------------------------------------------------------------------------------- /tests/test_embedding/test_common.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import torch 4 | import pytest 5 | 6 | from xynn.embedding.common import _isnan, _isnan_index, _unique, _value_counts 7 | from .utils import example_data 8 | 9 | 10 | def test__isnan(): 11 | assert _isnan(float("nan")) 12 | assert not _isnan("NaN") 13 | assert not _isnan(20.22) 14 | assert not _isnan(20122) 15 | 16 | 17 | def test__isnan_index_with_simple_example(): 18 | data = pd.Series([10, 8, 6, 4, 2], index=[0, np.nan, 1, 4, np.nan]) 19 | assert np.all(_isnan_index(data) == [False, True, False, False, True]) 20 | 21 | 22 | def test_that__unique_raises_error_on_bad_input(): 23 | msg = "input should be Pandas DataFrame, NumPy array, or PyTorch Tensor" 24 | with pytest.raises(TypeError, match=msg): 25 | _unique([10, 8, 6, 8, 4, 2, 0, 10, 2, 0, 2]) 26 | 27 | 28 | def test__unique_on_numpy_examples(): 29 | data = example_data()[["cat_a", "cat_b", "cat_c"]] 30 | data["cat_d"] = ["a", "b", np.nan, "c", "a", np.nan, np.nan, "a", "b", "c"] 31 | data["cat_e"] = [1, 1, np.nan, 0, 0, 0, np.nan, 1, 1, 0] 32 | uniques, has_nan = _unique(data.values) 33 | assert [set(values) for values in uniques] == [ 34 | set([0, 1, 2, 3]), set([0, 1, 2]), set([0, 1]), set("abc"), set([0, 1]) 35 | ] 36 | assert has_nan == [False, False, True, True, True] 37 | 38 | data = example_data()[["cat_c"]] 39 | data["cat_e"] = [1, 1, np.nan, 0, 0, 0, np.nan, 1, 1, 0] 40 | uniques, has_nan = _unique(data.values) 41 | assert [set(values) for values in uniques] == [set([0, 1]), set([0, 1])] 42 | assert has_nan == [True, True] 43 | 44 | 45 | def test__unique_on_pandas_examples(): 46 | data = example_data()[["cat_a", "cat_b", "cat_c"]] 47 | data["cat_d"] = ["a", "b", np.nan, "c", "a", np.nan, np.nan, "a", "b", "c"] 48 | data["cat_e"] = [1, 1, np.nan, 0, 0, 0, np.nan, 1, 1, 0] 49 | uniques, has_nan = _unique(data) 50 | assert [set(values) for values in uniques] == [ 51 | set([0, 1, 2, 3]), set([0, 1, 2]), set([0, 1]), set("abc"), set([0, 1]) 52 | ] 53 | assert has_nan == [False, False, True, True, True] 54 | 55 | data = example_data()[["cat_c"]] 56 | data["cat_e"] = [1, 1, np.nan, 0, 0, 0, np.nan, 1, 1, 0] 57 | uniques, has_nan = _unique(data) 58 | assert [set(values) for values in uniques] == [set([0, 1]), set([0, 1])] 59 | assert has_nan == [True, True] 60 | 61 | 62 | def test__unique_on_tensor_example(): 63 | data = example_data()[["cat_a", "cat_b", "cat_c"]] 64 | data["cat_e"] = [1, 1, np.nan, 0, 0, 0, np.nan, 1, 1, 0] 65 | uniques, has_nan = _unique(torch.from_numpy(data.values)) 66 | assert [set(values) for values in uniques] == [ 67 | set([0, 1, 2, 3]), set([0, 1, 2]), set([0, 1]), set([0, 1]) 68 | ] 69 | assert has_nan == [False, False, True, True] 70 | 71 | data = example_data()[["cat_c"]] 72 | data["cat_e"] = [1, 1, np.nan, 0, 0, 0, np.nan, 1, 1, 0] 73 | uniques, has_nan = _unique(torch.from_numpy(data.values)) 74 | assert [set(values) for values in uniques] == [set([0, 1]), set([0, 1])] 75 | assert has_nan == [True, True] 76 | 77 | 78 | def test_that__value_counts_raises_error_on_bad_input(): 79 | msg = "input should be Pandas DataFrame, NumPy array, or PyTorch Tensor" 80 | with 
pytest.raises(TypeError, match=msg): 81 | _value_counts([10, 8, 6, 8, 4, 2, 0, 10, 2, 0, 2]) 82 | 83 | 84 | def test__value_counts_on_numpy_examples(): 85 | data = example_data()[["cat_a", "cat_b", "cat_c"]] 86 | data["cat_d"] = ["a", "b", np.nan, "c", "a", np.nan, np.nan, "a", "b", "c"] 87 | data["cat_e"] = [1, 1, np.nan, 0, 0, 0, np.nan, 1, 1, 0] 88 | unique_counts, nan_counts = _value_counts(data.values) 89 | assert unique_counts == [ 90 | {0: 4, 1: 3, 2: 2, 3: 1}, 91 | {0: 4, 1: 5, 2: 1}, 92 | {0: 3, 1: 6}, 93 | {"a": 3, "b": 2, "c": 2}, 94 | {0: 4, 1: 4}, 95 | ] 96 | assert nan_counts == [0, 0, 1, 3, 2] 97 | 98 | data = example_data()[["cat_c"]] 99 | data["cat_e"] = [1, 1, np.nan, 0, 0, 0, np.nan, 1, 1, 0] 100 | unique_counts, nan_counts = _value_counts(data.values) 101 | assert unique_counts == [{0: 3, 1: 6}, {0: 4, 1: 4}] 102 | assert nan_counts == [1, 2] 103 | 104 | 105 | def test__value_counts_on_pandas_examples(): 106 | data = example_data()[["cat_a", "cat_b", "cat_c"]] 107 | data["cat_d"] = ["a", "b", np.nan, "c", "a", np.nan, np.nan, "a", "b", "c"] 108 | data["cat_e"] = [1, 1, np.nan, 0, 0, 0, np.nan, 1, 1, 0] 109 | unique_counts, nan_counts = _value_counts(data) 110 | assert unique_counts == [ 111 | {0: 4, 1: 3, 2: 2, 3: 1}, 112 | {0: 4, 1: 5, 2: 1}, 113 | {0: 3, 1: 6}, 114 | {"a": 3, "b": 2, "c": 2}, 115 | {0: 4, 1: 4}, 116 | ] 117 | assert nan_counts == [0, 0, 1, 3, 2] 118 | 119 | data = example_data()[["cat_c"]] 120 | data["cat_e"] = [1, 1, np.nan, 0, 0, 0, np.nan, 1, 1, 0] 121 | unique_counts, nan_counts = _value_counts(data.values) 122 | assert unique_counts == [{0: 3, 1: 6}, {0: 4, 1: 4}] 123 | assert nan_counts == [1, 2] 124 | 125 | 126 | def test__value_counts_on_tensor_examples(): 127 | data = example_data()[["cat_a", "cat_b", "cat_c"]] 128 | data["cat_e"] = [1, 1, np.nan, 0, 0, 0, np.nan, 1, 1, 0] 129 | unique_counts, nan_counts = _value_counts(torch.from_numpy(data.values)) 130 | assert unique_counts == [ 131 | {0: 4, 1: 3, 2: 2, 3: 1}, {0: 4, 1: 5, 2: 1}, {0: 3, 1: 6}, {0: 4, 1: 4} 132 | ] 133 | assert nan_counts == [0, 0, 1, 2] 134 | 135 | data = example_data()[["cat_c"]] 136 | data["cat_e"] = [1, 1, np.nan, 0, 0, 0, np.nan, 1, 1, 0] 137 | unique_counts, nan_counts = _value_counts(torch.from_numpy(data.values)) 138 | assert unique_counts == [{0: 3, 1: 6}, {0: 4, 1: 4}] 139 | assert nan_counts == [1, 2] 140 | -------------------------------------------------------------------------------- /xynn/embedding/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions for fitting embeddings or checking embeddings 3 | 4 | """ 5 | from typing import Tuple, Optional 6 | from collections import namedtuple 7 | 8 | from torch.utils.data import DataLoader 9 | 10 | from .common import _linear_agg, _unique_agg, _value_counts_agg, EmbeddingBase 11 | from .uniform import UniformBase, LinearEmbedding, BasicEmbedding 12 | from .ragged import RaggedEmbedding 13 | 14 | 15 | EmbeddingInfo = namedtuple("EmbeddingInfo", ["num_fields", "output_size"]) 16 | UniformEmbeddingInfo = namedtuple( 17 | "EmbeddingInfo", ["num_fields", "embedding_size", "output_size"] 18 | ) 19 | 20 | 21 | def _init_embed_info(embedding): 22 | if isinstance(embedding, (LinearEmbedding)): 23 | info_1 = 0 24 | info_2 = None 25 | agg_fn = _linear_agg 26 | else: 27 | info_1 = [] 28 | info_2 = [] 29 | if isinstance(embedding, (BasicEmbedding, RaggedEmbedding)): 30 | agg_fn = _unique_agg 31 | else: 32 | agg_fn = _value_counts_agg 33 | return info_1, info_2, agg_fn 34 
| 35 | 36 | def fit_embeddings( 37 | data: DataLoader, 38 | embedding_num: Optional[EmbeddingBase], 39 | embedding_cat: Optional[EmbeddingBase], 40 | ) -> Tuple[Optional[EmbeddingBase], Optional[EmbeddingBase]]: 41 | """ 42 | Create the internal embedding info for the given numerical and categorical 43 | embeddings. 44 | 45 | Note: the passed embeddings are modified in place 46 | 47 | Parameters 48 | ---------- 49 | data : torch.utils.data.DataLoader 50 | embedding_num : initialized embedding or None 51 | embedding_cat : initialized embedding or None 52 | 53 | Returns 54 | ------- 55 | embedding_num, embedding_cat inputs, modified in place 56 | 57 | """ 58 | if embedding_num is None and embedding_cat is None: 59 | return None, None 60 | 61 | # get initial values and aggregation functions 62 | if embedding_num is not None: 63 | num_info_1, num_info_2, num_agg_fn = _init_embed_info(embedding_num) 64 | if embedding_cat is not None: 65 | cat_info_1, cat_info_2, cat_agg_fn = _init_embed_info(embedding_cat) 66 | 67 | # iterate and update the initial aggregation values/objects 68 | for batch in data: 69 | if embedding_num is not None: 70 | num_info_1, num_info_2 = num_agg_fn(num_info_1, num_info_2, batch[0]) 71 | if embedding_cat is not None: 72 | cat_info_1, cat_info_2 = cat_agg_fn(cat_info_1, cat_info_2, batch[1]) 73 | 74 | # use aggregated values to set the embeddings 75 | if embedding_num is None: 76 | pass 77 | elif isinstance(embedding_num, LinearEmbedding): 78 | embedding_num.from_summary(num_info_1) 79 | else: 80 | embedding_num.from_summary(num_info_1, num_info_2) 81 | 82 | if embedding_cat is not None: 83 | embedding_cat.from_summary(cat_info_1, cat_info_2) 84 | 85 | return embedding_num, embedding_cat 86 | 87 | 88 | def _check_is_uniform(embedding, name): 89 | if embedding is None: 90 | return 91 | if not isinstance(embedding, UniformBase): 92 | raise TypeError( 93 | "only 'uniform' embeddings are allowed for this model; " 94 | f"{name} is not a uniform embedding" 95 | ) 96 | 97 | 98 | def check_uniform_embeddings( 99 | embedding_num: Optional[EmbeddingBase], 100 | embedding_cat: Optional[EmbeddingBase], 101 | ) -> EmbeddingInfo: 102 | """ 103 | Check that embeddings are uniform, are not both None, and have same 104 | embedding_size 105 | 106 | Parameters 107 | ---------- 108 | embedding_num : XyNN embedding or None 109 | embedding_cat : XyNN embedding or None 110 | 111 | Return 112 | ------ 113 | UniformEmbeddingInfo NamedTuple containing 114 | - num_fields 115 | - embedding_size 116 | - output_size = num_fields * embedding_size 117 | 118 | """ 119 | # check embedding sizes and get derived values 120 | if embedding_num is None and embedding_cat is None: 121 | raise ValueError("embedding_num and embedding_cat cannot both be None") 122 | 123 | _check_is_uniform(embedding_num, "embedding_num") 124 | _check_is_uniform(embedding_cat, "embedding_cat") 125 | 126 | if ( 127 | embedding_num is not None 128 | and embedding_cat is not None 129 | and not embedding_num.embedding_size == embedding_cat.embedding_size 130 | ): 131 | raise ValueError( 132 | "embedding sizes must be the same for numeric and catgorical; got " 133 | f"{embedding_num.embedding_size} and {embedding_cat.embedding_size}" 134 | ) 135 | 136 | num_fields = 0 137 | if embedding_num is not None: 138 | num_fields += embedding_num.num_fields 139 | embedding_size = embedding_num.embedding_size 140 | 141 | if embedding_cat is not None: 142 | num_fields += embedding_cat.num_fields 143 | embedding_size = embedding_cat.embedding_size 
144 | 145 | return UniformEmbeddingInfo(num_fields, embedding_size, num_fields * embedding_size) 146 | 147 | 148 | def check_embeddings( 149 | embedding_num: Optional[EmbeddingBase], 150 | embedding_cat: Optional[EmbeddingBase], 151 | ) -> EmbeddingInfo: 152 | """ 153 | Return combined embedding info 154 | 155 | Parameters 156 | ---------- 157 | embedding_num : XyNN embedding or None 158 | embedding_cat : XyNN embedding or None 159 | 160 | Return 161 | ------ 162 | EmbeddingInfo NamedTuple containing 163 | - num_fields 164 | - output_size = sum of individual output sizes 165 | 166 | """ 167 | # get number of fields and total output size 168 | if embedding_num is None and embedding_cat is None: 169 | return EmbeddingInfo(0, 0) 170 | 171 | num_fields = 0 172 | output_size = 0 173 | if embedding_num is not None: 174 | num_fields += embedding_num.num_fields 175 | output_size += embedding_num.output_size 176 | 177 | if embedding_cat is not None: 178 | num_fields += embedding_cat.num_fields 179 | output_size += embedding_cat.output_size 180 | 181 | return EmbeddingInfo(num_fields, output_size) 182 | -------------------------------------------------------------------------------- /xynn/embedding/uniform/numeric.py: -------------------------------------------------------------------------------- 1 | """ 2 | Classes for embedding numeric fields 3 | 4 | LinearEmbedding 5 | - embed each numeric *field* with a vector; for each numeric value, multiply the 6 | field vector by the value 7 | DenseEmbedding 8 | - a dense linear layer followed by an activation 9 | 10 | """ 11 | 12 | from typing import Union, List, Optional, Tuple, Type 13 | from functools import reduce 14 | import operator 15 | 16 | import torch 17 | from torch import Tensor 18 | from torch import nn 19 | 20 | from .base import UniformBase 21 | 22 | 23 | class LinearEmbedding(UniformBase): 24 | """ 25 | An embedding for numeric fields. There is one embedded vector for each field. 26 | The embedded vector for a value is that value times its field's vector. 
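
    For example, after `from_summary(3)` with `embedding_size=4`, an input of
    shape (batch, 3) is embedded to a tensor of shape (batch, 3, 4).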
27 | 28 | """ 29 | 30 | def __init__(self, embedding_size: int = 10, device: Union[str, torch.device] = "cpu"): 31 | """ 32 | Parameters 33 | ---------- 34 | embedding_size : int, optional 35 | size of each value's embedding vector; default is 10 36 | device : string or torch.device 37 | 38 | """ 39 | super().__init__() 40 | self.num_fields = 0 41 | self.output_size = 0 42 | self.embedding: Optional[nn.Embedding] = None 43 | self.embedding_size = embedding_size 44 | self._device = device 45 | self.to(device) 46 | self._isfit = False 47 | 48 | def __repr__(self): 49 | return f"LinearEmbedding({self.embedding_size}, {repr(self._device)})" 50 | 51 | def from_summary(self, num_fields: int): 52 | """ 53 | Create the embedding for the given number of fields 54 | 55 | Parameters 56 | ---------- 57 | num_fields : int 58 | 59 | Return 60 | ------ 61 | self 62 | 63 | """ 64 | self.num_fields = num_fields 65 | self.output_size = num_fields * self.embedding_size 66 | self.embedding = nn.Embedding(num_fields, self.embedding_size).to(device=self._device) 67 | nn.init.xavier_uniform_(self.embedding.weight) 68 | 69 | self._isfit = True 70 | 71 | return self 72 | 73 | def _fit_array(self, X): 74 | self.from_summary(X.shape[1]) 75 | 76 | def _fit_iterable(self, X): 77 | for batch in X: 78 | self._fit_array(batch) 79 | break 80 | 81 | def forward(self, X: Tensor) -> Tensor: 82 | """ 83 | Produce embedding for each value in input 84 | 85 | Parameters 86 | ---------- 87 | X : torch.Tensor 88 | 89 | Return 90 | ------ 91 | torch.Tensor 92 | 93 | """ 94 | if not self._isfit: 95 | raise RuntimeError("need to call `fit` or `from_summary` first") 96 | return self.embedding.weight * X.unsqueeze(dim=-1) 97 | 98 | 99 | class DenseEmbedding(UniformBase): 100 | """ 101 | An embedding for numeric fields, consisting of just a linear transformation with 102 | an activation. 
Maps an input with shape n_rows * n_fields to an output with shape 103 | n_rows * 1 * embedding_size if one value passed for embedding_size or 104 | n_rows * embeddin_size[0] * embedding_size[1] if two values are passed 105 | 106 | """ 107 | 108 | def __init__( 109 | self, 110 | embedding_size: Union[int, Tuple[int, ...], List[int]] = 10, 111 | activation: Type[nn.Module] = nn.LeakyReLU, 112 | device: Union[str, torch.device] = "cpu", 113 | ): 114 | """ 115 | Parameters 116 | ---------- 117 | embedding_size : int, tuple of ints, or list of ints; optional 118 | size of each value's embedding vector; default is 10 119 | activation : subclass of torch.nn.Module, optional 120 | default is nn.LeakyReLU 121 | device : string or torch.device 122 | 123 | """ 124 | super().__init__() 125 | 126 | if isinstance(embedding_size, int): 127 | embedding_size = (1, embedding_size) 128 | elif len(embedding_size) == 1: 129 | embedding_size = (1, embedding_size[0]) 130 | 131 | self.num_fields = 0 132 | self.output_size = 0 133 | self.embedding_w: Optional[nn.Parameter] = None 134 | self.embedding_b: Optional[nn.Parameter] = None 135 | self.dense_out_size = embedding_size 136 | self.embedding_size = embedding_size[-1] 137 | self.activation = activation().to(device=device) 138 | self._device = device 139 | self.to(device) 140 | self._isfit = False 141 | 142 | def __repr__(self): 143 | dense_size = self.dense_out_size 144 | activation = self.activation.__class__.__name__ 145 | device = repr(self._device) 146 | return f"DenseEmbedding({dense_size}, {activation}, {device})" 147 | 148 | def from_summary(self, num_fields: int): 149 | """ 150 | Create the embedding for the given number of fields 151 | 152 | Parameters 153 | ---------- 154 | num_fields : int 155 | 156 | Return 157 | ------ 158 | self 159 | 160 | """ 161 | self.num_fields = num_fields 162 | self.output_size = reduce(operator.mul, self.dense_out_size, 1) 163 | self.embedding_w = nn.Parameter( 164 | torch.zeros((num_fields, *self.dense_out_size)) 165 | ).to(device=self._device) 166 | self.embedding_b = nn.Parameter( 167 | torch.zeros(self.dense_out_size) 168 | ).to(device=self._device) 169 | nn.init.xavier_uniform_(self.embedding_w) 170 | 171 | self._isfit = True 172 | 173 | return self 174 | 175 | def _fit_array(self, X): 176 | self.from_summary(X.shape[1]) 177 | 178 | def _fit_iterable(self, X): 179 | for batch in X: 180 | self._fit_array(batch) 181 | break 182 | 183 | def forward(self, X: Tensor) -> Tensor: 184 | """ 185 | Produce embedding for each value in input 186 | 187 | Parameters 188 | ---------- 189 | X : torch.Tensor 190 | 191 | Return 192 | ------ 193 | torch.Tensor 194 | 195 | """ 196 | if not self._isfit: 197 | raise RuntimeError("need to call `fit` or `from_summary` first") 198 | embedded = self.embedding_w.T.matmul(X.T.to(dtype=torch.float)).T + self.embedding_b 199 | embedded = self.activation(embedded.reshape((X.shape[0], -1))) 200 | return embedded.reshape((X.shape[0], *self.dense_out_size)) 201 | -------------------------------------------------------------------------------- /xynn/xdeepfm/estimators.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scikit-learn style classes for the xDeepFM model 3 | 4 | """ 5 | 6 | import textwrap 7 | from typing import Type, Union, Callable, Tuple, List, Optional 8 | 9 | import torch 10 | from torch import nn 11 | 12 | from .modules import XDeepFM 13 | from ..base_classes.estimators import BaseClassifier, BaseRegressor, ESTIMATOR_INIT_DOC 14 | from 
..embedding import EmbeddingBase 15 | 16 | 17 | INIT_DOC = ESTIMATOR_INIT_DOC.format( 18 | textwrap.dedent( 19 | """\ 20 | cin_layer_sizes : int, tuple of int or list of int; optional 21 | if `cin_full_agg` is False, all sizes except the last must be even; 22 | default is (128, 128) 23 | cin_activation : subclass of torch.nn.Module, optional 24 | default is nn.Identity 25 | cin_full_agg : bool, optional 26 | if True, each intermediate output is aggregated in the final CIN output; 27 | if False, half of each intermediate output is aggregated; 28 | default is False 29 | cin_use_bn : bool, optional 30 | default is True 31 | cin_bn_momentum: float, optional 32 | default is 0.1 33 | cin_use_residual: bool, optional 34 | whether to use a skip connection from CIN to output; default is True 35 | cin_use_mlp : bool, optional 36 | default is True""" 37 | ) 38 | ) 39 | 40 | 41 | class XDeepFMClassifier(BaseClassifier): 42 | """ 43 | Scikit-learn style classification model for the xDeepFM model 44 | 45 | """ 46 | 47 | diagram = XDeepFM.diagram 48 | 49 | def __init__( 50 | self, 51 | embedding_num: Optional[Union[str, EmbeddingBase]] = "auto", 52 | embedding_cat: Optional[Union[str, EmbeddingBase]] = "auto", 53 | embedding_l1_reg: float=0.0, 54 | embedding_l2_reg: float=0.0, 55 | cin_layer_sizes: Union[int, Tuple[int, ...], List[int]] = (128, 128), 56 | cin_activation: Type[nn.Module] = nn.Identity, 57 | cin_full_agg: bool = False, 58 | cin_use_bn: bool = True, 59 | cin_bn_momentum: float = 0.1, 60 | cin_use_residual: bool = True, 61 | cin_use_mlp: bool = True, 62 | mlp_hidden_sizes: Union[int, Tuple[int, ...], List[int]] = (512, 256, 128, 64), 63 | mlp_activation: Type[nn.Module] = nn.LeakyReLU, 64 | mlp_use_bn: bool = True, 65 | mlp_bn_momentum: float = 0.1, 66 | mlp_ghost_batch: Optional[int] = None, 67 | mlp_dropout: float = 0.0, 68 | mlp_l1_reg: float = 0.0, 69 | mlp_l2_reg: float = 0.0, 70 | mlp_use_skip: bool = True, 71 | use_leaky_gate: bool = True, 72 | loss_fn: Union[str, Callable] = "auto", 73 | seed: Union[int, None] = None, 74 | device: Union[str, torch.device] = "cpu", 75 | ): 76 | super().__init__( 77 | embedding_num=embedding_num, 78 | embedding_cat=embedding_cat, 79 | embedding_l1_reg=embedding_l1_reg, 80 | embedding_l2_reg=embedding_l2_reg, 81 | cin_layer_sizes=cin_layer_sizes, 82 | cin_activation=cin_activation, 83 | cin_full_agg=cin_full_agg, 84 | cin_use_bn=cin_use_bn, 85 | cin_bn_momentum=cin_bn_momentum, 86 | cin_use_residual=cin_use_residual, 87 | cin_use_mlp=cin_use_mlp, 88 | mlp_hidden_sizes=mlp_hidden_sizes, 89 | mlp_activation=mlp_activation, 90 | mlp_use_bn=mlp_use_bn, 91 | mlp_bn_momentum=mlp_bn_momentum, 92 | mlp_ghost_batch=mlp_ghost_batch, 93 | mlp_dropout=mlp_dropout, 94 | mlp_l1_reg=mlp_l1_reg, 95 | mlp_l2_reg=mlp_l2_reg, 96 | mlp_use_skip=mlp_use_skip, 97 | use_leaky_gate=use_leaky_gate, 98 | loss_fn=loss_fn, 99 | seed=seed, 100 | device=device, 101 | ) 102 | self._model_class = XDeepFM 103 | 104 | __init__.__doc__ = INIT_DOC 105 | 106 | 107 | class XDeepFMRegressor(BaseRegressor): 108 | """ 109 | Scikit-learn style regression model for the xDeepFM model 110 | 111 | """ 112 | 113 | diagram = XDeepFM.diagram 114 | 115 | def __init__( 116 | self, 117 | embedding_num: Optional[Union[str, EmbeddingBase]] = "auto", 118 | embedding_cat: Optional[Union[str, EmbeddingBase]] = "auto", 119 | embedding_l1_reg: float=0.0, 120 | embedding_l2_reg: float=0.0, 121 | cin_layer_sizes: Union[int, Tuple[int, ...], List[int]] = (128, 128), 122 | cin_activation: Type[nn.Module] = 
nn.Identity, 123 | cin_full_agg: bool = False, 124 | cin_use_bn: bool = True, 125 | cin_bn_momentum: float = 0.1, 126 | cin_use_residual: bool = True, 127 | cin_use_mlp: bool = True, 128 | mlp_hidden_sizes: Union[int, Tuple[int, ...], List[int]] = (512, 256, 128, 64), 129 | mlp_activation: Type[nn.Module] = nn.LeakyReLU, 130 | mlp_use_bn: bool = True, 131 | mlp_bn_momentum: float = 0.1, 132 | mlp_ghost_batch: Optional[int] = None, 133 | mlp_dropout: float = 0.0, 134 | mlp_l1_reg: float = 0.0, 135 | mlp_l2_reg: float = 0.0, 136 | mlp_use_skip: bool = True, 137 | use_leaky_gate: bool = True, 138 | loss_fn: Union[str, Callable] = "auto", 139 | seed: Union[int, None] = None, 140 | device: Union[str, torch.device] = "cpu", 141 | ): 142 | super().__init__( 143 | embedding_num=embedding_num, 144 | embedding_cat=embedding_cat, 145 | embedding_l1_reg=embedding_l1_reg, 146 | embedding_l2_reg=embedding_l2_reg, 147 | cin_layer_sizes=cin_layer_sizes, 148 | cin_activation=cin_activation, 149 | cin_full_agg=cin_full_agg, 150 | cin_use_bn=cin_use_bn, 151 | cin_bn_momentum=cin_bn_momentum, 152 | cin_use_residual=cin_use_residual, 153 | cin_use_mlp=cin_use_mlp, 154 | mlp_hidden_sizes=mlp_hidden_sizes, 155 | mlp_activation=mlp_activation, 156 | mlp_use_bn=mlp_use_bn, 157 | mlp_bn_momentum=mlp_bn_momentum, 158 | mlp_ghost_batch=mlp_ghost_batch, 159 | mlp_dropout=mlp_dropout, 160 | mlp_l1_reg=mlp_l1_reg, 161 | mlp_l2_reg=mlp_l2_reg, 162 | mlp_use_skip=mlp_use_skip, 163 | use_leaky_gate=use_leaky_gate, 164 | loss_fn=loss_fn, 165 | seed=seed, 166 | device=device, 167 | ) 168 | self._model_class = XDeepFM 169 | 170 | __init__.__doc__ = INIT_DOC 171 | -------------------------------------------------------------------------------- /tests/test_mlpnet/test_modules.py: -------------------------------------------------------------------------------- 1 | 2 | import random 3 | 4 | import pytest 5 | import torch 6 | from torch import nn 7 | import numpy as np 8 | 9 | from xynn.base_classes.estimators import _set_seed 10 | from xynn.base_classes.modules import BaseNN 11 | from xynn.mlpnet.modules import MLPNet 12 | from xynn.embedding import LinearEmbedding 13 | from xynn.mlp import LeakyGate, GhostBatchNorm 14 | 15 | from ..common import simple_train_inputs, simple_model_train_loop, SimpleEmbedding 16 | 17 | 18 | def test_that_mlpnet_raises_error_without_numeric_field_info(): 19 | with pytest.raises( 20 | TypeError, 21 | match="when embedding_num is None, num_numeric_fields must be an integer" 22 | ): 23 | MLPNet( 24 | task="classification", 25 | output_size=3, 26 | embedding_num=None, 27 | embedding_cat=None, 28 | ) 29 | 30 | 31 | def test_that_mlpnet_subclasses_basenn(): 32 | assert issubclass(MLPNet, BaseNN) 33 | 34 | 35 | def test_that_mlpnet_uses_basenn_init(): 36 | embedding_num = SimpleEmbedding(20, 3) 37 | model = MLPNet( 38 | task="classification", 39 | output_size=3, 40 | embedding_num=embedding_num, 41 | embedding_cat=None, 42 | embedding_l1_reg=0.1, 43 | num_numeric_fields=20, 44 | mlp_l2_reg=0.2, 45 | ) 46 | 47 | assert model.task == "classification" 48 | assert model.num_epochs == 0 49 | assert isinstance(model.loss_fn, nn.CrossEntropyLoss) 50 | assert model.embedding_num is embedding_num 51 | assert model.embedding_cat is None 52 | assert model.embedding_l1_reg == 0.1 53 | assert model.embedding_l2_reg == 0.0 54 | assert model.mlp_l1_reg == 0.0 55 | assert model.mlp_l2_reg == 0.2 56 | assert model.optimizer is None 57 | assert model.optimizer_info == {} 58 | assert model.scheduler == {} 59 | assert 
model._device == "cpu" 60 | 61 | 62 | def test_that_activation_and_sizes_are_passed_to_mlp_module(): 63 | embedding_num = SimpleEmbedding(20, 3) 64 | model = MLPNet( 65 | task="classification", 66 | output_size=3, 67 | embedding_num=embedding_num, 68 | embedding_cat=None, 69 | num_numeric_fields=20, 70 | mlp_activation=nn.ReLU, 71 | mlp_hidden_sizes=(512, 128, 32), 72 | mlp_use_bn=False, 73 | mlp_use_skip=False, 74 | use_leaky_gate=False, 75 | ) 76 | expected_classes = [ 77 | nn.Linear, 78 | nn.ReLU, 79 | nn.Linear, 80 | nn.ReLU, 81 | nn.Linear, 82 | nn.ReLU, 83 | nn.Linear, 84 | ] 85 | assert len(model.mlp.main_layers) == len(expected_classes) 86 | for layer, expected_class in zip(model.mlp.main_layers, expected_classes): 87 | assert isinstance(layer, expected_class) 88 | assert model.mlp.skip_layers is None 89 | 90 | 91 | def test_that_more_parameters_are_passed_to_mlp_module(): 92 | embedding_num = SimpleEmbedding(20, 3) 93 | model = MLPNet( 94 | task="classification", 95 | output_size=3, 96 | embedding_num=embedding_num, 97 | embedding_cat=None, 98 | num_numeric_fields=20, 99 | mlp_hidden_sizes=(512, 64), 100 | mlp_use_bn=True, 101 | mlp_dropout=0.1, 102 | mlp_use_skip=True, 103 | use_leaky_gate=True, 104 | ) 105 | 106 | expected_classes = [ 107 | LeakyGate, 108 | nn.Dropout, 109 | nn.Linear, 110 | nn.BatchNorm1d, 111 | nn.LeakyReLU, 112 | nn.Dropout, 113 | nn.Linear, 114 | nn.BatchNorm1d, 115 | nn.LeakyReLU, 116 | nn.Dropout, 117 | nn.Linear, 118 | ] 119 | assert len(model.mlp.main_layers) == len(expected_classes) 120 | for layer, expected_class in zip(model.mlp.main_layers, expected_classes): 121 | assert isinstance(layer, expected_class) 122 | 123 | expected_classes = [LeakyGate, nn.Linear] 124 | assert len(model.mlp.skip_layers) == len(expected_classes) 125 | for layer, expected_class in zip(model.mlp.skip_layers, expected_classes): 126 | assert isinstance(layer, expected_class) 127 | 128 | 129 | def test_mlp_module_layers_with_ghost_batch(): 130 | embedding_num = SimpleEmbedding(20, 3) 131 | model = MLPNet( 132 | task="classification", 133 | output_size=3, 134 | embedding_num=embedding_num, 135 | embedding_cat=None, 136 | num_numeric_fields=20, 137 | mlp_hidden_sizes=(512, 64), 138 | mlp_use_bn=True, 139 | mlp_ghost_batch=16, 140 | mlp_use_skip=True, 141 | use_leaky_gate=False, 142 | ) 143 | 144 | expected_classes = [ 145 | nn.Linear, 146 | GhostBatchNorm, 147 | nn.LeakyReLU, 148 | nn.Linear, 149 | GhostBatchNorm, 150 | nn.LeakyReLU, 151 | nn.Linear, 152 | ] 153 | assert len(model.mlp.main_layers) == len(expected_classes) 154 | for layer, expected_class in zip(model.mlp.main_layers, expected_classes): 155 | assert isinstance(layer, expected_class) 156 | 157 | assert isinstance(model.mlp.skip_layers, nn.Linear) 158 | 159 | 160 | def test_that_diagram_exists_and_prints_something(capsys): 161 | MLPNet.diagram() 162 | captured = capsys.readouterr() 163 | assert len(captured.out.split("\n")) > 5 164 | 165 | 166 | def test_mlp_weight(): 167 | model = MLPNet( 168 | task="regression", 169 | output_size=1, 170 | embedding_num=SimpleEmbedding(20, 3), 171 | embedding_cat=None, 172 | num_numeric_fields=3, 173 | mlp_use_bn=False, 174 | mlp_use_skip=False, 175 | use_leaky_gate=False, 176 | ) 177 | mlp = model.mlp 178 | w1, w2 = model.mlp_weight_sum() 179 | exp_w1 = sum(l.weight.abs().sum().item() for l in mlp.main_layers[::2]) 180 | exp_w2 = sum((l.weight ** 2).sum().item() for l in mlp.main_layers[::2]) 181 | assert np.isclose(w1.item(), exp_w1) 182 | assert np.isclose(w2.item(), exp_w2) 183 | 
184 | 185 | def test_that_mlpnet_learns(): 186 | _set_seed(10101) 187 | 188 | X = torch.randint(0, 10, (100, 10)) 189 | y = torch.rand((100, 1)) * 6 - 3 190 | embedding_num = LinearEmbedding(embedding_size=3).fit(X) 191 | model = MLPNet( 192 | task="regression", 193 | output_size=1, 194 | embedding_num=embedding_num, 195 | embedding_cat=None, 196 | num_numeric_fields=10, 197 | mlp_hidden_sizes=[10, 8, 8, 6], 198 | mlp_use_bn=False, 199 | mlp_use_skip=False, 200 | use_leaky_gate=False, 201 | ) 202 | loss_func = nn.MSELoss() 203 | optimizer = torch.optim.Adam(model.parameters(), lr=1e-1) 204 | loss_vals = simple_model_train_loop(model, X, None, y, loss_func, optimizer, num_epochs=5) 205 | assert loss_vals[0] > loss_vals[-1] 206 | -------------------------------------------------------------------------------- /xynn/autoint/estimators.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scikit-learn style classes for the AutoInt model 3 | 4 | """ 5 | 6 | import textwrap 7 | from typing import Type, Union, Callable, Tuple, List, Optional 8 | 9 | import torch 10 | from torch import nn 11 | 12 | from .modules import AutoInt 13 | from ..base_classes.estimators import BaseClassifier, BaseRegressor, ESTIMATOR_INIT_DOC 14 | from ..embedding import EmbeddingBase 15 | 16 | 17 | INIT_DOC = ESTIMATOR_INIT_DOC.format( 18 | textwrap.dedent( 19 | """\ 20 | attn_embedding_size : int, optional 21 | default is 8 22 | attn_num_layers : int, optional 23 | default is 3 24 | attn_num_head : int, optional 25 | default is 2 26 | attn_activation : subclass of torch.nn.Module or None, optional 27 | applied to the transformation tensors; default is None 28 | attn_use_residual : bool, optional 29 | default is True 30 | attn_dropout : float, optional 31 | amount of dropout to use on the product of queries and keys; 32 | default is 0.1 33 | attn_normalize : bool, optional 34 | whether to normalize each attn layer output; default is True""" 35 | ) 36 | ) 37 | 38 | 39 | class AutoIntClassifier(BaseClassifier): 40 | """ 41 | Scikit-learn style classification model for the AutoInt model 42 | 43 | """ 44 | 45 | diagram = AutoInt.diagram 46 | 47 | def __init__( 48 | self, 49 | embedding_num: Optional[Union[str, EmbeddingBase]] = "auto", 50 | embedding_cat: Optional[Union[str, EmbeddingBase]] = "auto", 51 | embedding_l1_reg: float=0.0, 52 | embedding_l2_reg: float=0.0, 53 | attn_embedding_size: int = 8, 54 | attn_num_layers: int = 3, 55 | attn_num_heads: int = 2, 56 | attn_activation: Optional[Type[nn.Module]] = None, 57 | attn_use_residual: bool = True, 58 | attn_dropout: float = 0.1, 59 | attn_normalize: bool = True, 60 | attn_use_mlp: bool = True, 61 | mlp_hidden_sizes: Union[int, Tuple[int, ...], List[int]] = (512, 256, 128, 64), 62 | mlp_activation: Type[nn.Module] = nn.LeakyReLU, 63 | mlp_use_bn: bool = True, 64 | mlp_bn_momentum: float = 0.1, 65 | mlp_ghost_batch: Optional[int] = None, 66 | mlp_dropout: float = 0.0, 67 | mlp_l1_reg: float = 0.0, 68 | mlp_l2_reg: float = 0.0, 69 | mlp_use_skip: bool = True, 70 | use_leaky_gate: bool = True, 71 | weighted_sum: bool = True, 72 | loss_fn: Union[str, Callable] = "auto", 73 | seed: Union[int, None] = None, 74 | device: Union[str, torch.device] = "cpu", 75 | ): 76 | super().__init__( 77 | embedding_num=embedding_num, 78 | embedding_cat=embedding_cat, 79 | embedding_l1_reg=embedding_l1_reg, 80 | embedding_l2_reg=embedding_l2_reg, 81 | attn_embedding_size=attn_embedding_size, 82 | attn_num_layers=attn_num_layers, 83 | 
attn_num_heads=attn_num_heads, 84 | attn_activation=attn_activation, 85 | attn_use_residual=attn_use_residual, 86 | attn_dropout=attn_dropout, 87 | attn_normalize=attn_normalize, 88 | attn_use_mlp=attn_use_mlp, 89 | mlp_hidden_sizes=mlp_hidden_sizes, 90 | mlp_activation=mlp_activation, 91 | mlp_use_bn=mlp_use_bn, 92 | mlp_bn_momentum=mlp_bn_momentum, 93 | mlp_ghost_batch=mlp_ghost_batch, 94 | mlp_dropout=mlp_dropout, 95 | mlp_l1_reg=mlp_l1_reg, 96 | mlp_l2_reg=mlp_l2_reg, 97 | mlp_use_skip=mlp_use_skip, 98 | use_leaky_gate=use_leaky_gate, 99 | weighted_sum=weighted_sum, 100 | loss_fn=loss_fn, 101 | seed=seed, 102 | device=device, 103 | ) 104 | self._model_class = AutoInt 105 | self._require_numeric_embedding = True 106 | 107 | __init__.__doc__ = INIT_DOC 108 | 109 | 110 | class AutoIntRegressor(BaseRegressor): 111 | """ 112 | Scikit-learn style regression model for the AutoInt model 113 | 114 | """ 115 | 116 | diagram = AutoInt.diagram 117 | 118 | def __init__( 119 | self, 120 | embedding_num: Optional[Union[str, EmbeddingBase]] = "auto", 121 | embedding_cat: Optional[Union[str, EmbeddingBase]] = "auto", 122 | embedding_l1_reg: float=0.0, 123 | embedding_l2_reg: float=0.0, 124 | attn_embedding_size: int = 8, 125 | attn_num_layers: int = 3, 126 | attn_num_heads: int = 2, 127 | attn_activation: Optional[Type[nn.Module]] = None, 128 | attn_use_residual: bool = True, 129 | attn_dropout: float = 0.1, 130 | attn_normalize: bool = True, 131 | attn_use_mlp: bool = True, 132 | mlp_hidden_sizes: Union[int, Tuple[int, ...], List[int]] = (512, 256, 128, 64), 133 | mlp_activation: Type[nn.Module] = nn.LeakyReLU, 134 | mlp_use_bn: bool = True, 135 | mlp_bn_momentum: float = 0.1, 136 | mlp_ghost_batch: Optional[int] = None, 137 | mlp_dropout: float = 0.0, 138 | mlp_l1_reg: float = 0.0, 139 | mlp_l2_reg: float = 0.0, 140 | mlp_use_skip: bool = True, 141 | use_leaky_gate: bool = True, 142 | weighted_sum: bool = True, 143 | loss_fn: Union[str, Callable] = "auto", 144 | seed: Union[int, None] = None, 145 | device: Union[str, torch.device] = "cpu", 146 | ): 147 | super().__init__( 148 | embedding_num=embedding_num, 149 | embedding_cat=embedding_cat, 150 | embedding_l1_reg=embedding_l1_reg, 151 | embedding_l2_reg=embedding_l2_reg, 152 | attn_embedding_size=attn_embedding_size, 153 | attn_num_layers=attn_num_layers, 154 | attn_num_heads=attn_num_heads, 155 | attn_activation=attn_activation, 156 | attn_use_residual=attn_use_residual, 157 | attn_dropout=attn_dropout, 158 | attn_normalize=attn_normalize, 159 | attn_use_mlp=attn_use_mlp, 160 | mlp_hidden_sizes=mlp_hidden_sizes, 161 | mlp_activation=mlp_activation, 162 | mlp_use_bn=mlp_use_bn, 163 | mlp_bn_momentum=mlp_bn_momentum, 164 | mlp_ghost_batch=mlp_ghost_batch, 165 | mlp_dropout=mlp_dropout, 166 | mlp_l1_reg=mlp_l1_reg, 167 | mlp_l2_reg=mlp_l2_reg, 168 | mlp_use_skip=mlp_use_skip, 169 | use_leaky_gate=use_leaky_gate, 170 | weighted_sum=weighted_sum, 171 | loss_fn=loss_fn, 172 | seed=seed, 173 | device=device, 174 | ) 175 | self._model_class = AutoInt 176 | self._require_numeric_embedding = True 177 | 178 | __init__.__doc__ = INIT_DOC 179 | -------------------------------------------------------------------------------- /xynn/preprocessing.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data preprocessing 3 | 4 | """ 5 | 6 | from typing import Union, Any, Iterator, List 7 | 8 | import torch 9 | from torch import Tensor 10 | import numpy as np 11 | from sklearn.preprocessing import LabelEncoder 12 | 13 | 14 | 
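# Illustrative usage sketch for `IntegerEncoder`, defined below (not from the
# source). NaN receives its own label per column, and with the default
# unexpected="increment", values unseen during `fit` get one further label.
#
#     import numpy as np
#     from xynn.preprocessing import IntegerEncoder
#
#     X = np.array([["a", 1.0], ["b", np.nan], ["a", 2.0]], dtype=object)
#     codes = IntegerEncoder().fit_transform(X)  # 2-d torch.int64 Tensor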
UTA = Union[Tensor, np.ndarray] 15 | 16 | 17 | def _ismissing(column: UTA) -> UTA: 18 | # tensor 19 | if isinstance(column, Tensor): 20 | if column.dtype in (torch.float, torch.double, torch.half, torch.bfloat16): 21 | return torch.isnan(column) 22 | return torch.full(column.shape, False, dtype=torch.bool) 23 | 24 | # ndarray 25 | if np.issubdtype(column.dtype, np.floating): 26 | return np.isnan(column) 27 | elif column.dtype == np.dtype("O"): 28 | return np.array([isinstance(x, float) and np.isnan(x) for x in column]) 29 | return np.full(column.shape, False, dtype=np.bool) 30 | 31 | 32 | def _columns(X: UTA) -> Iterator[UTA]: 33 | """ 34 | Split 2d input into 1d columns 35 | 36 | Parameters 37 | ---------- 38 | X : NumPy array or PyTorch Tensor 39 | should be 2d 40 | 41 | Yields 42 | ------ 43 | NumPy arrays or PyTorch Tensors 44 | 45 | """ 46 | if isinstance(X, Tensor): 47 | columns = X.split(1, dim=1) 48 | else: 49 | columns = np.split(X, X.shape[1], axis=1) 50 | 51 | for column in columns: 52 | yield column.reshape((-1,)) 53 | 54 | 55 | def _isin(column: UTA, test_values: np.ndarray) -> UTA: 56 | if isinstance(column, Tensor): 57 | test_values = torch.from_numpy(test_values) 58 | return (column[..., None] == test_values).any(-1) 59 | return np.isin(column, test_values) 60 | 61 | 62 | class IntegerEncoder: 63 | """ 64 | Convert categorical inputs to integers. 65 | Input: 2d Tensor or NumPy array 66 | Output: 2d integer-valued Tensor 67 | 68 | """ 69 | 70 | def __init__(self, unexpected="increment"): 71 | """ 72 | Parameters 73 | ---------- 74 | unexpected : {"increment", "raise"}, optional 75 | when encountering unexpected values in `transform`, 76 | whether to use a new label for them ("increment") or 77 | whether to raise an error ("raise"); default is "increment" 78 | 79 | """ 80 | self.encoders: List[LabelEncoder] = [] 81 | self.classes_: List[np.ndarray] = [] 82 | self.nan_labels: List[int] = [] 83 | self.num_classes: List[int] = [] 84 | self.class_counts: List[List[int]] = [] 85 | self._isfit = False 86 | self.unexpected = unexpected 87 | 88 | def fit(self, X: UTA, y: Any = None) -> "IntegerEncoder": 89 | """ 90 | Fit encoder values from the input data 91 | 92 | Parameters 93 | ---------- 94 | X : NumPy array or PyTorch Tensor 95 | should be 2d 96 | y : any, optional 97 | not used; parameter provided to imitate Scikit-learn transformers; 98 | default is None 99 | 100 | """ 101 | for column in _columns(X): 102 | missing = _ismissing(column) 103 | encoder = LabelEncoder() 104 | encoder.fit(column[~missing]) 105 | self.encoders.append(encoder) 106 | self.classes_.append(encoder.classes_) 107 | 108 | self.num_classes.append(len(encoder.classes_)) 109 | self.class_counts.append( 110 | [(column == val).sum().item() for val in encoder.classes_] 111 | ) 112 | num_missing = missing.sum().item() 113 | if num_missing: 114 | self.nan_labels.append(len(encoder.classes_)) 115 | self.num_classes[-1] += 1 116 | self.class_counts[-1].append(int(missing.sum().item())) 117 | else: 118 | self.nan_labels.append(-1) 119 | 120 | self._isfit = True 121 | return self 122 | 123 | def _unexpected(self, column: UTA, col_idx: int) -> UTA: 124 | if not self._isfit: 125 | raise RuntimeError("encoder needs to be fit first") 126 | unexp = ~_isin(column, self.classes_[col_idx]) 127 | if self.nan_labels[col_idx] != -1: 128 | unexp[_ismissing(column)] = False 129 | return unexp 130 | 131 | def transform(self, X: UTA, y: Any = None) -> Tensor: 132 | """ 133 | Encode the input with integers 134 | 135 | Parameters 136 
| ---------- 137 | X : NumPy array or PyTorch Tensor 138 | should be 2d 139 | y : any, optional 140 | not used; parameter provided to imitate Scikit-learn transformers; 141 | default is None 142 | 143 | Returns 144 | ------- 145 | PyTorch Tensor, with each column transformed to integers, from zero 146 | up to (not including) the number of classes in that column 147 | 148 | """ 149 | if not self._isfit: 150 | raise RuntimeError("encoder needs to be fit first") 151 | if not X.shape[1] == len(self.encoders): 152 | raise ValueError( 153 | "input has the wrong shape, expected " 154 | f"{len(self.encoders)} columns, got {X.shape[1]}" 155 | ) 156 | 157 | encoded_cols = [] 158 | for col_idx, column in enumerate(_columns(X)): 159 | unxpctd = self._unexpected(column, col_idx) 160 | if unxpctd.sum() and self.unexpected == "raise": 161 | values = ", ".join(str(x) for x in column[unxpctd][:3]) 162 | if unxpctd.sum() > 3: 163 | values += ", ..." 164 | raise ValueError(f"unexpected values found in input: {values}") 165 | encoder = self.encoders[col_idx] 166 | missing = _ismissing(column) 167 | allgood = ~missing & ~unxpctd 168 | encoded = torch.empty(column.shape, dtype=torch.int64) 169 | encoded[allgood] = torch.from_numpy(encoder.transform(column[allgood])) 170 | encoded[missing] = self.nan_labels[col_idx] 171 | encoded[unxpctd] = self.num_classes[col_idx] 172 | encoded_cols.append(encoded.reshape((-1, 1))) 173 | return torch.cat(encoded_cols, dim=1) 174 | 175 | def fit_transform(self, X: UTA, y: Any = None) -> Tensor: 176 | """ 177 | Fit encoder values and encode the input 178 | 179 | Parameters 180 | ---------- 181 | X : NumPy array or PyTorch Tensor 182 | should be 2d 183 | y : any, optional 184 | not used; parameter provided to imitate Scikit-learn transformers; 185 | default is None 186 | 187 | Returns 188 | ------- 189 | PyTorch Tensor, with each column transformed to integers, from zero 190 | up to (not including) the number of classes in that column 191 | 192 | """ 193 | self.fit(X) 194 | return self.transform(X) 195 | -------------------------------------------------------------------------------- /tests/test_pnn/test_estimators.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from xynn.pnn import PNNRegressor, PNNClassifier 5 | from xynn.pnn import PNNPlusRegressor, PNNPlusClassifier 6 | from xynn.embedding import LinearEmbedding, DefaultEmbedding 7 | 8 | from ..common import check_estimator_learns, simple_data 9 | 10 | 11 | def test_that_basic_params_are_passed_to_pnn_module(): 12 | X_num, X_cat, y = simple_data(task="classification") 13 | estimator = PNNClassifier( 14 | embedding_l2_reg=0.2, 15 | mlp_l1_reg=0.1, 16 | ) 17 | estimator.fit( 18 | X_num=X_num, 19 | X_cat=X_cat, 20 | y=y, 21 | optimizer=torch.optim.Adam, 22 | opt_kwargs={"lr": 1e-1}, 23 | num_epochs=1, 24 | ) 25 | 26 | model = estimator._model 27 | 28 | assert model.task == "classification" 29 | assert model.num_epochs == 1 30 | assert isinstance(model.loss_fn, nn.CrossEntropyLoss) 31 | assert model.embedding_num is not None 32 | assert model.embedding_cat is not None 33 | assert model.embedding_l1_reg == 0.0 34 | assert model.embedding_l2_reg == 0.2 35 | assert model.mlp_l1_reg == 0.1 36 | assert model.mlp_l2_reg == 0.0 37 | assert model.optimizer is not None 38 | assert model.optimizer_info != {} 39 | assert model.scheduler == {} 40 | assert model._device == torch.device("cpu") 41 | 42 | 43 | def 
test_that_basic_params_are_passed_to_pnnplus_module(): 44 | X_num, X_cat, y = simple_data(task="classification") 45 | estimator = PNNPlusClassifier( 46 | embedding_l2_reg=0.2, 47 | mlp_l1_reg=0.1, 48 | ) 49 | estimator.fit( 50 | X_num=X_num, 51 | X_cat=X_cat, 52 | y=y, 53 | optimizer=torch.optim.Adam, 54 | opt_kwargs={"lr": 1e-1}, 55 | num_epochs=1, 56 | ) 57 | 58 | model = estimator._model 59 | 60 | assert model.task == "classification" 61 | assert model.num_epochs == 1 62 | assert isinstance(model.loss_fn, nn.CrossEntropyLoss) 63 | assert model.embedding_num is not None 64 | assert model.embedding_cat is not None 65 | assert model.embedding_l1_reg == 0.0 66 | assert model.embedding_l2_reg == 0.2 67 | assert model.mlp_l1_reg == 0.1 68 | assert model.mlp_l2_reg == 0.0 69 | assert model.optimizer is not None 70 | assert model.optimizer_info != {} 71 | assert model.scheduler == {} 72 | assert model._device == torch.device("cpu") 73 | 74 | 75 | def test_that_pnnregressor_learns(): 76 | estimator = PNNRegressor( 77 | mlp_hidden_sizes=[10, 8, 8, 6], 78 | mlp_use_bn=False, 79 | mlp_use_skip=False, 80 | use_leaky_gate=False, 81 | ) 82 | check_estimator_learns(estimator, task="regression") 83 | assert estimator.init_parameters == { 84 | "embedding_num": "auto", 85 | "embedding_cat": "auto", 86 | "embedding_l1_reg": 0.0, 87 | "embedding_l2_reg": 0.0, 88 | "pnn_product_type": "outer", 89 | "pnn_product_size": 10, 90 | "mlp_hidden_sizes": [10, 8, 8, 6], 91 | "mlp_activation": nn.LeakyReLU, 92 | "mlp_use_bn": False, 93 | "mlp_bn_momentum": 0.1, 94 | "mlp_ghost_batch": None, 95 | "mlp_dropout": 0.0, 96 | "mlp_l1_reg": 0.0, 97 | "mlp_l2_reg": 0.0, 98 | "mlp_use_skip": False, 99 | "use_leaky_gate": False, 100 | "loss_fn": "auto", 101 | "seed": None, 102 | "device": "cpu", 103 | } 104 | 105 | 106 | def test_that_pnnclassifier_learns(): 107 | estimator = PNNClassifier( 108 | pnn_product_type="inner", 109 | mlp_hidden_sizes=[10, 8, 8, 6], 110 | mlp_use_bn=False, 111 | mlp_ghost_batch=4, 112 | mlp_use_skip=False, 113 | use_leaky_gate=False, 114 | ) 115 | check_estimator_learns(estimator, task="classification") 116 | assert estimator.init_parameters == { 117 | "embedding_num": "auto", 118 | "embedding_cat": "auto", 119 | "embedding_l1_reg": 0.0, 120 | "embedding_l2_reg": 0.0, 121 | "pnn_product_type": "inner", 122 | "pnn_product_size": 10, 123 | "mlp_hidden_sizes": [10, 8, 8, 6], 124 | "mlp_activation": nn.LeakyReLU, 125 | "mlp_use_bn": False, 126 | "mlp_bn_momentum": 0.1, 127 | "mlp_ghost_batch": 4, 128 | "mlp_dropout": 0.0, 129 | "mlp_l1_reg": 0.0, 130 | "mlp_l2_reg": 0.0, 131 | "mlp_use_skip": False, 132 | "use_leaky_gate": False, 133 | "loss_fn": "auto", 134 | "seed": None, 135 | "device": "cpu", 136 | } 137 | 138 | 139 | 140 | def test_that_pnnplusregressor_learns(): 141 | estimator = PNNPlusRegressor( 142 | pnn_product_type="both", 143 | pnn_product_size=8, 144 | mlp_hidden_sizes=[10, 8, 8, 6], 145 | mlp_use_bn=False, 146 | mlp_ghost_batch=4, 147 | mlp_use_skip=False, 148 | use_leaky_gate=False, 149 | ) 150 | check_estimator_learns(estimator, task="regression") 151 | assert estimator.init_parameters == { 152 | "embedding_num": "auto", 153 | "embedding_cat": "auto", 154 | "embedding_l1_reg": 0.0, 155 | "embedding_l2_reg": 0.0, 156 | "pnn_product_type": "both", 157 | "pnn_product_size": 8, 158 | "mlp_hidden_sizes": [10, 8, 8, 6], 159 | "mlp_activation": nn.LeakyReLU, 160 | "mlp_use_bn": False, 161 | "mlp_bn_momentum": 0.1, 162 | "mlp_ghost_batch": 4, 163 | "mlp_dropout": 0.0, 164 | "mlp_l1_reg": 0.0, 165 
| "mlp_l2_reg": 0.0, 166 | "mlp_use_skip": False, 167 | "use_leaky_gate": False, 168 | "weighted_sum": True, 169 | "loss_fn": "auto", 170 | "seed": None, 171 | "device": "cpu", 172 | } 173 | 174 | 175 | def test_that_pnnplusclassifier_learns(): 176 | embed_num = LinearEmbedding(10) 177 | embed_cat = DefaultEmbedding(10) 178 | estimator = PNNPlusClassifier( 179 | embedding_num=embed_num, 180 | embedding_cat=embed_cat, 181 | mlp_hidden_sizes=[10, 8, 8, 6], 182 | mlp_use_bn=False, 183 | mlp_use_skip=False, 184 | use_leaky_gate=False, 185 | ) 186 | check_estimator_learns(estimator, task="classification") 187 | assert estimator.init_parameters == { 188 | "embedding_num": embed_num, 189 | "embedding_cat": embed_cat, 190 | "embedding_l1_reg": 0.0, 191 | "embedding_l2_reg": 0.0, 192 | "pnn_product_type": "outer", 193 | "pnn_product_size": 10, 194 | "mlp_hidden_sizes": [10, 8, 8, 6], 195 | "mlp_activation": nn.LeakyReLU, 196 | "mlp_use_bn": False, 197 | "mlp_bn_momentum": 0.1, 198 | "mlp_ghost_batch": None, 199 | "mlp_dropout": 0.0, 200 | "mlp_l1_reg": 0.0, 201 | "mlp_l2_reg": 0.0, 202 | "mlp_use_skip": False, 203 | "use_leaky_gate": False, 204 | "weighted_sum": True, 205 | "loss_fn": "auto", 206 | "seed": None, 207 | "device": "cpu", 208 | } 209 | assert repr(estimator._model.embedding_num) == "LinearEmbedding(10, 'cpu')" 210 | assert repr(estimator._model.embedding_cat) == "DefaultEmbedding(10, 20, 'cpu')" 211 | -------------------------------------------------------------------------------- /tests/common.py: -------------------------------------------------------------------------------- 1 | import json 2 | from tempfile import NamedTemporaryFile 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from torch import nn 8 | from torch.utils.data import Dataset, DataLoader 9 | 10 | from xynn.base_classes.estimators import _set_seed 11 | from xynn.base_classes.modules import BaseNN 12 | 13 | 14 | class SimpleEmbedding(nn.Module): 15 | 16 | def __init__(self, num_embeddings, embedding_dim): 17 | super().__init__() 18 | self.embedding = nn.Embedding(num_embeddings, embedding_dim) 19 | self.num_fields = 1 20 | self.embedding_size = embedding_dim 21 | self.output_size = embedding_dim 22 | 23 | def weight_sum(self): 24 | w = self.embedding.weight 25 | return w.abs().sum().item(), (w ** 2).sum().item() 26 | 27 | def forward(self, x): 28 | return self.embedding(x) 29 | 30 | 31 | def simple_train_loop(module, X, y, loss_func, optimizer, num_epochs): 32 | module.train() 33 | losses = [] 34 | for e_ in range(num_epochs): 35 | optimizer.zero_grad() 36 | y_pred = module(X) 37 | loss = loss_func(y_pred, y) 38 | loss.backward() 39 | optimizer.step() 40 | losses.append(loss.item()) 41 | return losses 42 | 43 | 44 | def simple_model_train_loop(model, X_num, X_cat, y, loss_func, optimizer, num_epochs): 45 | model.train() 46 | losses = [] 47 | for e_ in range(num_epochs): 48 | optimizer.zero_grad() 49 | y_pred = model(X_num, X_cat) 50 | print(y_pred.shape, y.shape) 51 | loss = loss_func(y_pred, y) 52 | loss.backward() 53 | optimizer.step() 54 | losses.append(loss.item()) 55 | return losses 56 | 57 | 58 | class Reshape(nn.Module): 59 | 60 | def forward(self, X): 61 | return X.reshape((X.shape[0], -1)) 62 | 63 | 64 | def example_data(): 65 | data = pd.DataFrame( 66 | { 67 | "num_a": [i / 10 for i in range(10)], 68 | "num_b": range(10, 0, -1), 69 | "cat_a": list("abcdabcaba"), 70 | "cat_b": list("abbabacbab"), 71 | "cat_c": [1, 1, 0, 0, 1, 1, 0, np.nan, 1, 1], 72 | "cat_a_num": [0, 1, 2, 3, 0, 1, 2, 
0, 1, 0], 73 | "cat_b_num": [0, 1, 1, 0, 1, 0, 2, 1, 0, 1], 74 | } 75 | ) 76 | return data 77 | 78 | 79 | class SimpleMLP(BaseNN): 80 | 81 | def __init__( 82 | self, 83 | task="regression", 84 | embedding_num=None, 85 | embedding_cat=None, 86 | embedding_l1_reg=0.0, 87 | embedding_l2_reg=0.0, 88 | input_size=11, 89 | hidden_sizes=(7,), 90 | output_size=3, 91 | mlp_l1_reg=0.0, 92 | mlp_l2_reg=0.0, 93 | loss_fn="auto", 94 | mix_value=None, 95 | device="cpu", 96 | ): 97 | super().__init__( 98 | task, 99 | embedding_num, 100 | embedding_cat, 101 | embedding_l1_reg, 102 | embedding_l2_reg, 103 | mlp_l1_reg, 104 | mlp_l2_reg, 105 | loss_fn, 106 | ) 107 | layers = [] 108 | for size in hidden_sizes: 109 | layers.append(nn.Linear(input_size, size)) 110 | input_size = size 111 | layers.append(nn.ReLU()) 112 | layers.append(nn.Linear(input_size, output_size)) 113 | self.layers = nn.Sequential(*layers) 114 | if mix_value is not None: 115 | self.mix = torch.tensor([mix_value]) 116 | else: 117 | self.mix = None 118 | self._device = "cpu" 119 | self.to(device) 120 | 121 | def mlp_weight_sum(self): 122 | w1_sum = 0.0 123 | w2_sum = 0.0 124 | for layer in self.layers: 125 | if not isinstance(layer, nn.Linear): 126 | continue 127 | w1_sum += layer.weight.abs().sum().item() 128 | w2_sum += (layer.weight ** 2).sum().item() 129 | return w1_sum, w2_sum 130 | 131 | def forward(self, X_num, X_cat): 132 | x = torch.cat([X_num, X_cat], axis=1) 133 | return self.layers(x) 134 | 135 | 136 | class SimpleDataset(Dataset): 137 | 138 | def __init__(self, X_num, X_cat, y): 139 | self.X_num = X_num 140 | self.X_cat = X_cat 141 | self.y = y 142 | 143 | def __len__(self): 144 | return len(self.y) 145 | 146 | def __getitem__(self, idx): 147 | return self.X_num[idx], self.X_cat[idx], self.y[idx] 148 | 149 | 150 | def simple_data(task="regression"): 151 | X_num = torch.randint(-2, 2, (300, 10), dtype=torch.float32) 152 | X_cat = torch.randint(0, 2, (300, 1), dtype=torch.float32) 153 | z = torch.rand(size=(300, 3), dtype=torch.float32) - 0.5 154 | y = torch.tensor( 155 | [ 156 | [ 157 | 0.1 * num[0] - 0.2 * num[1] + 0.1 * num[2] * num[3] + cat[0], 158 | - 0.3 * num[4] * num[6] * num[7] + 0.1 * num[8] - num[9], 159 | - 0.2 * num[1] - 0.3 * num[4] * num[6] * num[7] + 0.1 * cat[0], 160 | ] 161 | for num, cat in zip(X_num, X_cat) 162 | ], 163 | dtype=torch.float32 164 | ) + z 165 | if task == "classification": 166 | y_sum = y.sum(dim=1) 167 | y_cuts = torch.quantile(y_sum, q=torch.tensor([1/3, 2/3])) 168 | y = ( 169 | (y_sum > y_cuts[0]).to(dtype=torch.int) 170 | + (y_sum > y_cuts[1]).to(dtype=torch.int) 171 | ) 172 | return X_num, X_cat, y 173 | 174 | 175 | def simple_train_inputs( 176 | loss_fn="auto", 177 | mix_value=None, 178 | optimizer=torch.optim.Adam, 179 | opt_kwargs={"lr": 1e-2}, 180 | scheduler=None, 181 | sch_kwargs=None, 182 | sch_options=None, 183 | configure=True, 184 | ): 185 | X_num, X_cat, y = simple_data() 186 | X_num_train, X_num_valid = X_num[:220], X_num[220:] 187 | X_cat_train, X_cat_valid = X_cat[:220], X_cat[220:] 188 | y_train, y_valid = y[:220], y[220:] 189 | 190 | model = SimpleMLP(task="regression", loss_fn=loss_fn, mix_value=mix_value) 191 | model.set_optimizer( 192 | optimizer=optimizer, 193 | opt_kwargs=opt_kwargs, 194 | scheduler=scheduler, 195 | sch_kwargs=sch_kwargs, 196 | sch_options=sch_options, 197 | ) 198 | if configure: 199 | model.configure_optimizers() 200 | 201 | train_ds = SimpleDataset(X_num_train, X_cat_train, y_train) 202 | valid_ds = SimpleDataset(X_num_valid, X_cat_valid, y_valid) 203 
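    # standard PyTorch DataLoaders over the splits; only training data is shuffled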
| train_dl = DataLoader(train_ds, batch_size=10, shuffle=True) 204 | valid_dl = DataLoader(valid_ds, batch_size=10) 205 | 206 | return model, train_dl, valid_dl 207 | 208 | 209 | def check_estimator_learns(estimator, task, data=None, seed=10101): 210 | _set_seed(seed) 211 | 212 | if data is None: 213 | X_num, X_cat, y = simple_data(task=task) 214 | else: 215 | X_num, X_cat, y = data 216 | 217 | logfile = NamedTemporaryFile() 218 | 219 | estimator.fit( 220 | X_num=X_num, 221 | X_cat=X_cat, 222 | y=y, 223 | optimizer=torch.optim.Adam, 224 | opt_kwargs={"lr": 1e-1}, 225 | log_path=logfile.name, 226 | ) 227 | 228 | with open(logfile.name, "r") as infile: 229 | train_info = json.load(infile) 230 | 231 | loss_vals = [epoch["train_loss"] for epoch in train_info["train_info"]] 232 | assert any(loss_vals[i] < loss_vals[0] for i in range(1, len(loss_vals))) 233 | -------------------------------------------------------------------------------- /tests/test_dataset.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import torch 4 | import pytest 5 | 6 | from xynn.dataset import _validate_x, _validate_y, TabularDataLoader 7 | 8 | 9 | def test_that__validate_x_raises_error_with_wrong_type(): 10 | X = list(range(100)) 11 | y = torch.tensor(list(range(100))) 12 | with pytest.raises(TypeError, match="input X should be Tensor, NumPy array, or None"): 13 | _validate_x(X, y, "X", "cpu") 14 | 15 | 16 | def test_that__validate_x_raises_error_with_wrong_size(): 17 | X = np.array(list(range(99))) 18 | y = torch.tensor(list(range(100))) 19 | with pytest.raises( 20 | ValueError, 21 | match=r"shape mismatch; got y.shape\[0\] == 100, X.shape\[0\] == 99", 22 | ): 23 | _validate_x(X, y, "X", "cpu") 24 | 25 | 26 | def test_that__validate_x_raises_error_with_wrong_shape(): 27 | X = np.array(list(range(100))) 28 | y = torch.tensor(list(range(100))) 29 | with pytest.raises(ValueError, match=r"X should be 2-d; got shape \(100,\)"): 30 | _validate_x(X, y, "X", "cpu") 31 | 32 | X = np.array(list(range(100))).reshape((100, 1, 1)) 33 | y = torch.tensor(list(range(100))) 34 | with pytest.raises(ValueError, match=r"X should be 2-d; got shape \(100, 1, 1\)"): 35 | _validate_x(X, y, "X", "cpu") 36 | 37 | 38 | def test_that__validate_x_returns_empty_tensor_when_given_none(): 39 | y = torch.tensor(list(range(100))) 40 | X = _validate_x(None, y, "X example", "cpu") 41 | assert X.shape == (100, 0) 42 | 43 | 44 | def test__validate_x_with_numpy_input(): 45 | X = np.array(list(range(100))).reshape((100, 1)) 46 | y = torch.tensor(list(range(100))) 47 | X_out = _validate_x(X, y, "X", "cpu") 48 | assert all(X[i, 0].item() == X_out[i, 0].item() for i in range(100)) 49 | 50 | 51 | def test__validate_x_with_tensor_input(): 52 | X = torch.tensor([[i, i + 1] for i in range(100)]) 53 | y = torch.tensor(list(range(100))) 54 | X_out = _validate_x(X, y, "X", "cpu") 55 | assert X_out is X 56 | 57 | 58 | def test_that__validate_y_raises_error_with_wrong_type(): 59 | with pytest.raises(TypeError, match="y should be Tensor or NumPy array"): 60 | _validate_y(None, task="regression", device="cpu") 61 | 62 | 63 | def test_that__validate_y_raises_error_with_wrong_size(): 64 | y = torch.tensor([]) 65 | with pytest.raises( 66 | ValueError, 67 | match=r"y has a zero-sized dimension; got shape torch.Size\(\[0\]\)" 68 | ): 69 | _validate_y(y, task="classification", device="cpu") 70 | 71 | 72 | def test_that__validate_y_raises_error_with_wrong_shape(): 73 | y = 
torch.tensor(list(range(100))).reshape((25, 4)) 74 | with pytest.raises( 75 | ValueError, 76 | match="for classification y must be 1-d or 2-d with one column" 77 | ): 78 | _validate_y(y, task="classification", device="cpu") 79 | 80 | y = torch.tensor(list(range(100))).reshape((100, 1, 1)) 81 | with pytest.raises( 82 | ValueError, 83 | match=r"y has too many dimensions; got shape torch.Size\(\[100, 1, 1\]\)", 84 | ): 85 | _validate_y(y, task="regression", device="cpu") 86 | 87 | 88 | def test__validate_y_with_numpy_input(): 89 | y = np.array(list(range(100)), dtype="int").reshape((100, 1)) 90 | y_out = _validate_y(y, task="regression", device="cpu") 91 | assert all(y[i, 0].item() == y_out[i, 0].item() for i in range(100)) 92 | assert y_out.shape == (100, 1) 93 | y_out = _validate_y(y, task="classification", device="cpu") 94 | assert y_out.shape == (100,) 95 | 96 | 97 | def test__validate_y_with_tensor_input(): 98 | y = torch.tensor(list(range(100))) 99 | y_out = _validate_y(y, task="regression", device="cpu") 100 | assert all(y[i].item() == y_out[i, 0].item() for i in range(100)) 101 | assert y_out.shape == (100, 1) 102 | y_out = _validate_y(y, task="classification", device="cpu") 103 | assert y_out is y 104 | 105 | y = torch.tensor(list(range(100))).reshape((100, 1)) 106 | y_out = _validate_y(y, task="regression", device="cpu") 107 | assert y_out is y 108 | 109 | 110 | def test_that_TabularDataLoader_raises_error_when_both_Xs_are_None(): 111 | y = np.array([0, 1] * 10) 112 | with pytest.raises(TypeError, match="X_num and X_cat cannot both be None"): 113 | TabularDataLoader(task="regression", X_num=None, X_cat=None, y=y) 114 | 115 | 116 | def test_TabularDataLoader_with_numpy_input(): 117 | X_num = np.array([[i - 1, i, i + 1] for i in range(20, 0, -1)]) 118 | X_cat = np.array([[i + j for j in range(5)] for i in range(20)]) 119 | y = np.array([0, 1] * 10) 120 | 121 | loader = TabularDataLoader(task="regression", X_num=X_num, X_cat=X_cat, y=y) 122 | assert len(loader) == 1 123 | 124 | batch = next(iter(loader)) 125 | assert isinstance(batch, tuple) 126 | assert len(batch) == 3 127 | # batch size is greater than 20 128 | assert torch.all(batch[0] == torch.from_numpy(X_num)).item() 129 | assert torch.all(batch[1] == torch.from_numpy(X_cat)).item() 130 | assert torch.all(batch[2] == torch.from_numpy(y).reshape((20, 1))).item() 131 | 132 | 133 | def test_TabularDataLoader_with_shuffled_numpy_input(): 134 | X_num = np.array([[i - 1, i, i + 1] for i in range(20, 0, -1)]) 135 | X_cat = np.array([[i + j for j in range(5)] for i in range(20)]) 136 | y = np.arange(20) 137 | 138 | loader = TabularDataLoader( 139 | task="regression", X_num=X_num, X_cat=X_cat, y=y, shuffle=True 140 | ) 141 | assert len(loader) == 1 142 | 143 | batch = next(iter(loader)) 144 | assert isinstance(batch, tuple) 145 | assert len(batch) == 3 146 | # batch size is greater than 20 147 | order = [int(x.item()) for x in batch[2].reshape((20,))] 148 | assert set(order) == set(y) 149 | assert order != list(y) 150 | assert torch.all(batch[0] == torch.from_numpy(X_num[order])).item() 151 | assert torch.all(batch[1] == torch.from_numpy(X_cat[order])).item() 152 | assert torch.all(batch[2] == torch.from_numpy(y[order]).reshape((20, 1))).item() 153 | 154 | 155 | def test_TabularDataLoader_with_numpy_input_and_smaller_batches(): 156 | X_num = np.array([[i - 1, i, i + 1] for i in range(20, 0, -1)]) 157 | X_cat = np.array([[i + j for j in range(5)] for i in range(20)]) 158 | y = np.arange(20) 159 | 160 | loader = TabularDataLoader( 161 | 
task="regression", X_num=X_num, X_cat=X_cat, y=y, shuffle=True, batch_size=10 162 | ) 163 | assert len(loader) == 2 164 | 165 | batch_0, batch_1 = list(loader) 166 | assert all(len(t) == 10 for batch in (batch_0, batch_1) for t in batch) 167 | batch_cat = tuple(torch.cat([t_0, t_1], dim=0) for t_0, t_1 in zip(batch_0, batch_1)) 168 | order = [int(x.item()) for x in batch_cat[2].reshape((20,))] 169 | assert set(order) == set(y) 170 | assert order != list(y) 171 | assert torch.all(batch_cat[0] == torch.from_numpy(X_num[order])).item() 172 | assert torch.all(batch_cat[1] == torch.from_numpy(X_cat[order])).item() 173 | assert torch.all(batch_cat[2] == torch.from_numpy(y[order]).reshape((20, 1))).item() 174 | 175 | 176 | def test_TabularDataLoader_with_tensor_input(): 177 | X_num = torch.tensor([[i - 1, i, i + 1] for i in range(20, 0, -1)]) 178 | y = torch.tensor([0, 1] * 10) 179 | 180 | loader = TabularDataLoader(task="classification", X_num=X_num, X_cat=None, y=y) 181 | assert len(loader) == 1 182 | 183 | batch = next(iter(loader)) 184 | assert isinstance(batch, tuple) 185 | assert len(batch) == 3 186 | # batch size is greater than 20 187 | assert torch.all(batch[0] == X_num).item() 188 | assert batch[1].shape == (20, 0) 189 | assert torch.all(batch[2] == y).item() 190 | -------------------------------------------------------------------------------- /xynn/embedding/uniform/fast_categorical.py: -------------------------------------------------------------------------------- 1 | """ 2 | Classes for embedding categorical fields 3 | 4 | FastBasicEmbedding 5 | - requires that each field's values are integers 0, 1, ... 6 | - embed each value with single vector 7 | FastDefaultEmbedding 8 | - requires that each field's values are integers 0, 1, ... 9 | - like FastBasicEmbedding, but include a "default" vector for each field 10 | - returned vector is a weighted combination between the value's own vector 11 | and the field's "default" vector 12 | - the weighting is based on the count of value in the training set; a higher 13 | count puts more weight the value's own vector 14 | - values not seen in the training data are embedded with the default vector 15 | 16 | """ 17 | 18 | from typing import Union, List, Optional 19 | 20 | import numpy as np 21 | import torch 22 | from torch import nn, Tensor 23 | 24 | from ..common import FastBasicBase, FastDefaultBase 25 | from .base import UniformBase 26 | 27 | 28 | class FastBasicEmbedding(UniformBase, FastBasicBase): 29 | """ 30 | A basic embedding that creates an embedded vector for each field value. 
31 | 32 | """ 33 | 34 | def __init__(self, embedding_size: int = 10, device: Union[str, torch.device] = "cpu"): 35 | """ 36 | Parameters 37 | ---------- 38 | embedding_size : int, optional 39 | size of each value's embedding vector; default is 10 40 | device : string or torch.device 41 | 42 | """ 43 | super().__init__() 44 | self.num_fields = 0 45 | self.output_size = 0 46 | self.offsets: Optional[Tensor] = None 47 | self.embedding: Optional[nn.Embedding] = None 48 | self.embedding_size = embedding_size 49 | self._device = device 50 | self.to(device) 51 | self._isfit = False 52 | 53 | def __repr__(self): 54 | embed_size = self.embedding_size 55 | device = repr(self._device) 56 | return f"FastBasicEmbedding({embed_size}, {device})" 57 | 58 | def from_summary(self, num_classes: List[int]) -> "FastBasicEmbedding": 59 | """ 60 | Create the embedding from category values for each field 61 | 62 | Parameters 63 | ---------- 64 | num_classes : list of int 65 | number of category values for each field 66 | 67 | Return 68 | ------ 69 | self 70 | 71 | """ 72 | self.num_fields = len(num_classes) 73 | self.output_size = self.num_fields * self.embedding_size 74 | self.offsets = torch.tensor([[0] + list(np.cumsum(num_classes[:-1]))], device=self._device) 75 | self.embedding = nn.Embedding( 76 | sum(num_classes), self.embedding_size 77 | ).to(device=self._device) 78 | nn.init.xavier_uniform_(self.embedding.weight) 79 | 80 | self._isfit = True 81 | 82 | return self 83 | 84 | def forward(self, X: Tensor) -> Tensor: 85 | """ 86 | Produce embedding for each value in input 87 | 88 | Parameters 89 | ---------- 90 | X : torch.Tensor 91 | 92 | Return 93 | ------ 94 | torch.Tensor 95 | 96 | """ 97 | if not self._isfit: 98 | raise RuntimeError("need to call `fit` or `from_summary` first") 99 | 100 | return self.embedding(X + self.offsets) 101 | 102 | 103 | class FastDefaultEmbedding(UniformBase, FastDefaultBase): 104 | """ 105 | An embedding with a default value for each field. The default is returned for 106 | any field value not seen when the embedding was initialized (using `fit` or 107 | `from_summary`). For any value seen at initialization, a weighted average of 108 | that value's embedding and the default embedding is returned. 
The weights for 109 | the average are determined by the parameter `alpha`: 110 | 111 | weight = count / (count + alpha) 112 | final = embedding * weight + default * (1 - weight) 113 | 114 | """ 115 | 116 | def __init__( 117 | self, 118 | embedding_size: int = 10, 119 | alpha: int = 20, 120 | device: Union[str, torch.device] = "cpu", 121 | ): 122 | """ 123 | Parameters 124 | ---------- 125 | embedding_size : int, optional 126 | size of each value's embedding vector; default is 10 127 | alpha : int, optional 128 | controls the weighting of each embedding vector with the default; 129 | when `alpha`-many values are seen at initialization; the final 130 | vector is evenly weighted; the influence of the default is decreased 131 | with either higher counts or lower `alpha`; default is 20 132 | device : string or torch.device 133 | 134 | """ 135 | super().__init__() 136 | #self.offsets 137 | self.embedding_size = embedding_size 138 | self.alpha = alpha 139 | self.num_fields = 0 140 | self.output_size = 0 141 | self.max_values: Optional[Tensor] = None 142 | self.offsets: Optional[Tensor] = None 143 | self.counts: Optional[Tensor] = None 144 | self.num_cat_vals = 0 145 | self.embedding: Optional[nn.Embedding] = None 146 | self._device = device 147 | self.to(device) 148 | self._isfit = False 149 | 150 | def __repr__(self): 151 | embed_size = self.embedding_size 152 | alpha = self.alpha 153 | device = repr(self._device) 154 | return f"FastDefaultEmbedding({embed_size}, {alpha}, {device})" 155 | 156 | def from_summary(self, class_counts: List[List[int]]) -> "FastDefaultEmbedding": 157 | """ 158 | Create the embedding from known value counts for each field 159 | 160 | Parameters 161 | ---------- 162 | class_counts : list of list of int 163 | each sub-list has count of category occurrences, 164 | one sub-list for each field 165 | 166 | Return 167 | ------ 168 | self 169 | 170 | """ 171 | num_fields = len(class_counts) 172 | num_uniques = [len(counts) for counts in class_counts] 173 | max_values = [x - 1 for x in num_uniques] 174 | offsets = [0] + list(np.cumsum(num_uniques[:-1])) 175 | num_embed = sum(num_uniques) + num_fields 176 | counts_flat = [count for field in class_counts for count in field] 177 | 178 | self.num_fields = num_fields 179 | self.output_size = self.num_fields * self.embedding_size 180 | self.max_values = torch.tensor(max_values, device=self._device).reshape((1, -1)) 181 | self.offsets = torch.tensor(offsets, device=self._device) 182 | self.counts = torch.tensor(counts_flat).to(self._device) 183 | self.num_cat_vals = sum(num_uniques) 184 | self.embedding = nn.Embedding(num_embed, self.embedding_size).to(self._device) 185 | nn.init.xavier_uniform_(self.embedding.weight) 186 | 187 | self._isfit = True 188 | 189 | return self 190 | 191 | def forward(self, X: Tensor) -> Tensor: 192 | """ 193 | Produce embedding for each value in input 194 | 195 | Parameters 196 | ---------- 197 | X : torch.Tensor 198 | 199 | Return 200 | ------ 201 | torch.Tensor 202 | 203 | """ 204 | if not self._isfit: 205 | raise RuntimeError("need to call `fit` or `from_summary` first") 206 | 207 | offsets = self.offsets.expand(X.shape[0], self.offsets.shape[0]) 208 | X_offset = X + offsets 209 | 210 | unxpcted = (X > self.max_values) 211 | X_offset[unxpcted] = offsets[unxpcted] # block any unexpected categories 212 | 213 | counts = self.counts.expand(X_offset.shape[0], self.counts.shape[0]) 214 | counts = torch.gather(counts, dim=1, index=X_offset) 215 | weights = (counts / (counts + self.alpha)).unsqueeze(-1) 216 | 
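        # weight = count / (count + alpha): frequent categories rely mostly on
        # their own vector, rare ones on the per-field default; unexpected
        # categories get weight 0 below, i.e. the default vector alone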
weights[unxpcted] = 0 # block any unexpected categories 217 | primary = self.embedding(X_offset) 218 | default = self.embedding.weight[self.num_cat_vals:, :].unsqueeze(0) 219 | return weights * primary + (1 - weights) * default 220 | -------------------------------------------------------------------------------- /tests/test_base_classes/test_modules.py: -------------------------------------------------------------------------------- 1 | 2 | import pytest 3 | import torch 4 | from torch import nn 5 | from pytorch_lightning import Trainer 6 | 7 | from xynn.embedding import RaggedEmbedding, LinearEmbedding 8 | from ..common import SimpleMLP, simple_train_inputs, SimpleEmbedding 9 | 10 | 11 | def test_that_basenn_raises_error_for_bad_task_value(): 12 | with pytest.raises( 13 | ValueError, 14 | match=( 15 | "task classy-regression not recognized; " 16 | "should be 'regression' or 'classification'" 17 | ) 18 | ): 19 | SimpleMLP(task="classy-regression") 20 | 21 | 22 | def test_that_basenn_raises_error_when_configuring_optimizer_without_setting(): 23 | mlp = SimpleMLP() 24 | with pytest.raises( 25 | RuntimeError, 26 | match=( 27 | "The optimizer and learning rate info needs to first be set " 28 | "with the `set_optimizer` method" 29 | ), 30 | ): 31 | mlp.configure_optimizers() 32 | 33 | 34 | def test_num_parameters_against_known_value(): 35 | mlp = SimpleMLP() 36 | assert mlp.num_parameters() == 11 * 7 + 7 + 7 * 3 + 3 37 | mlp.embedding_num = SimpleEmbedding(20, 3) 38 | assert mlp.num_parameters() == 20 * 3 + 11 * 7 + 7 + 7 * 3 + 3 39 | 40 | 41 | def test_embedding_sum_against_known_value(): 42 | mlp = SimpleMLP() 43 | mlp.embedding_num = SimpleEmbedding(20, 3) 44 | assert mlp.embedding_sum() == mlp.embedding_num.weight_sum() 45 | 46 | mlp.embedding_num.embedding.weight = nn.Parameter( 47 | torch.tensor([[-1, 0, 1]] * 20, dtype=torch.float32) 48 | ) 49 | mlp.embedding_cat = SimpleEmbedding(10, 4) 50 | mlp.embedding_cat.embedding.weight = nn.Parameter( 51 | torch.tensor([[-1, 0, 1, 2]] * 10, dtype=torch.float32) 52 | ) 53 | assert mlp.embedding_sum() == (80, 100) 54 | 55 | 56 | def test_that_embed_raises_error_when_both_Xs_none(): 57 | mlp = SimpleMLP() 58 | with pytest.raises(ValueError, match="X_num and X_cat cannot both be None"): 59 | mlp.embed(None, None) 60 | 61 | 62 | def test_that_embed_raises_error_for_bad_num_dim(): 63 | X_num = torch.tensor([[0, 5, 15], [1, 6, 16]]) 64 | X_cat = torch.tensor([[0, 5], [1, 6]]) 65 | mlp = SimpleMLP( 66 | embedding_num=SimpleEmbedding(20, 3), 67 | embedding_cat=SimpleEmbedding(10, 3), 68 | ) 69 | 70 | with pytest.raises(ValueError, match="num_dim should be 2 or 3, got 4"): 71 | mlp.embed(X_num, X_cat, num_dim=4) 72 | 73 | with pytest.raises(ValueError, match="num_dim should be 2 or 3, got any"): 74 | mlp.embed(X_num, X_cat, num_dim="any") 75 | 76 | 77 | def test_that_embed_raises_error_for_num_dim_3_with_ragged_embedding(): 78 | X_num = torch.tensor([[0, 5, 15], [1, 6, 16]]) 79 | X_cat = torch.tensor([[0, 5], [1, 6]]) 80 | mlp = SimpleMLP( 81 | embedding_num=SimpleEmbedding(20, 3), 82 | embedding_cat=RaggedEmbedding(), 83 | ) 84 | with pytest.raises(ValueError, match="cannot use num_dim=3 with ragged embeddings"): 85 | mlp.embed(X_num, X_cat, num_dim=3) 86 | 87 | 88 | def test_embed(): 89 | X_num = torch.tensor([[0, 5, 15], [1, 6, 16]]) 90 | X_cat = torch.tensor([[0, 5], [1, 6]]) 91 | mlp = SimpleMLP( 92 | embedding_num=SimpleEmbedding(20, 3), 93 | embedding_cat=SimpleEmbedding(10, 3), 94 | ) 95 | X_num_emb, X_cat_emb = mlp.embed(X_num, X_cat, 
concat=False) 96 | assert X_num_emb.shape == (2, 3, 3) 97 | assert X_cat_emb.shape == (2, 2, 3) 98 | 99 | 100 | def test_embed_when_numeric_data_has_zero_columns(): 101 | mlp = SimpleMLP( 102 | embedding_num=SimpleEmbedding(20, 3), 103 | embedding_cat=SimpleEmbedding(10, 4), 104 | ) 105 | X_cat = torch.tensor([[0, 5], [1, 6]]) 106 | X_num = torch.empty(size=(2, 0)) 107 | 108 | X_num_emb, X_cat_emb = mlp.embed(X_num, X_cat, concat=False) 109 | assert X_num_emb.shape == (2, 0, 4) 110 | assert X_cat_emb.shape == (2, 2, 4) 111 | 112 | embedded = mlp.embed(X_num, X_cat) 113 | assert embedded.shape == (2, 2, 4), str(embedded.shape) 114 | 115 | embedded = mlp.embed(X_num, X_cat, num_dim=2) 116 | assert embedded.shape == (2, 8), str(embedded.shape) 117 | 118 | # show that None works the same as the empty X_num above 119 | assert torch.all(mlp.embed(X_num, X_cat) == mlp.embed(None, X_cat)).item() 120 | 121 | 122 | def test_embed_when_categorical_data_has_zero_columns(): 123 | mlp = SimpleMLP( 124 | embedding_num=SimpleEmbedding(20, 3), 125 | embedding_cat=SimpleEmbedding(10, 4), 126 | ) 127 | X_num = torch.tensor([[0, 5, 15], [1, 6, 16]]) 128 | X_cat = torch.empty(size=(2, 0)) 129 | 130 | X_num_emb, X_cat_emb = mlp.embed(X_num, X_cat, concat=False) 131 | assert X_num_emb.shape == (2, 3, 3) 132 | assert X_cat_emb.shape == (2, 0, 3) 133 | 134 | embedded = mlp.embed(X_num, X_cat) 135 | assert embedded.shape == (2, 3, 3), str(embedded.shape) 136 | 137 | embedded = mlp.embed(X_num, X_cat, num_dim=2) 138 | assert embedded.shape == (2, 9), str(embedded.shape) 139 | 140 | # show that None works the same as the empty X_cat above 141 | assert torch.all(mlp.embed(X_num, X_cat) == mlp.embed(X_num, None)).item() 142 | 143 | 144 | def test_embed_results_without_numeric_embeddings(): 145 | X_num = torch.tensor([[0, 5, 15], [1, 6, 16]]) 146 | X_cat = torch.tensor([[0, 5], [1, 6]]) 147 | mlp = SimpleMLP( 148 | embedding_num=None, 149 | embedding_cat=SimpleEmbedding(10, 4), 150 | ) 151 | 152 | X_num_emb, X_cat_emb = mlp.embed(X_num, X_cat, concat=False) 153 | assert X_num_emb.shape == (2, 3, 1) 154 | assert X_cat_emb.shape == (2, 2, 4) 155 | 156 | X_num_emb, X_cat_emb = mlp.embed(X_num, X_cat, num_dim=2, concat=False) 157 | assert X_num_emb.shape == (2, 3) 158 | assert X_cat_emb.shape == (2, 8) 159 | 160 | embedded = mlp.embed(X_num, X_cat, num_dim=2) 161 | assert embedded.shape == (2, 11) 162 | 163 | # cannot concat with different numbers in dim 2 164 | with pytest.raises(RuntimeError): 165 | mlp.embed(X_num, X_cat) 166 | 167 | 168 | def test_embed_results_without_categorical_embeddings(): 169 | X_num = torch.tensor([[0, 5, 15], [1, 6, 16]]) 170 | X_cat = torch.tensor([[0, 5], [1, 6]]) 171 | mlp = SimpleMLP( 172 | embedding_num=SimpleEmbedding(20, 3), 173 | embedding_cat=None, 174 | ) 175 | 176 | X_num_emb, X_cat_emb = mlp.embed(X_num, X_cat, concat=False) 177 | assert X_num_emb.shape == (2, 3, 3) 178 | assert X_cat_emb.shape == (2, 0, 3) 179 | 180 | embedded = mlp.embed(X_num, X_cat) 181 | assert embedded.shape == (2, 3, 3) 182 | 183 | embedded = mlp.embed(X_num, X_cat, num_dim=2) 184 | assert embedded.shape == (2, 9) 185 | 186 | 187 | def test_embed_results_without_any_embeddings(): 188 | X_num = torch.tensor([[0, 5, 15], [1, 6, 16]]) 189 | X_cat = torch.tensor([[0, 5], [1, 6]]) 190 | mlp = SimpleMLP( 191 | embedding_num=None, 192 | embedding_cat=None, 193 | ) 194 | 195 | X_num_emb, X_cat_emb = mlp.embed(X_num, X_cat, concat=False) 196 | assert torch.all(X_num_emb == X_num.reshape((2, 3, 1))).item() 197 | assert 
X_cat_emb.shape == (2, 0, 1) 198 | 199 | embedded = mlp.embed(X_num, X_cat) 200 | assert torch.all(embedded == X_num.reshape((2, 3, 1))).item() 201 | 202 | embedded = mlp.embed(X_num, X_cat, num_dim=2) 203 | assert torch.all(embedded == X_num).item() 204 | 205 | 206 | def test_embed_results_with_ragged_embedding(): 207 | X_num = torch.tensor([[0, 5, 15], [1, 6, 16]]) 208 | X_cat = torch.tensor([[0, 5], [1, 6]]) 209 | 210 | mlp = SimpleMLP( 211 | embedding_num=None, 212 | embedding_cat=RaggedEmbedding(embedding_size=(3, 4)).fit(X_cat), 213 | ) 214 | X_num_emb, X_cat_emb = mlp.embed(X_num, X_cat, num_dim=2, concat=False) 215 | assert X_num_emb.shape == (2, 3) 216 | assert X_cat_emb.shape == (2, 7) 217 | 218 | embedded = mlp.embed(X_num, X_cat, num_dim=2) 219 | assert embedded.shape == (2, 10) 220 | 221 | mlp = SimpleMLP( 222 | embedding_num=LinearEmbedding(embedding_size=3).fit(X_num), 223 | embedding_cat=RaggedEmbedding(embedding_size=(3, 4)).fit(X_cat), 224 | ) 225 | X_num_emb, X_cat_emb = mlp.embed(X_num, X_cat, num_dim=2, concat=False) 226 | assert X_num_emb.shape == (2, 9) 227 | assert X_cat_emb.shape == (2, 7) 228 | 229 | embedded = mlp.embed(X_num, X_cat, num_dim=2) 230 | assert embedded.shape == (2, 16) 231 | 232 | 233 | def test_that_pytorch_lightning_runs_without_error(): 234 | model, train_dl, valid_dl = simple_train_inputs(configure=False) 235 | test_dl = valid_dl # just to check that the code runs 236 | trainer = Trainer(max_epochs=5) 237 | trainer.fit(model, train_dl, valid_dl) 238 | trainer.test(model, test_dl) 239 | -------------------------------------------------------------------------------- /xynn/fibinet/estimators.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scikit-learn style classes for the FiBiNet model 3 | 4 | """ 5 | 6 | import textwrap 7 | from typing import Type, Union, Callable, List, Tuple, Optional 8 | 9 | import torch 10 | from torch import nn 11 | 12 | from .modules import FiBiNet 13 | from ..base_classes.estimators import BaseClassifier, BaseRegressor, ESTIMATOR_INIT_DOC 14 | from ..embedding import EmbeddingBase 15 | 16 | 17 | INIT_DOC = ESTIMATOR_INIT_DOC.format( 18 | textwrap.dedent( 19 | """\ 20 | fibi_reduction_ratio : int, optional 21 | used in the SENET layer; default is 3 22 | fibi_activation : subclass of torch.nn.Module, optional 23 | activation used in the SENET layer; default is nn.LeakyReLU 24 | fibi_senet_product : str, optional 25 | options: 26 | - "field-all" 27 | - "field-each" 28 | - "field-interaction" 29 | - "sym-all" 30 | - "sym-each" 31 | - "sym-interaction" 32 | - "hadamard" 33 | "field" : 34 | the original asymmetric bilinear products, with products like 35 | `linear(field_1) * field_2` 36 | where `*` is elementwise multiplication 37 | "sym" : 38 | symmetric versions of the "field" products 39 | `(linear(field_1) * field_2 + field_1 * linear(field_2)) / 2` 40 | "all" : a single product matrix is shared across all pairs of fields 41 | "each" : each field has an associated product matrix 42 | "interaction" : each pair of fields has an associated product matrix 43 | "hadamard" : elementwise multiplication of each pair of fields 44 | default is \"sym-interaction\" 45 | fibi_embed_product : str, optional 46 | options: 47 | - "shared" 48 | - "field-all" 49 | - "field-each" 50 | - "field-interaction" 51 | - "sym-all" 52 | - "sym-each" 53 | - "sym-interaction" 54 | - "hadamard" 55 | "shared" : 56 | use the same product layer (not just the same option) as the SENET 57 | product (previous 
parameter) 58 | for descriptions of other options, see notes under `fibi_senet_product`; 59 | default is \"sym-interaction\" 60 | fibi_senet_skip: bool, optional 61 | whether SENET output should also be used in both the MLP and Bilinear 62 | layer (True), or just the Bilinear layer (False); see FiBiNet.diagram(); 63 | default is True""" 64 | ) 65 | ) 66 | 67 | 68 | class FiBiNetClassifier(BaseClassifier): 69 | """ 70 | Scikit-learn style classification model for the FiBiNet model 71 | 72 | """ 73 | 74 | diagram = FiBiNet.diagram 75 | 76 | def __init__( 77 | self, 78 | embedding_num: Optional[Union[str, EmbeddingBase]] = "auto", 79 | embedding_cat: Optional[Union[str, EmbeddingBase]] = "auto", 80 | embedding_l1_reg: float = 0.0, 81 | embedding_l2_reg: float = 0.0, 82 | fibi_reduction_ratio: int = 3, 83 | fibi_activation: Type[nn.Module] = nn.LeakyReLU, 84 | fibi_senet_product: str = "sym-interaction", 85 | fibi_embed_product: str = "sym-interaction", 86 | fibi_senet_skip: bool = True, 87 | mlp_hidden_sizes: Union[int, Tuple[int, ...], List[int]] = (512, 256, 128, 64), 88 | mlp_activation: Type[nn.Module] = nn.LeakyReLU, 89 | mlp_use_bn: bool = True, 90 | mlp_bn_momentum: float = 0.1, 91 | mlp_ghost_batch: Optional[int] = None, 92 | mlp_dropout: float = 0.0, 93 | mlp_l1_reg: float = 0.0, 94 | mlp_l2_reg: float = 0.0, 95 | mlp_use_skip: bool = True, 96 | use_leaky_gate: bool = True, 97 | loss_fn: Union[str, Callable] = "auto", 98 | seed: Union[int, None] = None, 99 | device: Union[str, torch.device] = "cpu", 100 | ): 101 | super().__init__( 102 | embedding_num=embedding_num, 103 | embedding_cat=embedding_cat, 104 | embedding_l1_reg=embedding_l1_reg, 105 | embedding_l2_reg=embedding_l2_reg, 106 | fibi_reduction_ratio=fibi_reduction_ratio, 107 | fibi_activation=fibi_activation, 108 | fibi_senet_product=fibi_senet_product, 109 | fibi_embed_product=fibi_embed_product, 110 | fibi_senet_skip=fibi_senet_skip, 111 | mlp_hidden_sizes=mlp_hidden_sizes, 112 | mlp_activation=mlp_activation, 113 | mlp_use_bn=mlp_use_bn, 114 | mlp_bn_momentum=mlp_bn_momentum, 115 | mlp_ghost_batch=mlp_ghost_batch, 116 | mlp_dropout=mlp_dropout, 117 | mlp_l1_reg=mlp_l1_reg, 118 | mlp_l2_reg=mlp_l2_reg, 119 | mlp_use_skip=mlp_use_skip, 120 | use_leaky_gate=use_leaky_gate, 121 | loss_fn=loss_fn, 122 | seed=seed, 123 | device=device, 124 | ) 125 | self._model_class = FiBiNet 126 | 127 | __init__.__doc__ = INIT_DOC 128 | 129 | def _create_model(self): 130 | model_kwargs = { 131 | k: v for k, v in self.model_kwargs.items() if k != "embed_numeric_fields" 132 | } 133 | self._model = self._model_class( 134 | task="classification", 135 | output_size=len(self.classes), 136 | embedding_num=self.embedding_num, 137 | embedding_cat=self.embedding_cat, 138 | num_numeric_fields=self._num_numeric_fields, 139 | loss_fn=self.loss_fn, 140 | device=self._device, 141 | **model_kwargs, 142 | ) 143 | 144 | 145 | class FiBiNetRegressor(BaseRegressor): 146 | """ 147 | Scikit-learn style regression model for the FiBiNet model 148 | 149 | """ 150 | 151 | diagram = FiBiNet.diagram 152 | 153 | def __init__( 154 | self, 155 | embedding_num: Optional[Union[str, EmbeddingBase]] = "auto", 156 | embedding_cat: Optional[Union[str, EmbeddingBase]] = "auto", 157 | embedding_l1_reg: float = 0.0, 158 | embedding_l2_reg: float = 0.0, 159 | fibi_reduction_ratio: int = 3, 160 | fibi_activation: Type[nn.Module] = nn.LeakyReLU, 161 | fibi_senet_product: str = "sym-interaction", 162 | fibi_embed_product: str = "sym-interaction", 163 | fibi_senet_skip: bool = True, 164 
| mlp_hidden_sizes: Union[int, Tuple[int, ...], List[int]] = (512, 256, 128, 64), 165 | mlp_activation: Type[nn.Module] = nn.LeakyReLU, 166 | mlp_use_bn: bool = True, 167 | mlp_bn_momentum: float = 0.1, 168 | mlp_ghost_batch: Optional[int] = None, 169 | mlp_dropout: float = 0.0, 170 | mlp_l1_reg: float = 0.0, 171 | mlp_l2_reg: float = 0.0, 172 | mlp_use_skip: bool = True, 173 | use_leaky_gate: bool = True, 174 | loss_fn: Union[str, Callable] = "auto", 175 | seed: Union[int, None] = None, 176 | device: Union[str, torch.device] = "cpu", 177 | ): 178 | super().__init__( 179 | embedding_num=embedding_num, 180 | embedding_cat=embedding_cat, 181 | embedding_l1_reg=embedding_l1_reg, 182 | embedding_l2_reg=embedding_l2_reg, 183 | fibi_reduction_ratio=fibi_reduction_ratio, 184 | fibi_activation=fibi_activation, 185 | fibi_senet_product=fibi_senet_product, 186 | fibi_embed_product=fibi_embed_product, 187 | fibi_senet_skip=fibi_senet_skip, 188 | mlp_hidden_sizes=mlp_hidden_sizes, 189 | mlp_activation=mlp_activation, 190 | mlp_use_bn=mlp_use_bn, 191 | mlp_bn_momentum=mlp_bn_momentum, 192 | mlp_ghost_batch=mlp_ghost_batch, 193 | mlp_dropout=mlp_dropout, 194 | mlp_l1_reg=mlp_l1_reg, 195 | mlp_l2_reg=mlp_l2_reg, 196 | mlp_use_skip=mlp_use_skip, 197 | use_leaky_gate=use_leaky_gate, 198 | loss_fn=loss_fn, 199 | seed=seed, 200 | device=device, 201 | ) 202 | self._model_class = FiBiNet 203 | 204 | __init__.__doc__ = INIT_DOC 205 | 206 | def _create_model(self): 207 | model_kwargs = { 208 | k: v for k, v in self.model_kwargs.items() if k != "embed_numeric_fields" 209 | } 210 | self._model = self._model_class( 211 | task="regression", 212 | output_size=self.num_targets, 213 | embedding_num=self.embedding_num, 214 | embedding_cat=self.embedding_cat, 215 | num_numeric_fields=self._num_numeric_fields, 216 | loss_fn=self.loss_fn, 217 | device=self._device, 218 | **model_kwargs, 219 | ) 220 | -------------------------------------------------------------------------------- /tests/test_embedding/test_uniform/test_numeric.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import torch 3 | from torch import nn 4 | from torch.utils.data import DataLoader 5 | import pytest 6 | 7 | from xynn.embedding import LinearEmbedding, DenseEmbedding 8 | from ...common import simple_train_loop 9 | from ..utils import example_data, Reshape, SimpleDataset 10 | 11 | 12 | def test_that_linearembedding_must_be_fit(): 13 | embedding = LinearEmbedding(embedding_size=2) 14 | data_test = pd.DataFrame( 15 | { 16 | "num_a": [1, 0, 0.5, 0, -1], 17 | "num_b": [1, 0.5, 0, 0, -1], 18 | } 19 | ) 20 | msg = "need to call `fit` or `from_summary` first" 21 | with pytest.raises(RuntimeError, match=msg): 22 | embedding(data_test.values) 23 | 24 | 25 | def test_linearembedding_repr(): 26 | embedding = LinearEmbedding(embedding_size=2) 27 | assert repr(embedding) == "LinearEmbedding(2, 'cpu')" 28 | embedding = LinearEmbedding() 29 | assert repr(embedding) == "LinearEmbedding(10, 'cpu')" 30 | 31 | 32 | def test_linearembedding_with_pandas_example(): 33 | data_num = example_data()[["num_a", "num_b"]] 34 | embedding = LinearEmbedding(embedding_size=3).fit(data_num) 35 | data_test = pd.DataFrame( 36 | { 37 | "num_a": [1, 0, 0.5, 0, -1], 38 | "num_b": [1, 0.5, 0, 0, -1], 39 | } 40 | ) 41 | weight = embedding.embedding.weight 42 | output = embedding(torch.from_numpy(data_test.values)).to(dtype=weight.dtype) 43 | assert weight.shape == (2, 3) 44 | assert output.shape == (5, 2, 3) 45 | # test returned 
vectors vs weight matrix 46 | assert torch.all(output[0] == weight).item() 47 | assert torch.all(output[1, 0] == 0).item() 48 | assert torch.allclose(output[1, 1], 0.5 * weight[1]) 49 | assert torch.allclose(output[2, 0], 0.5 * weight[0]) 50 | assert torch.all(output[2, 1] == 0).item() 51 | assert torch.all(output[3] == 0).item() 52 | assert torch.all(output[4] == -weight).item() 53 | 54 | 55 | def test_linearembedding_with_tensor_example(): 56 | data_num = torch.from_numpy(example_data()[["num_a", "num_b"]].values) 57 | embedding = LinearEmbedding(embedding_size=3).fit(data_num) 58 | data_test = pd.DataFrame( 59 | { 60 | "num_a": [1, 0, 0.5, 0, -1], 61 | "num_b": [1, 0.5, 0, 0, -1], 62 | } 63 | ) 64 | weight = embedding.embedding.weight 65 | output = embedding(torch.from_numpy(data_test.values)).to(dtype=weight.dtype) 66 | assert weight.shape == (2, 3) 67 | assert output.shape == (5, 2, 3) 68 | # test returned vectors vs weight matrix 69 | assert torch.all(output[0] == weight).item() 70 | assert torch.all(output[1, 0] == 0).item() 71 | assert torch.allclose(output[1, 1], 0.5 * weight[1]) 72 | assert torch.allclose(output[2, 0], 0.5 * weight[0]) 73 | assert torch.all(output[2, 1] == 0).item() 74 | assert torch.all(output[3] == 0).item() 75 | assert torch.all(output[4] == -weight).item() 76 | 77 | 78 | def test_linearembedding_with_dataloader(): 79 | data_num = torch.from_numpy(example_data()[["num_a", "num_b"]].values) 80 | dataset = SimpleDataset(data_num) 81 | dataloader = DataLoader(dataset, batch_size=5) 82 | embedding = LinearEmbedding(embedding_size=3).fit(dataloader) 83 | data_test = pd.DataFrame( 84 | { 85 | "num_a": [1, 0, 0.5, 0, -1], 86 | "num_b": [1, 0.5, 0, 0, -1], 87 | } 88 | ) 89 | weight = embedding.embedding.weight 90 | output = embedding(torch.from_numpy(data_test.values)).to(dtype=weight.dtype) 91 | assert weight.shape == (2, 3) 92 | assert output.shape == (5, 2, 3) 93 | # test returned vectors vs weight matrix 94 | assert torch.all(output[0] == weight).item() 95 | assert torch.all(output[1, 0] == 0).item() 96 | assert torch.allclose(output[1, 1], 0.5 * weight[1]) 97 | assert torch.allclose(output[2, 0], 0.5 * weight[0]) 98 | assert torch.all(output[2, 1] == 0).item() 99 | assert torch.all(output[3] == 0).item() 100 | assert torch.all(output[4] == -weight).item() 101 | 102 | 103 | def test_that_linearembedding_learns(): 104 | X = torch.rand((100, 10)) 105 | y = torch.rand((100, 3)) 106 | embedding = LinearEmbedding(embedding_size=3).fit(X) 107 | model = nn.Sequential(embedding, Reshape(), nn.Linear(30, 3)) 108 | loss_func = nn.MSELoss() 109 | optimizer = torch.optim.Adam(embedding.parameters(), lr=1e-1) # not model.parameters() 110 | wt_before = torch.clone(embedding.embedding.weight) 111 | loss_vals = simple_train_loop(model, X, y, loss_func, optimizer, num_epochs=5) 112 | assert torch.all(embedding.embedding.weight != wt_before).item() 113 | assert loss_vals[0] > loss_vals[-1] 114 | 115 | 116 | def test_that_denseembedding_must_be_fit(): 117 | embedding = DenseEmbedding(embedding_size=(2, 2)) 118 | data_test = pd.DataFrame( 119 | { 120 | "num_a": [1, 0, 0.5, 0, -1], 121 | "num_b": [1, 0.5, 0, 0, -1], 122 | } 123 | ) 124 | msg = "need to call `fit` or `from_summary` first" 125 | with pytest.raises(RuntimeError, match=msg): 126 | embedding(data_test.values) 127 | 128 | 129 | def test_denseembedding_repr(): 130 | embedding = DenseEmbedding(embedding_size=(2, 2)) 131 | assert repr(embedding) == "DenseEmbedding((2, 2), LeakyReLU, 'cpu')" 132 | embedding = 
DenseEmbedding(activation=nn.ReLU) 133 | assert repr(embedding) == "DenseEmbedding((1, 10), ReLU, 'cpu')" 134 | 135 | 136 | def test_denseembedding_with_pandas_example(): 137 | data_num = example_data()[["num_a", "num_b"]] 138 | embedding = DenseEmbedding(embedding_size=3, activation=nn.ReLU).fit(data_num) 139 | data_test = pd.DataFrame( 140 | { 141 | "num_a": [1, 0, 0.5, 0.0, -1], 142 | "num_b": [0, 1, 0.0, 0.5, -1], 143 | } 144 | ) 145 | emb_w = embedding.embedding_w 146 | emb_b = embedding.embedding_b 147 | output = embedding(torch.from_numpy(data_test.values)).to(dtype=emb_w.dtype) 148 | assert emb_w.shape == (2, 1, 3) 149 | assert emb_b.shape == (1, 3) 150 | assert output.shape == (5, 1, 3) 151 | ## test returned vectors vs weight matrix 152 | identity_relu = emb_b + torch.where( 153 | emb_w > 0, emb_w, torch.zeros(emb_w.shape, dtype=emb_w.dtype) 154 | ) 155 | assert torch.allclose(output[:2], identity_relu) 156 | 157 | 158 | def test_denseembedding_with_tensor_example(): 159 | data_num = torch.from_numpy(example_data()[["num_a", "num_b"]].values) 160 | embedding = DenseEmbedding(embedding_size=(2, 3), activation=nn.ReLU).fit(data_num) 161 | data_test = pd.DataFrame( 162 | { 163 | "num_a": [1, 0, 0.5, 0.0, -1], 164 | "num_b": [0, 1, 0.0, 0.5, -1], 165 | } 166 | ) 167 | emb_w = embedding.embedding_w 168 | emb_b = embedding.embedding_b 169 | output = embedding(torch.from_numpy(data_test.values)).to(dtype=emb_w.dtype) 170 | assert emb_w.shape == (2, 2, 3) 171 | assert emb_b.shape == (2, 3) 172 | assert output.shape == (5, 2, 3) 173 | ## test returned vectors vs weight matrix 174 | identity_relu = emb_b + torch.where( 175 | emb_w > 0, emb_w, torch.zeros(emb_w.shape, dtype=emb_w.dtype) 176 | ) 177 | assert torch.allclose(output[:2], identity_relu) 178 | 179 | 180 | def test_denseembedding_with_dataloader(): 181 | data_num = torch.from_numpy(example_data()[["num_a", "num_b"]].values) 182 | dataset = SimpleDataset(data_num) 183 | dataloader = DataLoader(dataset, batch_size=5) 184 | embedding = DenseEmbedding(embedding_size=(1, 3), activation=nn.ReLU).fit(dataloader) 185 | data_test = pd.DataFrame( 186 | { 187 | "num_a": [1, 0, 0.5, 0.0, -1], 188 | "num_b": [0, 1, 0.0, 0.5, -1], 189 | } 190 | ) 191 | emb_w = embedding.embedding_w 192 | emb_b = embedding.embedding_b 193 | output = embedding(torch.from_numpy(data_test.values)).to(dtype=emb_w.dtype) 194 | assert emb_w.shape == (2, 1, 3) 195 | assert emb_b.shape == (1, 3) 196 | assert output.shape == (5, 1, 3) 197 | ## test returned vectors vs weight matrix 198 | identity_relu = emb_b + torch.where( 199 | emb_w > 0, emb_w, torch.zeros(emb_w.shape, dtype=emb_w.dtype) 200 | ) 201 | assert torch.allclose(output[:2], identity_relu) 202 | 203 | 204 | def test_that_denseembedding_learns(): 205 | X = torch.rand((100, 10)) 206 | y = torch.rand((100, 3)) 207 | embedding = DenseEmbedding(embedding_size=(10, 3)).fit(X) 208 | model = nn.Sequential(embedding, Reshape(), nn.Linear(30, 3)) 209 | loss_func = nn.MSELoss() 210 | optimizer = torch.optim.Adam(embedding.parameters(), lr=1e-1) # not model.parameters() 211 | wt_before = torch.clone(embedding.embedding_w) 212 | loss_vals = simple_train_loop(model, X, y, loss_func, optimizer, num_epochs=5) 213 | assert torch.all(embedding.embedding_w != wt_before).item() 214 | assert loss_vals[0] > loss_vals[-1] 215 | -------------------------------------------------------------------------------- /xynn/mlp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for MLP 
(multi-layer perceptron) and related modules 3 | 4 | """ 5 | 6 | from math import ceil 7 | from typing import Union, Tuple, List, Type, Optional 8 | 9 | import torch 10 | from torch import Tensor 11 | from torch import nn 12 | 13 | from .ghost_norm import GhostBatchNorm 14 | 15 | 16 | class LeakyGate(nn.Module): 17 | """ 18 | This performs an element-wise linear transformation followed by a chosen 19 | activation; the default activation is nn.LeakyReLU. Fields may be 20 | represented by individual values or vectors of values (i.e., embedded). 21 | 22 | Input needs to be shaped like (num_rows, num_fields) or 23 | (num_rows, num_fields, embedding_size) 24 | 25 | """ 26 | 27 | def __init__( 28 | self, 29 | input_size: int, 30 | bias: bool = True, 31 | activation: Type[nn.Module] = nn.LeakyReLU, 32 | device: Union[str, torch.device] = "cpu", 33 | ): 34 | """ 35 | Parameters 36 | ---------- 37 | input_size : int 38 | bias : boolean, optional 39 | whether to include an additive bias; default is True 40 | activation : torch.nn.Module, optional 41 | default is nn.LeakyReLU 42 | device : string or torch.device, optional 43 | default is "cpu" 44 | 45 | """ 46 | super().__init__() 47 | self.weight = nn.Parameter(torch.normal(mean=0, std=1.0, size=(1, input_size))) 48 | self.bias = nn.Parameter(torch.zeros(size=(1, input_size)), requires_grad=bias) 49 | self.activation = activation() 50 | self.to(device) 51 | 52 | def forward(self, X: Tensor) -> Tensor: 53 | """ 54 | Transform the input tensor 55 | 56 | Parameters 57 | ---------- 58 | X : torch.Tensor 59 | 60 | Return 61 | ------ 62 | torch.Tensor 63 | 64 | """ 65 | out = X 66 | if len(X.shape) > 2: 67 | out = out.reshape((X.shape[0], -1)) 68 | out = out * self.weight + self.bias 69 | if len(X.shape) > 2: 70 | out = out.reshape(X.shape) 71 | out = self.activation(out) 72 | return out 73 | 74 | 75 | class MLP(nn.Module): 76 | """ 77 | A "multi-layer perceptron". This forms layers of fully-connected linear 78 | transformations, with optional batch norm, dropout, and an initial 79 | "leaky gate".
80 | 81 | Input should be shaped like (num_rows, num_fields) 82 | 83 | """ 84 | 85 | def __init__( 86 | self, 87 | task: str, 88 | input_size: int, 89 | hidden_sizes: Union[int, Tuple[int, ...], List[int]], 90 | output_size: int, 91 | activation: Type[nn.Module] = nn.LeakyReLU, 92 | dropout: Union[float, Tuple[float, ...], List[float]] = 0.0, 93 | dropout_first: bool = False, 94 | use_bn: bool = True, 95 | bn_momentum: float = 0.1, 96 | ghost_batch: Optional[int] = None, 97 | leaky_gate: bool = True, 98 | use_skip: bool = True, 99 | weighted_sum: bool = True, 100 | device: Union[str, torch.device] = "cpu", 101 | ): 102 | """ 103 | Parameters 104 | ---------- 105 | task : {"regression", "classification"} 106 | input_size : int 107 | the number of inputs into the first layer 108 | hidden_sizes : iterable of int 109 | intermediate sizes between `input_size` and `output_size` 110 | output_size : int 111 | the number of outputs from the last layer 112 | activation : subclass of torch.nn.Module (uninitialized), optional 113 | default is nn.LeakyReLU 114 | dropout : float or iterable of float 115 | should be between 0.0 and 1.0; if iterable of float, there 116 | should be one value for each hidden size, plus an additional 117 | value if `dropout_first` is True 118 | dropout_first : boolean, optional 119 | whether to include dropout before the first fully-connected 120 | linear layer (and after "leaky_gate", if using); 121 | default is False 122 | use_bn : boolean, optional 123 | whether to use batch normalization; default is True 124 | bn_momentum : float, optional 125 | default is 0.1 126 | ghost_batch : int or None, optional 127 | only used if `use_bn` is True; size of batch in "ghost batch norm"; 128 | if None, normal batch norm is used; default is None 129 | leaky_gate : boolean, optional 130 | whether to include a LeakyGate layer before the linear layers; 131 | default is True 132 | use_skip : boolean, optional 133 | use a side path containing just the optional leaky gate plus 134 | a single linear layer; default is True 135 | weighted_sum : boolean, optional 136 | only used with use_skip; when combining the main MLP output with the side 137 | "skip" output, use a weighted sum with a learnable weight; default is True 138 | device : string or torch.device, optional 139 | default is "cpu" 140 | 141 | """ 142 | super().__init__() 143 | 144 | if isinstance(hidden_sizes, int): 145 | hidden_sizes = [hidden_sizes] 146 | 147 | dropout_len = len(hidden_sizes) + (1 if dropout_first else 0) 148 | 149 | if isinstance(dropout, float): 150 | dropout = [dropout] * dropout_len 151 | elif not len(dropout) == dropout_len: 152 | raise ValueError( 153 | f"expected a single dropout value or {dropout_len} values " 154 | f"({'one more than' if dropout_first else 'same as'} hidden_sizes)" 155 | ) 156 | 157 | main_layers: List[nn.Module] = [] 158 | 159 | if leaky_gate: 160 | main_layers.append(LeakyGate(input_size)) 161 | 162 | if dropout_first and dropout[0] > 0: 163 | main_layers.append(nn.Dropout(dropout[0])) 164 | dropout = dropout[1:] 165 | 166 | input_size_i = input_size 167 | for hidden_size_i, dropout_i in zip(hidden_sizes, dropout): 168 | main_layers.append(nn.Linear(input_size_i, hidden_size_i, bias=(not use_bn))) 169 | if use_bn: 170 | if ghost_batch is None: 171 | bnlayer = nn.BatchNorm1d(hidden_size_i, momentum=bn_momentum) 172 | else: 173 | bnlayer = GhostBatchNorm( 174 | hidden_size_i, ghost_batch, momentum=bn_momentum 175 | ) 176 | main_layers.append(bnlayer) 177 | main_layers.append(activation()) 178 | if dropout_i > 0:
179 | main_layers.append(nn.Dropout(dropout_i)) 180 | input_size_i = hidden_size_i 181 | 182 | main_layers.append( 183 | nn.Linear(input_size_i, output_size, bias=(task != "classification")) 184 | ) 185 | 186 | self.main_layers = nn.Sequential(*main_layers) 187 | 188 | self.use_skip = use_skip 189 | if use_skip: 190 | skip_linear = nn.Linear(input_size, output_size, bias=(task != "classification")) 191 | if leaky_gate: 192 | self.skip_layers = nn.Sequential(LeakyGate(input_size), skip_linear) 193 | else: 194 | self.skip_layers = skip_linear 195 | if weighted_sum: 196 | self.mix = nn.Parameter(torch.tensor([0.0])) 197 | else: 198 | self.mix = torch.tensor([0.0], device=device) 199 | else: 200 | self.skip_layers = None 201 | self.mix = None 202 | 203 | self.to(device) 204 | 205 | def weight_sum(self) -> Tuple[Tensor, Tensor]: 206 | """ 207 | Sum of absolute value and squared weights, for regularization 208 | 209 | Return 210 | ------ 211 | w1 : torch.Tensor 212 | sum of absolute value of weights 213 | w2 : torch.Tensor 214 | sum of squared weights 215 | 216 | """ 217 | w1_sum = 0.0 218 | w2_sum = 0.0 219 | for layer_group in (self.main_layers, self.skip_layers): 220 | if layer_group is None: 221 | continue 222 | for layer in (layer_group if isinstance(layer_group, nn.Sequential) else [layer_group]): 223 | if not isinstance(layer, nn.Linear): 224 | continue 225 | w1_sum += layer.weight.abs().sum() 226 | w2_sum += (layer.weight ** 2).sum() 227 | return w1_sum, w2_sum 228 | 229 | def forward(self, X: Tensor) -> Tensor: 230 | """ 231 | Transform the input tensor 232 | 233 | Parameters 234 | ---------- 235 | X : torch.Tensor 236 | 237 | Return 238 | ------ 239 | torch.Tensor 240 | 241 | """ 242 | out = self.main_layers(X) 243 | if self.use_skip: 244 | mix = torch.sigmoid(self.mix) 245 | skip_out = self.skip_layers(X) 246 | out = mix * skip_out + (1 - mix) * out 247 | return out 248 | -------------------------------------------------------------------------------- /tests/test_autoint/test_modules.py: -------------------------------------------------------------------------------- 1 | 2 | import random 3 | 4 | import torch 5 | from torch import nn 6 | import numpy as np 7 | import pytest 8 | 9 | from xynn.base_classes.estimators import _set_seed 10 | from xynn.base_classes.modules import BaseNN 11 | from xynn.autoint.modules import AttnInteractionLayer, AttnInteractionBlock 12 | from xynn.autoint import AutoInt 13 | from xynn.embedding import LinearEmbedding, BasicEmbedding 14 | from xynn.mlp import LeakyGate, GhostBatchNorm 15 | 16 | from ..common import simple_train_inputs, simple_model_train_loop 17 | 18 | 19 | def test_attnlayer_basic_initialization(): 20 | attn = AttnInteractionLayer( 21 | field_input_size=5, 22 | use_residual=False, 23 | dropout=0.0, 24 | normalize=False, 25 | ) 26 | assert attn.W_q.shape == (5, 8, 2) 27 | assert attn.W_k.shape == (5, 8, 2) 28 | assert attn.W_v.shape == (5, 8, 2) 29 | assert attn.W_r is None 30 | assert isinstance(attn.w_act, nn.Identity) 31 | assert isinstance(attn.dropout, nn.Identity) 32 | assert isinstance(attn.layer_norm, nn.Identity) 33 | 34 | 35 | def test_attnlayer_intitialization_with_more_options(): 36 | attn = AttnInteractionLayer( 37 | field_input_size=5, 38 | field_output_size=10, 39 | activation=nn.ReLU, 40 | ) 41 | assert attn.W_q.shape == (5, 10, 2) 42 | assert attn.W_k.shape == (5, 10, 2) 43 | assert attn.W_v.shape == (5, 10, 2) 44 | assert attn.W_r.shape == (5, 20) 45 | assert isinstance(attn.w_act, nn.ReLU) 46 | assert isinstance(attn.dropout, nn.Dropout) 47 | assert isinstance(attn.layer_norm,
nn.LayerNorm) 48 | 49 | 50 | def test_attnlayer_output_shape(): 51 | x = torch.tensor([[[1, 0]]], dtype=torch.float) 52 | attn = AttnInteractionLayer(field_input_size=2, field_output_size=3) 53 | out = attn(x) 54 | assert out.shape == (1, 1, 6) 55 | 56 | 57 | def test_that_autoint_module_subclasses_basenn(): 58 | assert issubclass(AutoInt, BaseNN) 59 | 60 | 61 | def test_that_autoint_uses_basenn_init(): 62 | X = torch.randint(0, 10, (100, 10)) 63 | embedding_num = LinearEmbedding(embedding_size=3).fit(X) 64 | 65 | model = AutoInt( 66 | task="classification", 67 | output_size=3, 68 | embedding_num=embedding_num, 69 | embedding_cat=None, 70 | embedding_l1_reg=0.1, 71 | mlp_l2_reg=0.2, 72 | ) 73 | 74 | assert model.task == "classification" 75 | assert model.num_epochs == 0 76 | assert isinstance(model.loss_fn, nn.CrossEntropyLoss) 77 | assert model.embedding_num is embedding_num 78 | assert model.embedding_cat is None 79 | assert model.embedding_l1_reg == 0.1 80 | assert model.embedding_l2_reg == 0.0 81 | assert model.mlp_l1_reg == 0.0 82 | assert model.mlp_l2_reg == 0.2 83 | assert model.optimizer is None 84 | assert model.optimizer_info == {} 85 | assert model.scheduler == {} 86 | assert model._device == "cpu" 87 | 88 | 89 | def test_that_autoint_parameters_are_passed_to_submodules(): 90 | X = torch.randint(0, 10, (100, 10)) 91 | embedding_cat = BasicEmbedding(embedding_size=3).fit(X) 92 | model = AutoInt( 93 | task="classification", 94 | output_size=3, 95 | embedding_num=None, 96 | embedding_cat=embedding_cat, 97 | mlp_activation=nn.ReLU, 98 | mlp_hidden_sizes=(512, 128, 32), 99 | mlp_use_bn=False, 100 | mlp_use_skip=False, 101 | use_leaky_gate=False, 102 | ) 103 | 104 | expected_classes = [ 105 | nn.Linear, 106 | nn.ReLU, 107 | nn.Linear, 108 | nn.ReLU, 109 | nn.Linear, 110 | nn.ReLU, 111 | nn.Linear, 112 | ] 113 | for mlp in (model.mlp, model.attn_final): 114 | assert len(mlp.main_layers) == len(expected_classes) 115 | for layer, expected_class in zip(mlp.main_layers, expected_classes): 116 | assert isinstance(layer, expected_class) 117 | assert mlp.skip_layers is None 118 | 119 | assert isinstance(model.attn_interact, AttnInteractionBlock) 120 | assert len(model.attn_interact.layers) == 3 121 | assert model.mix.requires_grad 122 | 123 | 124 | def test_that_autoint_parameters_are_passed_to_submodules_other_params(): 125 | X = torch.randint(0, 10, (100, 10)) 126 | embedding_cat = BasicEmbedding(embedding_size=3).fit(X) 127 | model = AutoInt( 128 | task="classification", 129 | output_size=3, 130 | embedding_num=None, 131 | embedding_cat=embedding_cat, 132 | attn_num_layers=2, 133 | mlp_ghost_batch=8, 134 | mlp_use_skip=True, 135 | ) 136 | 137 | expected_classes = [ 138 | LeakyGate, 139 | nn.Linear, 140 | GhostBatchNorm, 141 | nn.LeakyReLU, 142 | nn.Linear, 143 | GhostBatchNorm, 144 | nn.LeakyReLU, 145 | nn.Linear, 146 | GhostBatchNorm, 147 | nn.LeakyReLU, 148 | nn.Linear, 149 | GhostBatchNorm, 150 | nn.LeakyReLU, 151 | nn.Linear, 152 | ] 153 | for mlp in (model.mlp, model.attn_final): 154 | assert len(mlp.main_layers) == len(expected_classes) 155 | for layer, expected_class in zip(mlp.main_layers, expected_classes): 156 | assert isinstance(layer, expected_class) 157 | assert mlp.skip_layers is not None 158 | 159 | assert isinstance(model.attn_interact, AttnInteractionBlock) 160 | assert len(model.attn_interact.layers) == 2 161 | assert model.mix.requires_grad 162 | 163 | 164 | def test_that_autoint_diagram_exists_and_prints_something(capsys): 165 | AutoInt.diagram() 166 | captured = 
capsys.readouterr() 167 | assert len(captured.out.split("\n")) > 5 168 | 169 | 170 | def test_autoint_mlp_weight(): 171 | X = torch.randint(0, 10, (100, 10)) 172 | embedding_num = LinearEmbedding(embedding_size=3).fit(X) 173 | 174 | # without Linear after CIN 175 | model = AutoInt( 176 | task="regression", 177 | output_size=1, 178 | embedding_num=embedding_num, 179 | embedding_cat=None, 180 | attn_use_mlp=False, 181 | mlp_use_bn=False, 182 | mlp_use_skip=False, 183 | use_leaky_gate=False, 184 | ) 185 | 186 | exp_w1 = 0 187 | exp_w2 = 0 188 | for mlp in (model.mlp, model.attn_final): 189 | exp_w1 += sum(l.weight.abs().sum().item() for l in mlp.main_layers[::2]) 190 | exp_w2 += sum((l.weight ** 2).sum().item() for l in mlp.main_layers[::2]) 191 | 192 | w1, w2 = model.mlp_weight_sum() 193 | assert np.isclose(w1.item(), exp_w1) 194 | assert np.isclose(w2.item(), exp_w2) 195 | 196 | # with MLP after CIN 197 | model = AutoInt( 198 | task="regression", 199 | output_size=1, 200 | embedding_num=embedding_num, 201 | embedding_cat=None, 202 | attn_use_mlp=True, 203 | mlp_use_bn=False, 204 | mlp_use_skip=False, 205 | use_leaky_gate=False, 206 | ) 207 | 208 | exp_w1 = 0 209 | exp_w2 = 0 210 | for mlp in (model.mlp, model.attn_final): 211 | exp_w1 += sum(l.weight.abs().sum().item() for l in mlp.main_layers[::2]) 212 | exp_w2 += sum((l.weight ** 2).sum().item() for l in mlp.main_layers[::2]) 213 | 214 | w1, w2 = model.mlp_weight_sum() 215 | assert np.isclose(w1.item(), exp_w1) 216 | assert np.isclose(w2.item(), exp_w2) 217 | 218 | 219 | def test_that_autoint_learns(): 220 | _set_seed(10101) 221 | 222 | X_num = torch.randint(0, 10, (100, 10)) 223 | X_cat = torch.randint(0, 5, (100, 1)) 224 | y = ( 225 | (X_cat - 2) + X_num[:, ::2].sum(dim=1) - X_num[:, 1::2].sum(dim=1) 226 | ).to(dtype=torch.float) 227 | 228 | model = AutoInt( 229 | task="regression", 230 | output_size=1, 231 | embedding_num=LinearEmbedding(embedding_size=3).fit(X_num), 232 | embedding_cat=BasicEmbedding(embedding_size=3).fit(X_cat), 233 | mlp_hidden_sizes=[10, 8, 6], 234 | ) 235 | 236 | loss_func = nn.MSELoss() 237 | optimizer = torch.optim.Adam(model.parameters(), lr=1e-1) 238 | loss_vals = simple_model_train_loop( 239 | model, X_num, X_cat, y, loss_func, optimizer, num_epochs=5 240 | ) 241 | 242 | assert loss_vals[0] > loss_vals[-1] 243 | 244 | 245 | def test_that_autoint_learns_with_other_params(): 246 | _set_seed(10101) 247 | 248 | X_num = torch.randint(0, 10, (100, 10)) 249 | X_cat = torch.randint(0, 5, (100, 1)) 250 | y = ( 251 | (X_cat - 2) + X_num[:, ::2].sum(dim=1) - X_num[:, 1::2].sum(dim=1) 252 | ).to(dtype=torch.float) 253 | 254 | model = AutoInt( 255 | task="regression", 256 | output_size=1, 257 | embedding_num=LinearEmbedding(embedding_size=3).fit(X_num), 258 | embedding_cat=BasicEmbedding(embedding_size=3).fit(X_cat), 259 | mlp_hidden_sizes=[], 260 | ) 261 | 262 | loss_func = nn.MSELoss() 263 | optimizer = torch.optim.Adam(model.parameters(), lr=1e-1) 264 | loss_vals = simple_model_train_loop( 265 | model, X_num, X_cat, y, loss_func, optimizer, num_epochs=5 266 | ) 267 | 268 | assert loss_vals[0] > loss_vals[-1] 269 | -------------------------------------------------------------------------------- /xynn/pnn/estimators.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scikit-learn style classes for the PNN and PNNPlus models 3 | 4 | """ 5 | 6 | import textwrap 7 | from typing import Union, Callable, Optional, Type, List, Tuple 8 | 9 | import torch 10 | from torch import nn 11 | 
12 | from ..base_classes.estimators import BaseClassifier, BaseRegressor, ESTIMATOR_INIT_DOC 13 | from ..embedding import EmbeddingBase 14 | from .modules import PNN, PNNPlus 15 | 16 | 17 | INIT_DOC = ESTIMATOR_INIT_DOC.format( 18 | textwrap.dedent( 19 | """\ 20 | pnn_product_type : {"inner", "outer", "both"}, optional 21 | default is "outer" 22 | pnn_product_size : int, optional 23 | size of overall product output after transformation; i.e., after 24 | transformation, the batch size is num_rows x product_output_size; 25 | default is 10""" 26 | ) 27 | ) 28 | 29 | 30 | class PNNClassifier(BaseClassifier): 31 | """ 32 | Scikit-learn style classification model for the PNN model 33 | 34 | """ 35 | 36 | diagram = PNN.diagram 37 | 38 | def __init__( 39 | self, 40 | embedding_num: Optional[Union[str, EmbeddingBase]] = "auto", 41 | embedding_cat: Optional[Union[str, EmbeddingBase]] = "auto", 42 | embedding_l1_reg: float = 0.0, 43 | embedding_l2_reg: float = 0.0, 44 | pnn_product_type: str = "outer", 45 | pnn_product_size: int = 10, 46 | mlp_hidden_sizes: Union[int, Tuple[int, ...], List[int]] = (512, 256, 128, 64), 47 | mlp_activation: Type[nn.Module] = nn.LeakyReLU, 48 | mlp_use_bn: bool = True, 49 | mlp_bn_momentum: float = 0.1, 50 | mlp_ghost_batch: Optional[int] = None, 51 | mlp_dropout: float = 0.0, 52 | mlp_l1_reg: float = 0.0, 53 | mlp_l2_reg: float = 0.0, 54 | mlp_use_skip: bool = True, 55 | use_leaky_gate: bool = True, 56 | loss_fn: Union[str, Callable] = "auto", 57 | seed: Union[int, None] = None, 58 | device: Union[str, torch.device] = "cpu", 59 | ): 60 | super().__init__( 61 | embedding_num=embedding_num, 62 | embedding_cat=embedding_cat, 63 | embedding_l1_reg=embedding_l1_reg, 64 | embedding_l2_reg=embedding_l2_reg, 65 | pnn_product_type=pnn_product_type, 66 | pnn_product_size=pnn_product_size, 67 | mlp_hidden_sizes=mlp_hidden_sizes, 68 | mlp_activation=mlp_activation, 69 | mlp_use_bn=mlp_use_bn, 70 | mlp_bn_momentum=mlp_bn_momentum, 71 | mlp_ghost_batch=mlp_ghost_batch, 72 | mlp_dropout=mlp_dropout, 73 | mlp_l1_reg=mlp_l1_reg, 74 | mlp_l2_reg=mlp_l2_reg, 75 | mlp_use_skip=mlp_use_skip, 76 | use_leaky_gate=use_leaky_gate, 77 | loss_fn=loss_fn, 78 | seed=seed, 79 | device=device, 80 | ) 81 | self._model_class = PNN 82 | self._require_numeric_embedding = True 83 | 84 | __init__.__doc__ = INIT_DOC 85 | 86 | 87 | class PNNRegressor(BaseRegressor): 88 | """ 89 | Scikit-learn style regression model for the PNN model 90 | 91 | """ 92 | 93 | diagram = PNN.diagram 94 | 95 | def __init__( 96 | self, 97 | embedding_num: Optional[Union[str, EmbeddingBase]] = "auto", 98 | embedding_cat: Optional[Union[str, EmbeddingBase]] = "auto", 99 | embedding_l1_reg: float = 0.0, 100 | embedding_l2_reg: float = 0.0, 101 | pnn_product_type: str = "outer", 102 | pnn_product_size: int = 10, 103 | mlp_hidden_sizes: Union[int, Tuple[int, ...], List[int]] = (512, 256, 128, 64), 104 | mlp_activation: Type[nn.Module] = nn.LeakyReLU, 105 | mlp_use_bn: bool = True, 106 | mlp_bn_momentum: float = 0.1, 107 | mlp_ghost_batch: Optional[int] = None, 108 | mlp_dropout: float = 0.0, 109 | mlp_l1_reg: float = 0.0, 110 | mlp_l2_reg: float = 0.0, 111 | mlp_use_skip: bool = True, 112 | use_leaky_gate: bool = True, 113 | loss_fn: Union[str, Callable] = "auto", 114 | seed: Optional[int] = None, 115 | device: Union[str, torch.device] = "cpu", 116 | ): 117 | super().__init__( 118 | embedding_num=embedding_num, 119 | embedding_cat=embedding_cat, 120 | embedding_l1_reg=embedding_l1_reg, 121 | embedding_l2_reg=embedding_l2_reg, 122 | 
pnn_product_type=pnn_product_type, 123 | pnn_product_size=pnn_product_size, 124 | mlp_hidden_sizes=mlp_hidden_sizes, 125 | mlp_activation=mlp_activation, 126 | mlp_use_bn=mlp_use_bn, 127 | mlp_bn_momentum=mlp_bn_momentum, 128 | mlp_ghost_batch=mlp_ghost_batch, 129 | mlp_dropout=mlp_dropout, 130 | mlp_l1_reg=mlp_l1_reg, 131 | mlp_l2_reg=mlp_l2_reg, 132 | mlp_use_skip=mlp_use_skip, 133 | use_leaky_gate=use_leaky_gate, 134 | loss_fn=loss_fn, 135 | seed=seed, 136 | device=device, 137 | ) 138 | self._model_class = PNN 139 | self._require_numeric_embedding = True 140 | 141 | __init__.__doc__ = INIT_DOC 142 | 143 | 144 | class PNNPlusClassifier(BaseClassifier): 145 | """ 146 | Scikit-learn style classification model for the PNN-plus-MLP model 147 | 148 | """ 149 | 150 | diagram = PNNPlus.diagram 151 | 152 | def __init__( 153 | self, 154 | embedding_num: Optional[Union[str, EmbeddingBase]] = "auto", 155 | embedding_cat: Optional[Union[str, EmbeddingBase]] = "auto", 156 | embedding_l1_reg: float = 0.0, 157 | embedding_l2_reg: float = 0.0, 158 | pnn_product_type: str = "outer", 159 | pnn_product_size: int = 10, 160 | mlp_hidden_sizes: Union[int, Tuple[int, ...], List[int]] = (512, 256, 128, 64), 161 | mlp_activation: Type[nn.Module] = nn.LeakyReLU, 162 | mlp_use_bn: bool = True, 163 | mlp_bn_momentum: float = 0.1, 164 | mlp_ghost_batch: Optional[int] = None, 165 | mlp_dropout: float = 0.0, 166 | mlp_l1_reg: float = 0.0, 167 | mlp_l2_reg: float = 0.0, 168 | mlp_use_skip: bool = True, 169 | use_leaky_gate: bool = True, 170 | weighted_sum: bool = True, 171 | loss_fn: Union[str, Callable] = "auto", 172 | seed: Union[int, None] = None, 173 | device: Union[str, torch.device] = "cpu", 174 | ): 175 | super().__init__( 176 | embedding_num=embedding_num, 177 | embedding_cat=embedding_cat, 178 | embedding_l1_reg=embedding_l1_reg, 179 | embedding_l2_reg=embedding_l2_reg, 180 | pnn_product_type=pnn_product_type, 181 | pnn_product_size=pnn_product_size, 182 | mlp_hidden_sizes=mlp_hidden_sizes, 183 | mlp_activation=mlp_activation, 184 | mlp_use_bn=mlp_use_bn, 185 | mlp_bn_momentum=mlp_bn_momentum, 186 | mlp_ghost_batch=mlp_ghost_batch, 187 | mlp_dropout=mlp_dropout, 188 | mlp_l1_reg=mlp_l1_reg, 189 | mlp_l2_reg=mlp_l2_reg, 190 | mlp_use_skip=mlp_use_skip, 191 | use_leaky_gate=use_leaky_gate, 192 | weighted_sum=weighted_sum, 193 | loss_fn=loss_fn, 194 | seed=seed, 195 | device=device, 196 | ) 197 | self._model_class = PNNPlus 198 | self._require_numeric_embedding = True 199 | 200 | __init__.__doc__ = INIT_DOC 201 | 202 | 203 | class PNNPlusRegressor(BaseRegressor): 204 | """ 205 | Scikit-learn style regression model for the PNN-plus-MLP model 206 | 207 | """ 208 | 209 | diagram = PNNPlus.diagram 210 | 211 | def __init__( 212 | self, 213 | embedding_num: Optional[Union[str, EmbeddingBase]] = "auto", 214 | embedding_cat: Optional[Union[str, EmbeddingBase]] = "auto", 215 | embedding_l1_reg: float = 0.0, 216 | embedding_l2_reg: float = 0.0, 217 | pnn_product_type: str = "outer", 218 | pnn_product_size: int = 10, 219 | mlp_hidden_sizes: Union[int, Tuple[int, ...], List[int]] = (512, 256, 128, 64), 220 | mlp_activation: Type[nn.Module] = nn.LeakyReLU, 221 | mlp_use_bn: bool = True, 222 | mlp_bn_momentum: float = 0.1, 223 | mlp_ghost_batch: Optional[int] = None, 224 | mlp_dropout: float = 0.0, 225 | mlp_l1_reg: float = 0.0, 226 | mlp_l2_reg: float = 0.0, 227 | mlp_use_skip: bool = True, 228 | use_leaky_gate: bool = True, 229 | weighted_sum: bool = True, 230 | loss_fn: Union[str, Callable] = "auto", 231 | seed: 
Optional[int] = None, 232 | device: Union[str, torch.device] = "cpu", 233 | ): 234 | super().__init__( 235 | embedding_num=embedding_num, 236 | embedding_cat=embedding_cat, 237 | embedding_l1_reg=embedding_l1_reg, 238 | embedding_l2_reg=embedding_l2_reg, 239 | pnn_product_type=pnn_product_type, 240 | pnn_product_size=pnn_product_size, 241 | mlp_hidden_sizes=mlp_hidden_sizes, 242 | mlp_activation=mlp_activation, 243 | mlp_use_bn=mlp_use_bn, 244 | mlp_bn_momentum=mlp_bn_momentum, 245 | mlp_ghost_batch=mlp_ghost_batch, 246 | mlp_dropout=mlp_dropout, 247 | mlp_l1_reg=mlp_l1_reg, 248 | mlp_l2_reg=mlp_l2_reg, 249 | mlp_use_skip=mlp_use_skip, 250 | use_leaky_gate=use_leaky_gate, 251 | weighted_sum=weighted_sum, 252 | loss_fn=loss_fn, 253 | seed=seed, 254 | device=device, 255 | ) 256 | self._model_class = PNNPlus 257 | self._require_numeric_embedding = True 258 | 259 | __init__.__doc__ = INIT_DOC 260 | -------------------------------------------------------------------------------- /xynn/embedding/common.py: -------------------------------------------------------------------------------- 1 | """ 2 | The base classes and common functions for embeddings 3 | 4 | """ 5 | 6 | from abc import ABCMeta, abstractmethod 7 | from collections import defaultdict 8 | from typing import Any, Union, List, Dict, Iterable, Tuple 9 | 10 | import numpy as np 11 | import pandas as pd 12 | import torch 13 | from torch import Tensor 14 | from torch import nn 15 | from torch.utils.data import DataLoader 16 | 17 | from ..preprocessing import IntegerEncoder 18 | 19 | 20 | def _isnan(value: Any) -> bool: 21 | return isinstance(value, float) and np.isnan(value) 22 | 23 | 24 | def _isnan_index(series: pd.Series) -> np.ndarray: 25 | return np.array([_isnan(value) for value in series.index]) 26 | 27 | 28 | def _linear_agg(num_fields, empty_param, batch): 29 | if num_fields != 0: 30 | return num_fields, empty_param 31 | return batch.shape[1], empty_param 32 | 33 | 34 | def _unique( 35 | X: Union[Tensor, np.ndarray, pd.DataFrame] 36 | ) -> Tuple[List[Iterable], List[bool]]: 37 | if isinstance(X, pd.DataFrame): 38 | uniques = [X[col].unique() for col in X.columns] 39 | elif isinstance(X, np.ndarray): 40 | uniques = [pd.unique(X[:, i]) for i in range(X.shape[1])] 41 | elif isinstance(X, Tensor): 42 | uniques = [torch.unique(X[:, i]).numpy() for i in range(X.shape[1])] 43 | else: 44 | raise TypeError( 45 | "input should be Pandas DataFrame, NumPy array, or PyTorch Tensor" 46 | ) 47 | nan_chk = [np.array([_isnan(value) for value in group]) for group in uniques] 48 | has_nan = [np.any(check) for check in nan_chk] 49 | uniques = [group[~check] for group, check in zip(uniques, nan_chk)] 50 | return uniques, has_nan 51 | 52 | 53 | def _unique_agg(uniques, has_nan, batch): 54 | for row in batch: 55 | for colnum, value in enumerate(row): 56 | value = value.item() 57 | if colnum >= len(uniques): 58 | uniques.append(set()) 59 | has_nan.append(False) 60 | if _isnan(value): 61 | has_nan[colnum] = True 62 | else: 63 | uniques[colnum].add(value) 64 | return uniques, has_nan 65 | 66 | 67 | def _value_counts( 68 | X: Union[Tensor, np.ndarray, pd.DataFrame] 69 | ) -> Tuple[List[Dict[Any, int]], List[int]]: 70 | if isinstance(X, (np.ndarray, pd.DataFrame)): 71 | if isinstance(X, pd.DataFrame): 72 | counts = [ 73 | X[col].value_counts(dropna=False, ascending=True) for col in X.columns 74 | ] 75 | else: 76 | counts = [ 77 | pd.value_counts(X[:, i], dropna=False, ascending=True) 78 | for i in range(X.shape[1]) 79 | ] 80 | nan_check = 
[_isnan_index(count) for count in counts] 81 | nan_counts = [sum(count.loc[isnan]) for count, isnan in zip(counts, nan_check)] 82 | unique_counts = [ 83 | count.loc[~isnan].to_dict() for count, isnan in zip(counts, nan_check) 84 | ] 85 | elif isinstance(X, Tensor): 86 | counts = [ 87 | [values.numpy() for values in torch.unique(X[:, i], return_counts=True)] 88 | for i in range(X.shape[1]) 89 | ] 90 | nan_check = [np.array([_isnan(val) for val in values]) for values, _ in counts] 91 | nan_counts = [np.sum(check) for check in nan_check] 92 | unique_counts = [ 93 | dict(zip(vals[~check], counts[~check])) 94 | for (vals, counts), check in zip(counts, nan_check) 95 | ] 96 | else: 97 | raise TypeError( 98 | "input should be Pandas DataFrame, NumPy array, or PyTorch Tensor" 99 | ) 100 | return unique_counts, nan_counts 101 | 102 | 103 | def _value_counts_agg(unique_counts, nan_counts, batch): 104 | for row in batch: 105 | for colnum, value in enumerate(row): 106 | value = value.item() 107 | if colnum >= len(unique_counts): 108 | unique_counts.append(defaultdict(int)) 109 | nan_counts.append(0) 110 | if _isnan(value): 111 | nan_counts[colnum] += 1 112 | else: 113 | unique_counts[colnum][value] += 1 114 | return unique_counts, nan_counts 115 | 116 | 117 | def _flatten_counts(unique_counts: List[Dict[int, int]]) -> List[int]: 118 | counts = [ 119 | [count.get(i, 0) for i in range(max(count) + 1)] 120 | for count in unique_counts 121 | ] 122 | return counts 123 | 124 | 125 | class EmbeddingBase(nn.Module, metaclass=ABCMeta): 126 | """ 127 | Base class for embeddings 128 | 129 | """ 130 | 131 | def __init__(self): 132 | super().__init__() 133 | self._isfit = False 134 | 135 | @abstractmethod 136 | def _fit_array(self, X): 137 | return 138 | 139 | @abstractmethod 140 | def _fit_iterable(self, X): 141 | return 142 | 143 | def fit(self, X) -> "EmbeddingBase": 144 | """ 145 | Create the embedding from training data 146 | 147 | Parameters 148 | ---------- 149 | X : array-like or iterable of array-like 150 | should be a PyTorch Tensor, NumPy array, Pandas DataFrame 151 | or iterable of arrays/tensors (i.e., batches) 152 | 153 | Return 154 | ------ 155 | self 156 | 157 | """ 158 | if isinstance(X, (np.ndarray, Tensor, pd.DataFrame)): 159 | self._fit_array(X) 160 | elif isinstance(X, DataLoader): 161 | self._fit_iterable(X) 162 | else: 163 | raise TypeError( 164 | "input X must be a PyTorch Tensor, PyTorch DataLoader, " 165 | "NumPy array, or Pandas DataFrame" 166 | ) 167 | 168 | self._isfit = True 169 | 170 | return self 171 | 172 | 173 | class BasicBase(EmbeddingBase): 174 | """Base class for embeddings that do not have defaults""" 175 | 176 | @abstractmethod 177 | def from_summary(self, uniques, has_nan) -> "BasicBase": 178 | return self 179 | 180 | def _fit_array(self, X): 181 | uniques, has_nan = _unique(X) 182 | self.from_summary(uniques, has_nan) 183 | 184 | def _fit_iterable(self, X): 185 | uniques = [] 186 | has_nan = [] 187 | for batch in X: 188 | _unique_agg(uniques, has_nan, batch) 189 | self.from_summary(uniques, has_nan) 190 | 191 | 192 | class DefaultBase(EmbeddingBase): 193 | """Base class for embeddings that have a default embedding for each field""" 194 | 195 | @abstractmethod 196 | def from_summary(self, unique_counts, nan_counts) -> "DefaultBase": 197 | return self 198 | 199 | def _fit_array(self, X): 200 | unique_counts, nan_counts = _value_counts(X) 201 | self.from_summary(unique_counts, nan_counts) 202 | 203 | def _fit_iterable(self, X): 204 | unique_counts = [] 205 | nan_counts = [] 
206 | for batch in X: 207 | _value_counts_agg(unique_counts, nan_counts, batch) 208 | self.from_summary(unique_counts, nan_counts) 209 | 210 | 211 | class FastBasicBase(EmbeddingBase): 212 | """Base class for embeddings that do not have defaults""" 213 | 214 | @abstractmethod 215 | def from_summary(self, num_classes: List[int]) -> "FastBasicBase": 216 | return self 217 | 218 | def from_encoder(self, encoder: IntegerEncoder) -> "FastBasicBase": 219 | """ 220 | Initialize from a fit IntegerEncoder 221 | 222 | Parameters 223 | ---------- 224 | encoder : IntegerEncoder 225 | 226 | Return 227 | ------ 228 | self 229 | 230 | """ 231 | if not isinstance(encoder, IntegerEncoder): 232 | raise TypeError("encoder needs to be a fit IntegerEncoder") 233 | if not encoder._isfit: 234 | raise ValueError("encoder needs to be fit") 235 | return self.from_summary(encoder.num_classes) 236 | 237 | def _fit_array(self, X): 238 | uniques, has_nan = _unique(X) 239 | if any(has_nan): 240 | raise ValueError("NaN found in categorical data") 241 | self.from_summary([max(col_uniques) + 1 for col_uniques in uniques]) 242 | 243 | def _fit_iterable(self, X): 244 | uniques = [] 245 | has_nan = [] 246 | for batch in X: 247 | _unique_agg(uniques, has_nan, batch) 248 | if any(has_nan): 249 | raise ValueError("NaN found in categorical data") 250 | self.from_summary([max(col_uniques) + 1 for col_uniques in uniques]) 251 | 252 | 253 | class FastDefaultBase(EmbeddingBase): 254 | """Base class for embeddings that have a default embedding for each field""" 255 | 256 | @abstractmethod 257 | def from_summary(self, class_counts: List[List[int]]) -> "FastDefaultBase": 258 | return self 259 | 260 | def from_encoder(self, encoder: IntegerEncoder) -> "FastDefaultBase": 261 | """ 262 | Initialize from a fit IntegerEncoder 263 | 264 | Parameters 265 | ---------- 266 | encoder : IntegerEncoder 267 | 268 | Return 269 | ------ 270 | self 271 | 272 | """ 273 | if not isinstance(encoder, IntegerEncoder): 274 | raise TypeError("encoder needs to be a fit IntegerEncoder") 275 | if not encoder._isfit: 276 | raise ValueError("encoder needs to be fit") 277 | return self.from_summary(encoder.class_counts) 278 | 279 | def _fit_array(self, X): 280 | unique_counts, nan_counts = _value_counts(X) 281 | if any(nan_counts): 282 | raise ValueError("NaN found in categorical data") 283 | counts = _flatten_counts(unique_counts) 284 | self.from_summary(counts) 285 | 286 | def _fit_iterable(self, X): 287 | unique_counts = [] 288 | nan_counts = [] 289 | for batch in X: 290 | _value_counts_agg(unique_counts, nan_counts, batch) 291 | if any(nan_counts): 292 | raise ValueError("NaN found in categorical data") 293 | counts = _flatten_counts(unique_counts) 294 | self.from_summary(counts) 295 | -------------------------------------------------------------------------------- /xynn/embedding/uniform/categorical.py: -------------------------------------------------------------------------------- 1 | """ 2 | Classes for embedding categorical fields 3 | 4 | BasicEmbedding 5 | - embed each value with single vector 6 | DefaultEmbedding 7 | - like BasicEmbedding, but include a "default" vector for each field 8 | - returned vector is a weighted combination between the value's own vector 9 | and the field's "default" vector 10 | - the weighting is based on the count of value in the training set; a higher 11 | count puts more weight the value's own vector 12 | - values not seen in the training data are embedded with the default vector 13 | 14 | """ 15 | 16 | from typing import Any, 
Union, List, Dict, Optional, Tuple 17 | 18 | import numpy as np 19 | import torch 20 | from torch import Tensor 21 | from torch import nn 22 | 23 | from ..common import _isnan, BasicBase, DefaultBase 24 | from .base import UniformBase 25 | 26 | 27 | class BasicEmbedding(UniformBase, BasicBase): 28 | """ 29 | A basic embedding that creates an embedded vector for each field value. 30 | 31 | """ 32 | 33 | def __init__(self, embedding_size: int = 10, device: Union[str, torch.device] = "cpu"): 34 | """ 35 | Parameters 36 | ---------- 37 | embedding_size : int, optional 38 | size of each value's embedding vector; default is 10 39 | device : string or torch.device 40 | 41 | """ 42 | super().__init__() 43 | self.num_fields = 0 44 | self.output_size = 0 45 | self.lookup: Dict[Tuple[int, Any], int] = {} 46 | self.lookup_nan: Dict[int, int] = {} 47 | self.num_values = 0 48 | self.embedding: Optional[nn.Embedding] = None 49 | self.embedding_size = embedding_size 50 | self._device = device 51 | self.to(device) 52 | self._isfit = False 53 | 54 | def __repr__(self): 55 | return f"BasicEmbedding({repr(self.embedding_size)}, {repr(self._device)})" 56 | 57 | def from_summary( 58 | self, uniques: List[Union[List, Tensor, np.ndarray]], has_nan: List[bool] 59 | ): 60 | """ 61 | Create the embedding from category values for each field 62 | 63 | Parameters 64 | ---------- 65 | uniques : list of array-like 66 | all possible category values for each field 67 | has_nan : list of boolean 68 | whether each field can have NaN 69 | 70 | Return 71 | ------ 72 | self 73 | 74 | """ 75 | if not len(uniques) == len(has_nan): 76 | raise ValueError( 77 | "length of uniques and has_nan should be equal, " 78 | f"got {len(uniques)}, {len(has_nan)}" 79 | ) 80 | 81 | lookup = {} 82 | lookup_nan = {} 83 | num_values = 0 84 | for fieldnum, (field, use_nan) in enumerate(zip(uniques, has_nan)): 85 | for value in field: 86 | if (fieldnum, value) in lookup: 87 | # extra defense against repeated values 88 | continue 89 | lookup[(fieldnum, value)] = num_values 90 | num_values += 1 91 | if use_nan: 92 | lookup_nan[fieldnum] = num_values 93 | num_values += 1 94 | 95 | self.num_fields = len(uniques) 96 | self.output_size = self.num_fields * self.embedding_size 97 | self.lookup = lookup 98 | self.lookup_nan = lookup_nan 99 | self.num_values = num_values 100 | self.embedding = nn.Embedding(num_values, self.embedding_size).to(device=self._device) 101 | nn.init.xavier_uniform_(self.embedding.weight) 102 | 103 | self._isfit = True 104 | 105 | return self 106 | 107 | def forward(self, X: Tensor) -> Tensor: 108 | """ 109 | Produce embedding for each value in input 110 | 111 | Parameters 112 | ---------- 113 | X : torch.Tensor 114 | 115 | Return 116 | ------ 117 | torch.Tensor 118 | 119 | """ 120 | if not self._isfit: 121 | raise RuntimeError("need to call `fit` or `from_summary` first") 122 | 123 | idxs: List[List[int]] = [] 124 | for row in X: 125 | idxs.append([]) 126 | for col, val in enumerate(row): 127 | val = val.item() 128 | if _isnan(val): 129 | idx = self.lookup_nan[col] 130 | else: 131 | idx = self.lookup[(col, val)] 132 | idxs[-1].append(idx) 133 | 134 | return self.embedding(torch.tensor(idxs, dtype=torch.int64, device=self._device)) 135 | 136 | 137 | class DefaultEmbedding(UniformBase, DefaultBase): 138 | """ 139 | An embedding with a default value for each field. The default is returned for 140 | any field value not seen when the embedding was initialized (using `fit` or 141 | `from_summary`). 
For any value seen at initialization, a weighted average of 142 | that value's embedding and the default embedding is returned. The weights for 143 | the average are determined by the parameter `alpha`: 144 | 145 | weight = count / (count + alpha) 146 | final = embedding * weight + default * (1 - weight) 147 | 148 | """ 149 | 150 | def __init__( 151 | self, 152 | embedding_size: int = 10, 153 | alpha: int = 20, 154 | device: Union[str, torch.device] = "cpu", 155 | ): 156 | """ 157 | Parameters 158 | ---------- 159 | embedding_size : int, optional 160 | size of each value's embedding vector; default is 10 161 | alpha : int, optional 162 | controls the weighting of each embedding vector with the default; 163 | when `alpha`-many values are seen at initialization; the final 164 | vector is evenly weighted; the influence of the default is decreased 165 | with either higher counts or lower `alpha`; default is 20 166 | device : string or torch.device 167 | 168 | """ 169 | super().__init__() 170 | self.num_fields = 0 171 | self.output_size = 0 172 | self.alpha = alpha 173 | self.lookup: Dict[Tuple[int, Any], Tuple[int, int]] = {} 174 | self.lookup_nan: Dict[int, Tuple[int, int]] = {} 175 | self.lookup_default: Dict[int, Tuple[int, int]] = {} 176 | self.num_values = 0 177 | self.embedding: Optional[nn.Embedding] = None 178 | self.embedding_size = embedding_size 179 | self._device = device 180 | self.to(device) 181 | self._isfit = False 182 | 183 | def __repr__(self): 184 | embed_size = self.embedding_size 185 | alpha = self.alpha 186 | device = repr(self._device) 187 | return f"DefaultEmbedding({embed_size}, {alpha}, {device})" 188 | 189 | def from_summary(self, unique_counts: List[Dict[Any, int]], nan_counts: List[int]): 190 | """ 191 | Create the embedding from known value counts for each field 192 | 193 | Parameters 194 | ---------- 195 | unique_counts : list of dicts 196 | each dict is a mapping from Python object to count of occurrences, 197 | one dict for each field 198 | nan_counts : list of int 199 | count of NaN occurrences for each field 200 | 201 | Return 202 | ------ 203 | self 204 | 205 | """ 206 | if not len(unique_counts) == len(nan_counts): 207 | raise ValueError( 208 | "length of unique_counts and nan_counts should be equal, " 209 | f"got {len(unique_counts)}, {len(nan_counts)}" 210 | ) 211 | 212 | lookup = {} 213 | lookup_nan = {} 214 | lookup_default = {} 215 | num_values = 0 216 | for fieldnum, (counts, nan_count) in enumerate(zip(unique_counts, nan_counts)): 217 | lookup_default[fieldnum] = (num_values, 0) 218 | num_values += 1 219 | for value, count in counts.items(): 220 | lookup[(fieldnum, value)] = (num_values, count) 221 | num_values += 1 222 | if nan_count: 223 | lookup_nan[fieldnum] = (num_values, nan_count) 224 | num_values += 1 225 | 226 | self.num_fields = len(unique_counts) 227 | self.output_size = self.num_fields * self.embedding_size 228 | self.lookup = lookup 229 | self.lookup_nan = lookup_nan 230 | self.lookup_default = lookup_default 231 | self.num_values = num_values 232 | self.embedding = nn.Embedding(num_values, self.embedding_size).to(device=self._device) 233 | nn.init.xavier_uniform_(self.embedding.weight) 234 | 235 | self._isfit = True 236 | 237 | return self 238 | 239 | def forward(self, X: Tensor) -> Tensor: 240 | """ 241 | Produce embedding for each value in input 242 | 243 | Parameters 244 | ---------- 245 | X : torch.Tensor 246 | 247 | Return 248 | ------ 249 | torch.Tensor 250 | 251 | """ 252 | if not self._isfit: 253 | raise RuntimeError("need to 
call `fit` or `from_summary` first") 254 | 255 | list_weights: List[List[List[float]]] = [] 256 | idxs_primary: List[List[int]] = [] 257 | idxs_default: List[List[int]] = [] 258 | for row in X: 259 | list_weights.append([]) 260 | idxs_primary.append([]) 261 | idxs_default.append([]) 262 | for col, val in enumerate(row): 263 | val = val.item() 264 | default = self.lookup_default[col] 265 | if _isnan(val): 266 | idx, count = self.lookup_nan.get(col, default) 267 | else: 268 | idx, count = self.lookup.get((col, val), default) 269 | list_weights[-1].append([count / (count + self.alpha)]) 270 | idxs_primary[-1].append(idx) 271 | idxs_default[-1].append(default[0]) 272 | tsr_weights = torch.tensor(list_weights, dtype=torch.float32, device=self._device) 273 | emb_primary = self.embedding( 274 | torch.tensor(idxs_primary, dtype=torch.int64, device=self._device) 275 | ) 276 | emb_default = self.embedding( 277 | torch.tensor(idxs_default, dtype=torch.int64, device=self._device) 278 | ) 279 | return tsr_weights * emb_primary + (1 - tsr_weights) * emb_default 280 | -------------------------------------------------------------------------------- /xynn/embedding/ragged/fast_ragged.py: -------------------------------------------------------------------------------- 1 | """ 2 | Embeddings that allow embedding of multiple fields together and 3 | allow a different vector size for each field 4 | 5 | FastRaggedEmbedding 6 | - requires that each field's values are integers 0, 1, ... 7 | - embeds each value with a single vector 8 | - allows a different vector size for each field 9 | FastRaggedDefaultEmbedding 10 | - requires that each field's values are integers 0, 1, ... 11 | - like RaggedEmbedding, but includes a "default" vector for each field 12 | - returned vector is a weighted combination between the value's own vector 13 | and the field's "default" vector 14 | - the weighting is based on the count of the value in the training set; a higher 15 | count puts more weight on the value's own vector 16 | - values not seen in the training data are embedded with the default vector 17 | - allows a different vector size for each field 18 | 19 | """ 20 | 21 | from typing import Union, List, Optional, Iterable 22 | 23 | import torch 24 | from torch import Tensor 25 | from torch import nn 26 | 27 | from ..common import FastBasicBase, FastDefaultBase 28 | from .common import RaggedBase, _check_embedding_size, _parse_embedding_size 29 | 30 | 31 | class FastRaggedEmbedding(RaggedBase, FastBasicBase): 32 | """ 33 | Creates an embedded vector for each field value, with each field allowed 34 | a different size of embedding 35 | 36 | """ 37 | 38 | def __init__( 39 | self, 40 | embedding_size: Union[str, int, Iterable[int]] = "sqrt", 41 | max_size: int = 100, 42 | device: Union[str, torch.device] = "cpu", 43 | ): 44 | """ 45 | Parameters 46 | ---------- 47 | embedding_size : {"sqrt", "log", "fastai"} or iterable of int; optional 48 | - "sqrt": square root of number of classes in each field, rounded up 49 | - "log": log of number of classes in each field, rounded up 50 | - "fastai": `round(1.6 * num_classes**0.56)` 51 | if iterable of int, the number of values must match the number of 52 | fields when calling `fit`; the embedding size can also be 53 | passed in later with `fit` or `from_summary`; default is "sqrt" 54 | max_size : int, optional 55 | maximum embedding size if using "sqrt", "log", or "fastai"; 56 | default is 100 57 | device : string or torch.device, optional 58 | 59 | """ 60 | super().__init__() 61 | embedding_size =
_check_embedding_size(embedding_size) 62 | self.num_fields = 0 63 | self.output_size = 0 64 | self.num_classes: List[int] = [] 65 | self.embedding: Optional[nn.ModuleList] = None 66 | self.embedding_size_orig = embedding_size 67 | self.embedding_size = embedding_size 68 | self.max_size = max_size 69 | self._device = device 70 | self.to(device) 71 | self._isfit = False 72 | 73 | def __repr__(self): 74 | embed_size = repr(self.embedding_size_orig) 75 | max_size = self.max_size 76 | device = repr(self._device) 77 | return f"FastRaggedEmbedding({embed_size}, {max_size}, {device})" 78 | 79 | def from_summary(self, num_classes: List[int]) -> "FastRaggedEmbedding": 80 | """ 81 | Create the embedding from category values for each field 82 | 83 | Parameters 84 | ---------- 85 | num_classes : list of int 86 | number of category values for each field 87 | 88 | Return 89 | ------ 90 | self 91 | 92 | """ 93 | embedding_size = _parse_embedding_size( 94 | self.embedding_size, self.max_size, num_classes 95 | ) 96 | 97 | self.embedding = nn.ModuleList([]) 98 | for num_cats, size in zip(num_classes, embedding_size): 99 | embedding = nn.Embedding(num_cats, size).to(device=self._device) 100 | nn.init.xavier_uniform_(embedding.weight) 101 | self.embedding.append(embedding) 102 | 103 | self.num_fields = len(num_classes) 104 | self.output_size = sum(embedding_size) 105 | self.num_classes = num_classes 106 | self.embedding_size = embedding_size 107 | 108 | self._isfit = True 109 | 110 | return self 111 | 112 | def forward(self, X: Tensor) -> Tensor: 113 | """ 114 | Produce embedding for each value in input 115 | 116 | Parameters 117 | ---------- 118 | X : torch.Tensor 119 | 120 | Return 121 | ------ 122 | torch.Tensor 123 | 124 | """ 125 | if not self._isfit: 126 | raise RuntimeError("need to call `fit` or `from_summary` first") 127 | 128 | embedded = [ 129 | embedding(column).reshape((X.shape[0], -1)) 130 | for embedding, column in zip(self.embedding, X.split(1, dim=1)) 131 | ] 132 | 133 | return torch.cat(embedded, dim=1) 134 | 135 | 136 | class FastRaggedDefaultEmbedding(RaggedBase, FastDefaultBase): 137 | """ 138 | An embedding with a default value for each field and which allows a different 139 | embedding size for each field. The default is returned for any field value 140 | not seen when the embedding was initialized (using `fit` or `from_summary`). 141 | For any value seen at initialization, a weighted average of that value's 142 | embedding and the default embedding is returned. 
The weights for the average 143 | are determined by the parameter `alpha`: 144 | 145 | weight = count / (count + alpha) 146 | final = embedding * weight + default * (1 - weight) 147 | 148 | """ 149 | 150 | def __init__( 151 | self, 152 | embedding_size: Union[str, Iterable[int]] = "sqrt", 153 | max_size: int = 100, 154 | alpha: int = 20, 155 | device: Union[str, torch.device] = "cpu", 156 | ): 157 | """ 158 | Parameters 159 | ---------- 160 | embedding_size : {"sqrt", "log", "fastai"} or iterable of int; optional 161 | - "sqrt": square root of number of classes in each field, rounded up 162 | - "log": log of number of classes in each field, rounded up 163 | - "fastai": `round(1.6 * num_classes**0.56))` 164 | if iterable of int, the number of values must match the number of 165 | fields when calling `fit`; the embedding size can also be 166 | passed in later with `fit` or `from_summary`; default is "sqrt" 167 | max_size : int, optional 168 | maximum embedding size if using "sqrt", "log", or "fastai"; 169 | default is 100 170 | alpha : int, optional 171 | controls the weighting of each embedding vector with the default; 172 | when `alpha`-many values are seen at initialization; the final 173 | vector is evenly weighted; the influence of the default is decreased 174 | with either higher counts or lower `alpha`; default is 20 175 | device : string or torch.device 176 | 177 | """ 178 | super().__init__() 179 | embedding_size = _check_embedding_size(embedding_size) 180 | self.num_fields = 0 181 | self.output_size = 0 182 | self.alpha = alpha 183 | self.embedding: Optional[nn.ModuleList] = None 184 | self.embedding_size_orig = embedding_size 185 | self.embedding_size = embedding_size 186 | self.max_size = max_size 187 | self.num_classes: List[int] = [] 188 | self.max_values: Optional[Tensor] = None 189 | self.counts: List[Tensor] = [] 190 | self._device = device 191 | self.to(device) 192 | self._isfit = False 193 | 194 | def __repr__(self): 195 | embed_size = repr(self.embedding_size_orig) 196 | max_size = self.max_size 197 | alpha = self.alpha 198 | device = repr(self._device) 199 | return f"FastRaggedDefaultEmbedding({embed_size}, {max_size}, {alpha}, {device})" 200 | 201 | def from_summary(self, class_counts: List[List[int]]) -> "FastRaggedDefaultEmbedding": 202 | """ 203 | Create the embedding from known value counts for each field 204 | 205 | Parameters 206 | ---------- 207 | class_counts : list of list of int 208 | each sub-list has count of category occurrences, 209 | one sub-list for each field 210 | 211 | Return 212 | ------ 213 | self 214 | 215 | """ 216 | num_classes = [len(counts) for counts in class_counts] 217 | 218 | embedding_size = _parse_embedding_size( 219 | self.embedding_size, self.max_size, num_classes 220 | ) 221 | 222 | self.embedding = nn.ModuleList([]) 223 | for num_cls, size in zip(num_classes, embedding_size): 224 | embedding = nn.Embedding(num_cls + 1, size).to(device=self._device) 225 | nn.init.xavier_uniform_(embedding.weight) 226 | self.embedding.append(embedding) 227 | 228 | self.num_fields = len(class_counts) 229 | self.output_size = sum(embedding_size) 230 | self.num_classes = num_classes 231 | self.embedding_size = embedding_size 232 | self.max_values = torch.tensor([[x - 1 for x in num_classes]], device=self._device) 233 | self.counts = [ 234 | torch.tensor(counts, device=self._device) for counts in class_counts 235 | ] 236 | 237 | self._isfit = True 238 | 239 | return self 240 | 241 | def forward(self, X: Tensor) -> Tensor: 242 | """ 243 | Produce embedding 
for each value in input 244 | 245 | Parameters 246 | ---------- 247 | X : torch.Tensor 248 | 249 | Return 250 | ------ 251 | torch.Tensor 252 | 253 | """ 254 | if not self._isfit: 255 | raise RuntimeError("need to call `fit` or `from_summary` first") 256 | 257 | embedded = [] 258 | for embedding, counts, num_cls, X_col in zip( 259 | self.embedding, self.counts, self.num_classes, X.split(1, dim=1) 260 | ): 261 | unxpctd = (X_col >= num_cls) 262 | idx = torch.clone(X_col) 263 | idx[unxpctd] = 0 # block any unexpected categories 264 | 265 | counts = counts.expand(idx.shape[0], num_cls) 266 | counts = torch.gather(counts, dim=1, index=idx) 267 | weights = (counts / (counts + self.alpha)).unsqueeze(-1) 268 | weights[unxpctd] = 0 # block any unexpected categories 269 | 270 | primary = embedding(X_col) 271 | default = embedding.weight[num_cls:, :].unsqueeze(0) 272 | output = (weights * primary + (1 - weights) * default).reshape(X_col.shape[0], -1) 273 | embedded.append(output) 274 | 275 | return torch.cat(embedded, dim=1) 276 | -------------------------------------------------------------------------------- /tests/test_xdeepfm/test_modules.py: -------------------------------------------------------------------------------- 1 | 2 | import random 3 | 4 | import torch 5 | from torch import nn 6 | import numpy as np 7 | import pytest 8 | 9 | from xynn.base_classes.estimators import _set_seed 10 | from xynn.base_classes.modules import BaseNN 11 | from xynn.xdeepfm.modules import CIN 12 | from xynn.xdeepfm import XDeepFM 13 | from xynn.embedding import LinearEmbedding, BasicEmbedding 14 | from xynn.mlp import LeakyGate, GhostBatchNorm 15 | 16 | from ..common import simple_train_inputs, simple_model_train_loop 17 | 18 | 19 | def test_that_cin_raises_error_for_bad_layer_sizes_when_not_full_agg(): 20 | with pytest.raises( 21 | ValueError, 22 | match="when using full_agg=False, all but the last layer size must be even" 23 | ): 24 | CIN(num_fields=5, layer_sizes=(127, 127), full_agg=False) 25 | 26 | 27 | def test_cin_layers_without_activation_and_bn(): 28 | cin = CIN(num_fields=5, use_bn=False) 29 | assert len(cin.convs) == 2 30 | assert len(cin.actns) == 2 31 | assert len(cin.norms) == 2 32 | assert all(isinstance(conv, nn.Conv1d) for conv in cin.convs) 33 | assert all(isinstance(actn, nn.Identity) for actn in cin.actns) 34 | assert all(isinstance(norm, nn.Identity) for norm in cin.norms) 35 | 36 | 37 | def test_cin_layers_with_activation_and_bn(): 38 | cin = CIN(num_fields=5, activation=nn.ReLU, use_bn=True) 39 | assert len(cin.convs) == 2 40 | assert len(cin.actns) == 2 41 | assert len(cin.norms) == 2 42 | assert all(isinstance(conv, nn.Conv1d) for conv in cin.convs) 43 | assert all(isinstance(actn, nn.ReLU) for actn in cin.actns) 44 | assert all(isinstance(norm, nn.BatchNorm1d) for norm in cin.norms) 45 | 46 | 47 | def test_cin_shape_of_output(): 48 | x = torch.rand((20, 5, 8)) 49 | 50 | cin = CIN(num_fields=5, layer_sizes=(10,)) 51 | out = cin(x) 52 | assert out.shape == (20, 10) 53 | 54 | cin = CIN(num_fields=5, layer_sizes=(10, 10)) 55 | out = cin(x) 56 | assert out.shape == (20, 15) 57 | 58 | cin = CIN(num_fields=5, layer_sizes=(10, 10), full_agg=True) 59 | out = cin(x) 60 | assert out.shape == (20, 20) 61 | 62 | 63 | def test_that_xdeepfm_module_subclasses_basenn(): 64 | assert issubclass(XDeepFM, BaseNN) 65 | 66 | 67 | def test_that_xdeepfm_uses_basenn_init(): 68 | X = torch.randint(0, 10, (100, 10)) 69 | embedding_num = LinearEmbedding(embedding_size=3).fit(X) 70 | 71 | model = XDeepFM( 72 | 
task="classification", 73 | output_size=3, 74 | embedding_num=embedding_num, 75 | embedding_cat=None, 76 | embedding_l2_reg=0.2, 77 | mlp_l1_reg=0.1 78 | ) 79 | 80 | assert model.task == "classification" 81 | assert model.num_epochs == 0 82 | assert isinstance(model.loss_fn, nn.CrossEntropyLoss) 83 | assert model.embedding_num is embedding_num 84 | assert model.embedding_cat is None 85 | assert model.embedding_l1_reg == 0.0 86 | assert model.embedding_l2_reg == 0.2 87 | assert model.mlp_l1_reg == 0.1 88 | assert model.mlp_l2_reg == 0.0 89 | assert model.optimizer is None 90 | assert model.optimizer_info == {} 91 | assert model.scheduler == {} 92 | assert model._device == "cpu" 93 | 94 | 95 | def test_that_xdeepfm_parameters_are_passed_to_submodules(): 96 | X = torch.randint(0, 10, (100, 10)) 97 | embedding_cat = BasicEmbedding(embedding_size=3).fit(X) 98 | model = XDeepFM( 99 | task="classification", 100 | output_size=3, 101 | embedding_num=None, 102 | embedding_cat=embedding_cat, 103 | mlp_activation=nn.ReLU, 104 | mlp_hidden_sizes=(512, 128, 32), 105 | mlp_use_bn=False, 106 | mlp_use_skip=False, 107 | use_leaky_gate=False, 108 | ) 109 | 110 | expected_classes = [ 111 | nn.Linear, 112 | nn.ReLU, 113 | nn.Linear, 114 | nn.ReLU, 115 | nn.Linear, 116 | nn.ReLU, 117 | nn.Linear, 118 | ] 119 | for mlp in (model.mlp, model.cin_final): 120 | assert len(mlp.main_layers) == len(expected_classes) 121 | for layer, expected_class in zip(mlp.main_layers, expected_classes): 122 | assert isinstance(layer, expected_class) 123 | assert mlp.skip_layers is None 124 | 125 | assert model.use_residual 126 | assert isinstance(model.cin, CIN) and len(model.cin.convs) == 2 127 | assert model.mix.requires_grad 128 | 129 | 130 | def test_that_xdeepfm_parameters_are_passed_to_submodules_other_params(): 131 | X = torch.randint(0, 10, (100, 10)) 132 | embedding_cat = BasicEmbedding(embedding_size=3).fit(X) 133 | model = XDeepFM( 134 | task="classification", 135 | output_size=3, 136 | embedding_num=None, 137 | embedding_cat=embedding_cat, 138 | cin_layer_sizes=(20, 20, 20), 139 | cin_use_residual=False, 140 | mlp_use_skip=True, 141 | ) 142 | 143 | expected_classes = [ 144 | LeakyGate, 145 | nn.Linear, 146 | nn.BatchNorm1d, 147 | nn.LeakyReLU, 148 | nn.Linear, 149 | nn.BatchNorm1d, 150 | nn.LeakyReLU, 151 | nn.Linear, 152 | nn.BatchNorm1d, 153 | nn.LeakyReLU, 154 | nn.Linear, 155 | nn.BatchNorm1d, 156 | nn.LeakyReLU, 157 | nn.Linear, 158 | ] 159 | for mlp in (model.mlp, model.cin_final): 160 | assert len(mlp.main_layers) == len(expected_classes) 161 | for layer, expected_class in zip(mlp.main_layers, expected_classes): 162 | assert isinstance(layer, expected_class) 163 | assert mlp.skip_layers is not None 164 | 165 | assert not model.use_residual 166 | assert isinstance(model.cin, CIN) and len(model.cin.convs) == 3 167 | assert model.mix.requires_grad 168 | 169 | 170 | def test_xdeepfm_parameters_with_ghost_batch(): 171 | X = torch.randint(0, 10, (100, 10)) 172 | embedding_cat = BasicEmbedding(embedding_size=3).fit(X) 173 | model = XDeepFM( 174 | task="classification", 175 | output_size=3, 176 | embedding_num=None, 177 | embedding_cat=embedding_cat, 178 | cin_layer_sizes=(20, 20, 20), 179 | cin_use_residual=False, 180 | mlp_hidden_sizes=(128, 128), 181 | mlp_ghost_batch=12, 182 | mlp_use_skip=False, 183 | ) 184 | 185 | expected_classes = [ 186 | LeakyGate, 187 | nn.Linear, 188 | GhostBatchNorm, 189 | nn.LeakyReLU, 190 | nn.Linear, 191 | GhostBatchNorm, 192 | nn.LeakyReLU, 193 | nn.Linear, 194 | ] 195 | for mlp in 
(model.mlp, model.cin_final): 196 | assert len(mlp.main_layers) == len(expected_classes) 197 | for layer, expected_class in zip(mlp.main_layers, expected_classes): 198 | assert isinstance(layer, expected_class) 199 | assert mlp.skip_layers is None 200 | 201 | assert not model.use_residual 202 | assert isinstance(model.cin, CIN) and len(model.cin.convs) == 3 203 | assert model.mix.requires_grad 204 | 205 | 206 | def test_that_xdeepfm_diagram_exists_and_prints_something(capsys): 207 | XDeepFM.diagram() 208 | captured = capsys.readouterr() 209 | assert len(captured.out.split("\n")) > 5 210 | 211 | 212 | def test_xdeepfm_mlp_weight(): 213 | X = torch.randint(0, 10, (100, 10)) 214 | embedding_num = LinearEmbedding(embedding_size=3).fit(X) 215 | 216 | # without Linear after CIN 217 | model = XDeepFM( 218 | task="regression", 219 | output_size=1, 220 | embedding_num=embedding_num, 221 | embedding_cat=None, 222 | cin_use_mlp=False, 223 | mlp_use_bn=False, 224 | mlp_use_skip=False, 225 | use_leaky_gate=False, 226 | ) 227 | 228 | exp_w1 = 0 229 | exp_w2 = 0 230 | for mlp in (model.mlp, model.cin_final): 231 | exp_w1 += sum(l.weight.abs().sum().item() for l in mlp.main_layers[::2]) 232 | exp_w2 += sum((l.weight ** 2).sum().item() for l in mlp.main_layers[::2]) 233 | 234 | w1, w2 = model.mlp_weight_sum() 235 | assert np.isclose(w1.item(), exp_w1) 236 | assert np.isclose(w2.item(), exp_w2) 237 | 238 | # with MLP after CIN 239 | model = XDeepFM( 240 | task="regression", 241 | output_size=1, 242 | embedding_num=embedding_num, 243 | embedding_cat=None, 244 | cin_use_mlp=True, 245 | mlp_use_bn=False, 246 | mlp_use_skip=False, 247 | use_leaky_gate=False, 248 | ) 249 | 250 | exp_w1 = 0 251 | exp_w2 = 0 252 | for mlp in (model.mlp, model.cin_final): 253 | exp_w1 += sum(l.weight.abs().sum().item() for l in mlp.main_layers[::2]) 254 | exp_w2 += sum((l.weight ** 2).sum().item() for l in mlp.main_layers[::2]) 255 | 256 | w1, w2 = model.mlp_weight_sum() 257 | assert np.isclose(w1.item(), exp_w1) 258 | assert np.isclose(w2.item(), exp_w2) 259 | 260 | 261 | def test_that_xdeepfm_learns(): 262 | _set_seed(10101) 263 | 264 | X_num = torch.randint(0, 10, (100, 10)) 265 | X_cat = torch.randint(0, 5, (100, 1)) 266 | y = ( 267 | (X_cat - 2) + X_num[:, ::2].sum(dim=1) - X_num[:, 1::2].sum(dim=1) 268 | ).to(dtype=torch.float) 269 | 270 | model = XDeepFM( 271 | task="regression", 272 | output_size=1, 273 | embedding_num=LinearEmbedding(embedding_size=3).fit(X_num), 274 | embedding_cat=BasicEmbedding(embedding_size=3).fit(X_cat), 275 | mlp_hidden_sizes=[10, 8, 6], 276 | mlp_use_bn=False, 277 | mlp_use_skip=False, 278 | use_leaky_gate=False, 279 | ) 280 | 281 | loss_func = nn.MSELoss() 282 | optimizer = torch.optim.Adam(model.parameters(), lr=1e-1) 283 | loss_vals = simple_model_train_loop( 284 | model, X_num, X_cat, y, loss_func, optimizer, num_epochs=5 285 | ) 286 | 287 | assert loss_vals[0] > loss_vals[-1] 288 | 289 | 290 | def test_that_xdeepfm_learns_with_other_params(): 291 | _set_seed(10101) 292 | 293 | X_num = torch.randint(0, 10, (100, 10)) 294 | X_cat = torch.randint(0, 5, (100, 1)) 295 | y = ( 296 | (X_cat - 2) + X_num[:, ::2].sum(dim=1) - X_num[:, 1::2].sum(dim=1) 297 | ).to(dtype=torch.float) 298 | 299 | model = XDeepFM( 300 | task="regression", 301 | output_size=1, 302 | embedding_num=LinearEmbedding(embedding_size=3).fit(X_num), 303 | embedding_cat=BasicEmbedding(embedding_size=3).fit(X_cat), 304 | mlp_hidden_sizes=[], 305 | mlp_use_bn=False, 306 | mlp_use_skip=False, 307 | use_leaky_gate=False, 308 | ) 309 | 
310 | loss_func = nn.MSELoss() 311 | optimizer = torch.optim.Adam(model.parameters(), lr=1e-1) 312 | loss_vals = simple_model_train_loop( 313 | model, X_num, X_cat, y, loss_func, optimizer, num_epochs=5 314 | ) 315 | 316 | assert loss_vals[0] > loss_vals[-1] 317 | --------------------------------------------------------------------------------
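
A short usage sketch of the embedding classes dumped above, pieced together from the signatures and docstrings in xynn/embedding/uniform/categorical.py and xynn/embedding/ragged/fast_ragged.py. The field layouts, counts, and expected shapes are illustrative assumptions, and the classes are imported by their file paths (the tests above import BasicEmbedding from xynn.embedding, so top-level imports may also work); the weighting arithmetic simply restates the documented formula weight = count / (count + alpha).

import torch

# Imports follow the file locations shown above; the tests also import
# BasicEmbedding directly from xynn.embedding.
from xynn.embedding.uniform.categorical import BasicEmbedding, DefaultEmbedding
from xynn.embedding.ragged.fast_ragged import FastRaggedEmbedding

# BasicEmbedding: one vector per (field, value) pair.  The summary is the list
# of unique values per field plus whether each field can contain NaN.
basic = BasicEmbedding(embedding_size=4)
basic.from_summary(uniques=[[0, 1, 2], [0, 1]], has_nan=[False, False])
X = torch.tensor([[0, 1], [2, 0]])
print(basic(X).shape)  # (rows, fields, embedding_size) -> torch.Size([2, 2, 4])

# DefaultEmbedding: each field also gets a "default" vector, and the output
# blends the value's own vector with the default based on training counts.
# With alpha=20: a value seen 20 times gets weight 20 / (20 + 20) = 0.5, a
# value seen 180 times gets 180 / 200 = 0.9, and an unseen value gets weight 0,
# i.e. the pure default vector.
default_emb = DefaultEmbedding(embedding_size=4, alpha=20)
default_emb.from_summary(unique_counts=[{0: 180, 1: 20}], nan_counts=[0])
X = torch.tensor([[0], [1], [7]])  # 7 was never seen, so it falls back to the default
print(default_emb(X).shape)        # torch.Size([3, 1, 4])

# FastRaggedEmbedding: expects integer-encoded input (0, 1, ...) and gives each
# field its own embedding width.  With embedding_size="sqrt" and max_size=8,
# fields with 12, 3, and 500 classes should get widths of about ceil(sqrt(12)) = 4,
# ceil(sqrt(3)) = 2, and 8 (capped at max_size), per the docstring.
ragged = FastRaggedEmbedding(embedding_size="sqrt", max_size=8)
ragged.from_summary(num_classes=[12, 3, 500])
X = torch.tensor([[0, 2, 499], [11, 0, 3]])
print(ragged(X).shape)  # (rows, ragged.output_size): one embedded slice per field, concatenated

The other embeddings in the dump appear to follow the same pattern: construct the class, initialize it once with fit, from_summary, or from_encoder, then call it like any other torch module, for example when passing it as embedding_num or embedding_cat to the estimators.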