├── .gitignore ├── LICENSE ├── README.md ├── onfire ├── __init__.py ├── colab │ ├── __init__.py │ └── runners.py ├── data.py ├── embedders.py ├── fields.py ├── transformers.py └── utils.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 joshfp 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyTorch OnFire 2 | > PyTorch meets Sklearn Pipelines. 
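
To show how the pieces defined below fit together, here is a minimal end-to-end sketch. The dataset, field keys, and hyperparameters are invented for illustration; only the onfire classes and their signatures come from the modules that follow, and the transforms are handed to the loader as plain callables (`features.transform`, `target.transform`), which matches the `tfm(batch)` call in `OnFireDataLoader`.

```python
import torch.nn as nn
import torch.nn.functional as F

from onfire.fields import (CategoricalFeature, ContinuousFeature, TextFeature,
                           FeatureGroup, SingleLabelTarget)
from onfire.data import OnFireDataLoader
from onfire.colab import SupervisedRunner

# Hypothetical data: each record is a plain dict.
train_data = [
    {'title': 'red running shoes', 'brand': 'acme', 'price': 59.9, 'label': 'footwear'},
    {'title': 'blue cotton t-shirt', 'brand': 'zenith', 'price': 19.9, 'label': 'apparel'},
    # ... more records ...
]
valid_data = train_data  # stand-in; use a held-out split in practice

# Fields are sklearn-style transformers: fit on raw records, transform to tensors.
features = FeatureGroup({
    'title': TextFeature(key='title', max_len=20),
    'brand': CategoricalFeature(key='brand'),
    'price': ContinuousFeature(key='price'),
})
target = SingleLabelTarget(key='label')
features.fit(train_data)
target.fit(train_data)

# The loader keeps raw records and applies each transform to the whole batch at collate time.
tfms = [features.transform, target.transform]
train_dl = OnFireDataLoader(train_data, tfms, batch_size=2, shuffle=True)
valid_dl = OnFireDataLoader(valid_data, tfms, batch_size=2)

# Concatenated field embeddings followed by a linear classification head.
model = nn.Sequential(features.build_embedder(),
                      nn.Linear(features.output_dim, target.output_dim))

runner = SupervisedRunner(model, F.cross_entropy)
runner.fit(train_dl, valid_dl, epochs=3, lr=1e-3)
preds = runner.predict(valid_dl)
```

Anything with a `transform(list_of_records) -> tensor` signature can go in the `tfms` list, since the loader's collate function simply calls each entry on the list of raw records in the batch.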
--------------------------------------------------------------------------------
/onfire/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.1.4"
2 | 
--------------------------------------------------------------------------------
/onfire/colab/__init__.py:
--------------------------------------------------------------------------------
1 | from .runners import SupervisedRunner
2 | 
--------------------------------------------------------------------------------
/onfire/colab/runners.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.optim import Adam
3 | from torch.optim.lr_scheduler import OneCycleLR
4 | from fastprogress.fastprogress import master_bar, progress_bar
5 | from collections import defaultdict
6 | import inspect
7 | import matplotlib.pyplot as plt
8 | 
9 | from onfire.utils import batch_to_device
10 | 
11 | __all__ = [
12 |     'SupervisedRunner',
13 | ]
14 | 
15 | 
16 | class SupervisedRunner:
17 |     def __init__(self, model, loss_fn):
18 |         self.model = model
19 |         self.loss_fn = loss_fn
20 | 
21 |     def fit(self, train_dl, valid_dl, epochs, lr, metrics=None, optimizer=None, scheduler=None):
22 |         device = 'cuda' if torch.cuda.is_available() else 'cpu'
23 |         self.model.to(device)
24 |         optimizer = optimizer or Adam(self.model.parameters(), lr)
25 |         if scheduler is not False:  # pass scheduler=False to disable LR scheduling
26 |             scheduler = scheduler or OneCycleLR(optimizer, lr, epochs * len(train_dl))
27 |         else:
28 |             scheduler = None
29 |         self.train_stats = TrainTracker(metrics, validate=(valid_dl is not None))
30 |         bar = master_bar(range(epochs))
31 |         bar.write(self.train_stats.metrics_names, table=True)
32 | 
33 |         for epoch in bar:
34 |             self.model.train()
35 |             for batch in progress_bar(train_dl, parent=bar):
36 |                 batch = batch_to_device(batch, device)
37 |                 loss = self._train_batch(batch, optimizer, scheduler)
38 |                 loss.backward()
39 |                 optimizer.step()
40 |                 optimizer.zero_grad()
41 |                 if scheduler:
42 |                     scheduler.step()
43 |                 self.train_stats.update_train_loss(loss)
44 | 
45 |             valid_outputs = []
46 |             if valid_dl:
47 |                 self.model.eval()
48 |                 for batch in progress_bar(valid_dl, parent=bar):
49 |                     batch = batch_to_device(batch, device)
50 |                     output = self._valid_batch(batch)
51 |                     valid_outputs.append(output)
52 | 
53 |             self.train_stats.log_epoch_results(valid_outputs)
54 |             bar.write(self.train_stats.get_metrics_values(), table=True)
55 | 
56 |     def predict(self, dl, include_target=False):
57 |         device = 'cuda' if torch.cuda.is_available() else 'cpu'
58 |         self.model.to(device)
59 |         self.model.eval()
60 |         preds, ys = [], []
61 |         for batch in progress_bar(dl):
62 |             batch = batch_to_device(batch, device)
63 |             pred, y = self._predict_batch(batch, include_target)
64 |             preds.append(pred)
65 |             ys.append(y)
66 |         preds = torch.cat(preds)
67 |         return (preds, torch.cat(ys)) if include_target else preds
68 | 
69 |     def _train_batch(self, batch, optimizer, scheduler):
70 |         *xb, yb = batch
71 |         output = self.model(*xb)
72 |         return self.loss_fn(output, yb)
73 | 
74 |     def _valid_batch(self, batch):
75 |         *xb, yb = batch
76 |         with torch.no_grad():
77 |             output = self.model(*xb)
78 |             loss = self.loss_fn(output, yb)
79 |         return {'loss': loss.item(), 'y_true': yb.cpu(), 'y_pred': output.cpu()}
80 | 
81 |     def _predict_batch(self, batch, include_target):
82 |         xb = batch[:-1] if len(batch) > 1 else [batch[0]]  # the last element is the target, if present
83 |         yb = batch[-1].cpu() if include_target and len(batch) > 1 else None
84 |         with torch.no_grad():
85 |             output = self.model(*xb)
86 |         return output.cpu(),
yb 87 | 88 | 89 | class TrainTracker: 90 | def __init__(self, metrics, validate): 91 | if validate: 92 | self.valid_loss = [] 93 | metrics = metrics if isinstance(metrics, (list, tuple)) else [metrics] 94 | self.metrics = [Metric(metric_fn) for metric_fn in metrics if metric_fn] 95 | self.train_smooth_loss = ExponentialMovingAverage() 96 | self.train_loss = [] 97 | self.epoch = 0 98 | self.validate = validate 99 | 100 | @property 101 | def metrics_names(self): 102 | default_metrics = ['epoch', 'train_loss'] 103 | metrics = [] 104 | if self.validate: 105 | metrics.append('valid_loss') 106 | metrics.extend([metric.name for metric in self.metrics]) 107 | return default_metrics + metrics 108 | 109 | def update_train_loss(self, loss): 110 | self.train_smooth_loss.update(loss.item()) 111 | 112 | def log_epoch_results(self, valid_output): 113 | self.epoch = self.epoch + 1 114 | self.train_loss.append(self.train_smooth_loss.value) 115 | 116 | if self.validate: 117 | valid_output = self._process_valid_output(valid_output) 118 | valid_loss = valid_output['loss'].mean().item() 119 | for metric in self.metrics: 120 | metric.update(**valid_output) 121 | self.valid_loss.append(valid_loss) 122 | 123 | def get_metrics_values(self, decimals=5): 124 | default_metrics = [self.epoch, self.train_loss[-1]] 125 | metrics = [] 126 | if self.validate: 127 | metrics.append(self.valid_loss[-1]) 128 | metrics.extend([metric.value for metric in self.metrics]) 129 | res = default_metrics + metrics 130 | return [str(x) if isinstance(x, int) else str(round(x, decimals)) for x in res] 131 | 132 | def _process_valid_output(self, valid_output): 133 | res = defaultdict(list) 134 | for d in valid_output: 135 | for k, v in d.items(): 136 | v = v if isinstance(v, torch.Tensor) else torch.tensor(v) 137 | v = v if len(v.shape) else v.view(1) 138 | res[k].append(v) 139 | return {k: torch.cat(v) for k, v in res.items()} 140 | 141 | def plot_loss(self): 142 | fig, ax = plt.subplots() 143 | ax.plot(self.train_loss, label='train') 144 | ax.plot(self.valid_loss, label='valid') 145 | ax.legend() 146 | 147 | 148 | class ExponentialMovingAverage(): 149 | def __init__(self, beta=0.1): 150 | self.beta = beta 151 | self.initialized = False 152 | 153 | def update(self, value): 154 | if self.initialized: 155 | self.mean = value * self.beta + self.mean * (1 - self.beta) 156 | else: 157 | self.mean = value 158 | self.initialized = True 159 | 160 | @property 161 | def value(self): 162 | return self.mean 163 | 164 | 165 | class Metric: 166 | def __init__(self, metric_fn): 167 | self.metric_fn = metric_fn 168 | self.name = metric_fn.__name__ if inspect.isfunction(metric_fn) else str(metric_fn) 169 | self.value = None 170 | 171 | def update(self, **kwargs): 172 | y_true, y_pred = kwargs['y_true'], kwargs['y_pred'] 173 | self.value = self.metric_fn(y_true, y_pred) 174 | -------------------------------------------------------------------------------- /onfire/data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from torch.utils.data import Dataset, DataLoader 4 | import lmdb 5 | import tempfile 6 | import msgpack 7 | import struct 8 | 9 | __all__ = [ 10 | 'OnFireDataLoader', 11 | 'OnFireDataset', 12 | ] 13 | 14 | 15 | class OnFireDataLoader(DataLoader): 16 | def __init__(self, data, tfms, batch_size, shuffle=False, num_workers=0, 17 | sampler=None, pin_memory=None, drop_last=False, **kwargs): 18 | num_workers = num_workers if num_workers else os.cpu_count() 19 | pin_memory = 
pin_memory if pin_memory != None else torch.cuda.is_available() 20 | self.ds = OnFireDataset(data, max_readers=num_workers) 21 | self.tfms = tfms if isinstance(tfms, (list, tuple)) else [tfms] 22 | super().__init__(self.ds, batch_size=batch_size, shuffle=shuffle, 23 | num_workers=num_workers, collate_fn=self.__collate, sampler=sampler, 24 | pin_memory=pin_memory, drop_last=drop_last, **kwargs) 25 | 26 | def __collate(self, batch): 27 | return tuple([tfm(batch) for tfm in self.tfms]) 28 | 29 | 30 | class OnFireDataset(Dataset): 31 | def __init__(self, data, max_readers): 32 | self.use_lmdb = max_readers > 1 33 | if self.use_lmdb: 34 | tmpdir = tempfile.TemporaryDirectory().name 35 | self.db = lmdb.open(tmpdir, map_size=1024**4, lock=False, max_readers=max_readers) 36 | self.key_struct = struct.Struct("!q") 37 | it = [(self.key_struct.pack(i), msgpack.packb(x)) for i, x in enumerate(data)] 38 | with self.db.begin(write=True) as txn: 39 | with txn.cursor() as cursor: 40 | cursor.putmulti(it) 41 | else: 42 | self.data = data 43 | self._len = len(data) 44 | 45 | def __getitem__(self, idx): 46 | if self.use_lmdb: 47 | key = self.key_struct.pack(idx) 48 | with self.db.begin() as txn: 49 | return msgpack.unpackb(txn.get(key)) 50 | else: 51 | return self.data[idx] 52 | 53 | def __len__(self): 54 | return self._len 55 | -------------------------------------------------------------------------------- /onfire/embedders.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | __all__ = [ 5 | 'ConcatEmbeddings', 6 | 'PassThrough', 7 | 'MeanOfEmbeddings', 8 | ] 9 | 10 | 11 | class ConcatEmbeddings(nn.Module): 12 | def __init__(self, fields): 13 | super().__init__() 14 | self.output_dim = sum([field.output_dim for field in fields.values()]) 15 | self.embedders = nn.ModuleList([field.build_embedder() for field in fields.values()]) 16 | 17 | def forward(self, x): 18 | res = [embedder(values) for embedder, values in zip(self.embedders, x)] 19 | return torch.cat(res, dim=1) 20 | 21 | 22 | class PassThrough(nn.Module): 23 | def forward(self, x): 24 | return x 25 | 26 | 27 | class MeanOfEmbeddings(nn.Module): 28 | def __init__(self, vocab_size, emb_dim): 29 | super().__init__() 30 | self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0) 31 | 32 | def forward(self, x): 33 | mask = (x != 0).float()[:, :, None] 34 | emb = self.emb(x) * mask.float() 35 | s = mask.squeeze(2).sum(1).clamp_min(1.)[:, None].float() 36 | return emb.sum(dim=1) / s 37 | -------------------------------------------------------------------------------- /onfire/fields.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from collections import OrderedDict 3 | from sklearn.base import TransformerMixin, BaseEstimator 4 | from sklearn.pipeline import make_pipeline 5 | from sklearn.preprocessing import StandardScaler 6 | from sklearn.impute import SimpleImputer 7 | from abc import ABC, abstractmethod 8 | 9 | from .transformers import ( 10 | Projector, LabelEncoder, BasicTokenizer, TokensEncoder, 11 | ToTensor, MultiLabelEncoder, To2DFloatArray, Log) 12 | from .embedders import ConcatEmbeddings, PassThrough, MeanOfEmbeddings 13 | 14 | __all__ = [ 15 | 'CategoricalFeature', 16 | 'TextFeature', 17 | 'ContinuousFeature', 18 | 'FeatureGroup', 19 | 'SingleLabelTarget', 20 | 'MultiLabelTarget', 21 | 'ContinuousTarget', 22 | ] 23 | 24 | 25 | class BaseField(ABC, TransformerMixin, BaseEstimator): 26 | def __init__(self, 
key, preprocessor, custom_tfms=None, dtype=None): 27 | tfms = [] 28 | if key: tfms.append(Projector(key)) 29 | if preprocessor: tfms.append(preprocessor) 30 | if custom_tfms: tfms.extend(custom_tfms) 31 | tfms.append(ToTensor(dtype=dtype)) 32 | self.pipe = make_pipeline(*tfms) 33 | 34 | def transform(self, X): 35 | return self.pipe.transform(X) 36 | 37 | def inverse_transform(self, X): 38 | return self.pipe.inverse_transform(X) 39 | 40 | @property 41 | @abstractmethod 42 | def output_dim(self): 43 | pass 44 | 45 | 46 | class BaseFeature(BaseField): 47 | @abstractmethod 48 | def build_embedder(self): 49 | pass 50 | 51 | @property 52 | def embedder(self): 53 | pass 54 | 55 | 56 | class CategoricalFeature(BaseFeature): 57 | def __init__(self, key=None, preprocessor=None, emb_dim=None): 58 | self.key = key 59 | self.preprocessor = preprocessor 60 | self.emb_dim = emb_dim 61 | self.categorical_encoder = LabelEncoder() 62 | 63 | tfms = [self.categorical_encoder] 64 | super().__init__(self.key, self.preprocessor, tfms, dtype=torch.int64) 65 | 66 | def fit(self, X, y=None): 67 | self.pipe.fit(X) 68 | self.vocab = self.categorical_encoder.vocab 69 | self.emb_dim = self.emb_dim or min(len(self.vocab) // 2, 50) 70 | return self 71 | 72 | def build_embedder(self): 73 | self._embedder = torch.nn.Embedding(len(self.vocab), self.emb_dim) 74 | return self.embedder 75 | 76 | @property 77 | def output_dim(self): 78 | return self.emb_dim 79 | 80 | @property 81 | def embedder(self): 82 | return self._embedder 83 | 84 | 85 | class TextFeature(BaseFeature): 86 | def __init__(self, key=None, preprocessor=None, max_len=50, max_vocab=50000, 87 | min_freq=3, emb_dim=100, tokenizer=None, embedder_cls=None, 88 | embedder_vocab_size_param='vocab_size', embedder_args=None): 89 | self.max_len = max_len 90 | self.max_vocab = max_vocab 91 | self.key = key 92 | self.preprocessor = preprocessor 93 | self.tokenizer = tokenizer or BasicTokenizer() 94 | self.min_freq = min_freq 95 | self.emb_dim = emb_dim 96 | self.token_encoder = TokensEncoder(self.max_len, self.max_vocab, self.min_freq) 97 | self.embedder_cls = embedder_cls or MeanOfEmbeddings 98 | self.embedder_vocab_size_param = embedder_vocab_size_param 99 | self.embedder_args = embedder_args or {} 100 | if self.embedder_cls == MeanOfEmbeddings: 101 | self.embedder_args['emb_dim'] = self.emb_dim 102 | 103 | tfms = [self.tokenizer, self.token_encoder] 104 | super().__init__(self.key, self.preprocessor, tfms, dtype=torch.int64) 105 | 106 | def fit(self, X, y=None): 107 | self.pipe.fit(X) 108 | self.vocab = self.token_encoder.vocab 109 | self.embedder_args[self.embedder_vocab_size_param] = len(self.vocab) 110 | return self 111 | 112 | def build_embedder(self): 113 | self._embedder = self.embedder_cls(**self.embedder_args) 114 | sample_input = torch.randint(len(self.vocab), (2, self.max_len)) 115 | self.emb_dim = self.embedder(sample_input).shape[1] 116 | return self.embedder 117 | 118 | @property 119 | def output_dim(self): 120 | return self.emb_dim 121 | 122 | @property 123 | def embedder(self): 124 | return self._embedder 125 | 126 | 127 | class ContinuousFeature(BaseFeature): 128 | def __init__(self, key=None, preprocessor=None, imputer=None, scaler=None, log=False, 129 | log_auto_scale=True): 130 | self.key = key 131 | self.preprocessor = preprocessor 132 | self.imputer = (imputer or SimpleImputer()) if imputer != False else None 133 | self.scaler = (scaler or StandardScaler()) if scaler != False else None 134 | self.log = log 135 | self.log_auto_scale = log_auto_scale 
136 | 137 | tfms = [] 138 | tfms.append(To2DFloatArray()) 139 | if self.imputer: tfms.append(self.imputer) 140 | if self.log: tfms.append(Log(auto_scale=self.log_auto_scale)) 141 | if self.scaler: tfms.append(self.scaler) 142 | super().__init__(self.key, self.preprocessor, tfms, dtype=torch.float32) 143 | 144 | def fit(self, X, y=None): 145 | self.pipe.fit(X) 146 | self.emb_dim = self.transform([X[0]]).shape[1] 147 | return self 148 | 149 | def build_embedder(self): 150 | self._embedder = PassThrough() 151 | return self.embedder 152 | 153 | @property 154 | def output_dim(self): 155 | return self.emb_dim 156 | 157 | @property 158 | def embedder(self): 159 | return self._embedder 160 | 161 | 162 | class FeatureGroup(BaseFeature): 163 | def __init__(self, fields): 164 | self.fields = OrderedDict(fields) 165 | 166 | def fit(self, X, y=None): 167 | for field in self.fields.values(): 168 | field.fit(X) 169 | return self 170 | 171 | def transform(self, X, y=None): 172 | return [field.transform(X) for field in self.fields.values()] 173 | 174 | def inverse_transform(self, X): 175 | tmp = [field.inverse_transform(X[i]) for i, field in enumerate(self.fields.values())] 176 | res = [] 177 | for i in range(len(X[0])): 178 | d = {} 179 | for field in tmp: 180 | d.update(field[i]) 181 | res.append(d) 182 | return res 183 | 184 | def build_embedder(self): 185 | self._embedder = ConcatEmbeddings(self.fields) 186 | return self.embedder 187 | 188 | @property 189 | def output_dim(self): 190 | return self.embedder.output_dim 191 | 192 | @property 193 | def embedder(self): 194 | return self._embedder 195 | 196 | 197 | class SingleLabelTarget(BaseField): 198 | def __init__(self, key=None, preprocessor=None, dtype=torch.int64): 199 | self.key = key 200 | self.preprocessor = preprocessor 201 | self.categorical_encoder = LabelEncoder(is_target=True) 202 | self.dtype = dtype 203 | 204 | tfms = [self.categorical_encoder] 205 | super().__init__(self.key, self.preprocessor, tfms, dtype=self.dtype) 206 | 207 | def fit(self, X, y=None): 208 | self.pipe.fit(X) 209 | self.classes = self.categorical_encoder.vocab 210 | return self 211 | 212 | @property 213 | def output_dim(self): 214 | return len(self.classes) 215 | 216 | 217 | class MultiLabelTarget(BaseField): 218 | def __init__(self, key=None, preprocessor=None, dtype=torch.float32): 219 | self.key = key 220 | self.preprocessor = preprocessor 221 | self.multi_label_encoder = MultiLabelEncoder() 222 | self.dtype = dtype 223 | 224 | tfms = [self.multi_label_encoder] 225 | super().__init__(self.key, self.preprocessor, tfms, dtype=self.dtype) 226 | 227 | def fit(self, X, y=None): 228 | self.pipe.fit(X) 229 | self.classes = self.multi_label_encoder.vocab 230 | return self 231 | 232 | @property 233 | def output_dim(self): 234 | return len(self.classes) 235 | 236 | 237 | class ContinuousTarget(BaseField): 238 | def __init__(self, key=None, preprocessor=None, log=False, log_auto_scale=False): 239 | self.key = key 240 | self.preprocessor = preprocessor 241 | self.log = log 242 | self.log_auto_scale = log_auto_scale 243 | 244 | tfms = [] 245 | tfms.append(To2DFloatArray()) 246 | if self.log: tfms.append(Log(auto_scale=self.log_auto_scale)) 247 | 248 | super().__init__(self.key, self.preprocessor, tfms, dtype=torch.float32) 249 | 250 | def fit(self, X, y=None): 251 | self.pipe.fit(X) 252 | self.out_dim = self.transform([X[0]]).shape[1] 253 | return self 254 | 255 | @property 256 | def output_dim(self): 257 | return self.out_dim 258 | 
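
As a concrete illustration of the field API defined above, here is a small round-trip sketch. The records and keys are invented; the behavior described in the comments follows directly from the classes in this file.

```python
from onfire.fields import CategoricalFeature, ContinuousFeature, FeatureGroup

records = [
    {'color': 'red', 'size': 10.0},
    {'color': 'blue', 'size': 12.5},
    {'color': 'green', 'size': 11.0},
]

group = FeatureGroup({
    'color': CategoricalFeature(key='color'),
    'size': ContinuousFeature(key='size'),
})
group.fit(records)

x = group.transform(records)
# x is a list with one tensor per field:
#   x[0] -> int64 category codes for 'color' (index 0 is reserved for unseen categories)
#   x[1] -> float32 column for 'size', imputed and standardized by default

group.fields['color'].inverse_transform(x[0])
# -> [{'color': 'red'}, {'color': 'blue'}, {'color': 'green'}]

embedder = group.build_embedder()   # ConcatEmbeddings over both fields
embedder(x).shape                    # (3, group.output_dim)
```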
--------------------------------------------------------------------------------
/onfire/transformers.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from collections import Counter
4 | from sklearn.base import TransformerMixin, BaseEstimator
5 | from unidecode import unidecode
6 | 
7 | __all__ = [
8 |     'Projector',
9 |     'LabelEncoder',
10 |     'BasicTokenizer',
11 |     'TokensEncoder',
12 |     'ToTensor',
13 |     'MultiLabelEncoder',
14 |     'To2DFloatArray',
15 |     'Log',
16 | ]
17 | 
18 | 
19 | class Projector(TransformerMixin, BaseEstimator):
20 |     def __init__(self, keys):
21 |         self.keys = keys if isinstance(keys, list) else [keys]
22 | 
23 |     def fit(self, X, y=None):
24 |         return self
25 | 
26 |     def _get(self, x):
27 |         for key in self.keys:
28 |             x = x.get(key)
29 |         return x
30 | 
31 |     def transform(self, X):
32 |         return [self._get(x) for x in X]
33 | 
34 |     def _inverse(self, x):
35 |         t = x
36 |         for key in reversed(self.keys):
37 |             t = {key: t}
38 |         return t
39 | 
40 |     def inverse_transform(self, X):
41 |         return [self._inverse(x) for x in X]
42 | 
43 | 
44 | class LabelEncoder(TransformerMixin, BaseEstimator):
45 |     class UnknownLabel:
46 |         def __repr__(self):
47 |             return "<unk>"
48 | 
49 |     def __init__(self, is_target=False):
50 |         self.is_target = is_target
51 | 
52 |     def fit(self, X, y=None):
53 |         self.vocab = sorted([x for x in set(X) if x is not None])
54 |         if not self.is_target:
55 |             self.vocab.insert(0, self.UnknownLabel())
56 |         self.category2code = {x: i for i, x in enumerate(self.vocab)}
57 |         return self
58 | 
59 |     def _get_category_code(self, x):
60 |         return self.category2code.get(x) if self.is_target else self.category2code.get(x, 0)
61 | 
62 |     def transform(self, X):
63 |         return np.array([self._get_category_code(x) for x in X], dtype=np.int64)
64 | 
65 |     def inverse_transform(self, X):
66 |         return [self.vocab[x] for x in X]
67 | 
68 | 
69 | class BasicTokenizer(TransformerMixin, BaseEstimator):
70 |     def __init__(self, lower=True, map_to_ascii=True):
71 |         self.lower = lower
72 |         self.map_to_ascii = map_to_ascii
73 | 
74 |     def _preprocess(self, text):
75 |         if text is None:
76 |             text = ""
77 |         elif not isinstance(text, str):
78 |             text = str(text)
79 | 
80 |         if self.map_to_ascii:
81 |             text = unidecode(text)
82 |         if self.lower:
83 |             text = text.lower()
84 |         return text
85 | 
86 |     def _tokenize(self, text):
87 |         res = []
88 |         for token in text.split():
89 |             while token and not token[-1].isalnum():
90 |                 token = token[:-1]
91 |             while token and not token[0].isalnum():
92 |                 token = token[1:]
93 |             if token:
94 |                 res.append(token)
95 |         return res
96 | 
97 |     def fit(self, X, y=None):
98 |         return self
99 | 
100 |     def transform(self, X):
101 |         X = [self._preprocess(x) for x in X]
102 |         transformed = {hash(x): self._tokenize(x) for x in set(X)}
103 |         return [transformed[hash(x)] for x in X]
104 | 
105 |     def inverse_transform(self, X):
106 |         return [' '.join(x) for x in X]
107 | 
108 | 
109 | class TokensEncoder(TransformerMixin, BaseEstimator):
110 |     class PaddingToken:
111 |         def __repr__(self):
112 |             return "<pad>"
113 | 
114 |     class UnknownToken:
115 |         def __repr__(self):
116 |             return "<unk>"
117 | 
118 |     def __init__(self, max_len, max_vocab, min_freq):
119 |         self.max_len = max_len
120 |         self.max_vocab = max_vocab
121 |         self.min_freq = min_freq
122 | 
123 |     def fit(self, X, y=None):
124 |         token_freq = Counter()
125 |         for sentence in X:
126 |             token_freq.update(sentence)
127 | 
128 |         vocab = [token for token, count in token_freq.most_common(self.max_vocab)
129 |                  if count >= self.min_freq]
130 |         vocab.insert(0, self.PaddingToken())
131 |         vocab.insert(1, self.UnknownToken())
132 |         self.token2code = {token: i for i, token in enumerate(vocab)}
133 |         self.vocab = vocab
134 |         return self
135 | 
136 |     def transform(self, X):
137 |         res = np.zeros((len(X), self.max_len), dtype=np.int64)
138 |         for i, sentence in enumerate(X):
139 |             codes = [self.token2code.get(token, 1) for token in sentence[:self.max_len]]
140 |             sentence_len = min(len(sentence), self.max_len)
141 |             res[i, :sentence_len] = np.array(codes)
142 |         return res
143 | 
144 |     def inverse_transform(self, X):
145 |         return [[str(self.vocab[token_code]) for token_code in x if token_code != 0] for x in X]
146 | 
147 | 
148 | class ToTensor(TransformerMixin, BaseEstimator):
149 |     def __init__(self, dtype=None):
150 |         self.dtype = dtype
151 | 
152 |     def fit(self, X, y=None):
153 |         return self
154 | 
155 |     def transform(self, X):
156 |         return torch.tensor(X, dtype=self.dtype)
157 | 
158 |     def inverse_transform(self, X):
159 |         return X.numpy()
160 | 
161 | 
162 | class MultiLabelEncoder(TransformerMixin, BaseEstimator):
163 |     def fit(self, X, y=None):
164 |         self.vocab = sorted(set([label for row in X for label in row]))
165 |         return self
166 | 
167 |     def transform(self, X):
168 |         return [[(_class in row) for _class in self.vocab] for row in X]
169 | 
170 | 
171 | class To2DFloatArray(TransformerMixin, BaseEstimator):
172 |     def fit(self, X, y=None):
173 |         return self
174 | 
175 |     def transform(self, X):
176 |         X = np.array(X, dtype=object)
177 |         X[X == ''] = np.nan
178 |         return X.astype(np.float32).reshape(len(X), -1)
179 | 
180 |     def inverse_transform(self, X):
181 |         return np.squeeze(X, axis=-1)
182 | 
183 | 
184 | class Log(TransformerMixin, BaseEstimator):
185 |     def __init__(self, auto_scale):
186 |         self.auto_scale = auto_scale
187 | 
188 |     def fit(self, X, y=None):
189 |         min_ = min(X)
190 |         self.offset = 1 - min_ if (self.auto_scale and min_ < 1) else 0
191 |         return self
192 | 
193 |     def transform(self, X):
194 |         return np.log(X + self.offset)
195 | 
196 |     def inverse_transform(self, X):
197 |         return np.exp(X) - self.offset
198 | 
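
A quick illustration of how the text transformers above behave. The sentences are made up and `min_freq` is lowered so every token survives; everything else follows the classes in this file.

```python
from onfire.transformers import BasicTokenizer, TokensEncoder

sentences = ["Café con leche!", "Leche fría", "Café solo"]

tokenizer = BasicTokenizer()    # lowercases and maps accented characters to ASCII
tokens = tokenizer.transform(sentences)
# -> [['cafe', 'con', 'leche'], ['leche', 'fria'], ['cafe', 'solo']]

encoder = TokensEncoder(max_len=4, max_vocab=100, min_freq=1)
codes = encoder.fit_transform(tokens)
# codes is a (3, 4) integer array; 0 is the padding code, 1 marks out-of-vocabulary tokens

encoder.inverse_transform(codes)
# -> [['cafe', 'con', 'leche'], ['leche', 'fria'], ['cafe', 'solo']]
```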
--------------------------------------------------------------------------------
/onfire/utils.py:
--------------------------------------------------------------------------------
1 | from functools import wraps
2 | import torch
3 | 
4 | __all__ = [
5 |     'mappify',
6 |     'batch_to_device',
7 | ]
8 | 
9 | 
10 | def mappify(func):
11 |     @wraps(func)
12 |     def inner(X, **kwargs):
13 |         return [func(x, **kwargs) for x in X]
14 |     return inner
15 | 
16 | 
17 | def batch_to_device(batch, device):
18 |     if isinstance(batch, torch.Tensor):
19 |         return batch.to(device)
20 |     elif isinstance(batch, (list, tuple)):
21 |         res = [batch_to_device(x, device) for x in batch]
22 |         return res if isinstance(batch, list) else tuple(res)
23 |     elif isinstance(batch, dict):
24 |         return {k: batch_to_device(v, device) for k, v in batch.items()}
25 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | from setuptools.extern import packaging
3 | import ast
4 | 
5 | with open('onfire/__init__.py') as f:
6 |     for line in f:
7 |         if line.startswith('__version__'):
8 |             version = ast.parse(line).body[0].value.s
9 |             version = str(packaging.version.Version(version))
10 |             break
11 | 
12 | with open('README.md') as f:
13 |     long_description = f.read()
14 | 
15 | setup(
16 |     name='pytorch-onfire',
17 | 
version=version, 18 | author='Jose Fernandez Portal, Rafael Carrascosa', 19 | author_email='jose.fp@gmail.com', 20 | description='PyTorch meets Sklearn Pipelines.', 21 | long_description=long_description, 22 | long_description_content_type='text/markdown', 23 | url='https://github.com/joshfp/pytorch-onfire', 24 | license='MIT', 25 | packages=[ 26 | 'onfire', 27 | 'onfire.colab', 28 | ], 29 | install_requires=[ 30 | 'torch', 31 | 'scikit-learn', 32 | 'Unidecode', 33 | 'lmdb', 34 | 'msgpack', 35 | 'fastprogress', 36 | 'matplotlib', 37 | ], 38 | python_requires='>=3.6', 39 | classifiers=[ 40 | 'Intended Audience :: Developers', 41 | 'Topic :: Software Development', 42 | 'Development Status :: 4 - Beta', 43 | 'Programming Language :: Python', 44 | 'Programming Language :: Python :: 3', 45 | 'Programming Language :: Python :: 3.6', 46 | 'Programming Language :: Python :: 3.7', 47 | 'Programming Language :: Python :: 3.8', 48 | ], 49 | ) 50 | --------------------------------------------------------------------------------
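
One last hedged sketch, referring back to `mappify` in onfire/utils.py: it lifts a per-record function to a per-batch callable. One plausible way to plug such a function into a field pipeline is through sklearn's `FunctionTransformer` as the `preprocessor` argument; the `strip_markup` function and the `'title'` key below are hypothetical, and this wiring is an assumption about intended use rather than an API documented in the repo.

```python
from sklearn.preprocessing import FunctionTransformer

from onfire.utils import mappify
from onfire.fields import TextFeature

@mappify
def strip_markup(text):
    # hypothetical per-record cleanup; mappify maps it over the whole batch list
    return text.replace('<br>', ' ') if isinstance(text, str) else text

records = [{'title': 'hello<br>world'}, {'title': 'plain text'}]

title = TextFeature(key='title', min_freq=1,
                    preprocessor=FunctionTransformer(strip_markup))
title.fit(records)
x = title.transform(records)
# x is an int64 tensor of shape (len(records), 50); '<br>' never reaches the vocabulary
```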