├── toolkit ├── __init__.py ├── preprocessing │ ├── __init__.py │ ├── text.py │ └── misc.py ├── keras_transformers │ ├── __init__.py │ ├── loaders.py │ ├── callbacks.py │ ├── contrib.py │ ├── embeddings.py │ ├── models.py │ └── architectures.py ├── catboost_transformers │ ├── __init__.py │ ├── requirements.txt │ └── models.py ├── lightgbm_transformers │ ├── __init__.py │ ├── requirements.txt │ └── models.py ├── pytorch_transformers │ ├── __init__.py │ ├── loaders │ │ ├── __init__.py │ │ ├── classification.py │ │ └── segmentation.py │ ├── architectures │ │ ├── __init__.py │ │ ├── utils.py │ │ └── unet.py │ ├── utils.py │ ├── validation.py │ ├── models.py │ └── callbacks.py ├── sklearn_transformers │ ├── __init__.py │ └── models.py ├── xgboost_transformers │ ├── __init__.py │ ├── requirements.txt │ └── models.py ├── toolkit_base.py ├── resources │ └── apostrophes.json ├── utils.py └── postprocessing.py ├── setup.cfg ├── requirements.txt ├── .github └── ISSUE_TEMPLATE │ ├── everything-else.md │ └── bug.md ├── README.md ├── PULL_REQUEST_TEMPLATE.md ├── CONTRIBUTING.md ├── LICENSE ├── setup.py ├── .gitignore └── CODE_OF_CONDUCT.md /toolkit/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /toolkit/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /toolkit/keras_transformers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /toolkit/catboost_transformers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /toolkit/lightgbm_transformers/__init__.py: 
class SteppyToolkitError(Exception):
    """Exception type used by steppy-toolkit to report its own errors."""
-------------------------------------------------------------------------------- 1 | neptune-cli>=2.8.0 2 | setuptools>=39.2.0 3 | steppy>=0.1.9 4 | -------------------------------------------------------------------------------- /toolkit/xgboost_transformers/requirements.txt: -------------------------------------------------------------------------------- 1 | attrdict 2 | steppy 3 | xgboost 4 | -------------------------------------------------------------------------------- /toolkit/lightgbm_transformers/requirements.txt: -------------------------------------------------------------------------------- 1 | attrdict 2 | lightgbm 3 | numpy 4 | pandas 5 | sklearn 6 | steppy 7 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/everything-else.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: everything else 3 | about: Suggest an idea for this project 4 | 5 | --- 6 | 7 | 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Steppy-toolkit 2 | 3 | Set of tools to make your work with Steppy faster and more effective. [Steppy](https://github.com/minerva-ml/steps) is a lightweight, open-source, Python library for fast and reproducible experimentation. 4 | -------------------------------------------------------------------------------- /PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Pull Request template 2 | 3 | ### Code contributions 4 | Major - and most appreciated - contribution is pull request with feature or bug fix. Each pull request initiates discussion about your code contribution. 5 | 6 | Each pull request should be provided with minimal description about its contents. 7 | # 8 | 9 | Thanks! 
def get_downsample_pad(stride, kernel, dilation=1):
    """Return the per-side padding for a strided (downsampling) convolution.

    The padding is chosen so that a convolution with the given ``stride``,
    ``kernel`` size and ``dilation`` maps an input of length ``n`` to an
    output of length ``ceil(n / stride)``.

    Args:
        stride: convolution stride.
        kernel: kernel size along the padded dimension.
        dilation: spacing between kernel elements (1 means a dense kernel).

    Returns:
        Padding, as an ``int``, to apply on each side of the input.
    """
    # A dilated kernel spans dilation * (kernel - 1) + 1 input positions.
    # The previous expression used dilation * kernel - 1, which is only
    # correct for dilation == 1 and over-pads for larger dilations.
    effective_kernel = dilation * (kernel - 1) + 1
    return int(math.ceil((effective_kernel - stride) / 2))
class Averager:
    """Running arithmetic mean of the values passed to ``send``."""

    def __init__(self):
        # Sum of everything sent so far and the number of samples, kept as
        # floats (the same state ``reset`` restores).
        self.current_total = 0.0
        self.iterations = 0.0

    def send(self, value):
        """Add ``value`` to the running total and count one more sample."""
        self.current_total += value
        self.iterations += 1

    @property
    def value(self):
        """Mean of the samples sent so far, or ``0`` if none were sent."""
        if self.iterations != 0:
            return 1.0 * self.current_total / self.iterations
        return 0

    def reset(self):
        """Forget every sample accumulated so far."""
        self.current_total, self.iterations = 0.0, 0.0
20 | 21 | Jakub & Kamil, 22 | 23 | *core contributors to the [minerva.ml](https://minerva.ml)* 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 neptune.ml 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | long_description = ''' 4 | Steppy-toolkit is complementary to the steppy library. 5 | 6 | The goal of this package is to provide data scientist 7 | with curated collection of highly parameterizable implementations of neural networks 8 | together with a number of pre- and post-processing routines. 
9 | 10 | Steppy-toolkit offers implementations in popular frameworks, such as PyTorch, Keras and scikit-learn. 11 | 12 | Steppy-toolkit is compatible with Python>=3.5 13 | and is distributed under the MIT license. 14 | ''' 15 | 16 | setup(name='steppy-toolkit', 17 | packages=find_packages(), 18 | version='0.1.14', 19 | description='Set of tools to make your work with steppy faster and more effective.', 20 | long_description=long_description, 21 | url='https://github.com/minerva-ml/steppy-toolkit', 22 | download_url='https://github.com/minerva-ml/steppy-toolkit/archive/0.1.14.tar.gz', 23 | author='Kamil A. Kaczmarek, Jakub Czakon', 24 | author_email='kamil.kaczmarek@neptune.ml, jakub.czakon@neptune.ml', 25 | keywords=['machine-learning', 'reproducibility', 'pipeline', 'tools'], 26 | license='MIT', 27 | install_requires=[ 28 | 'neptune-cli>=2.8.17', 29 | 'setuptools>=39.2.0', 30 | 'steppy>=0.1.15'], 31 | zip_safe=False, 32 | classifiers=[]) 33 | -------------------------------------------------------------------------------- /toolkit/resources/apostrophes.json: -------------------------------------------------------------------------------- 1 | { 2 | "arent": "are not", 3 | "cant": "cannot", 4 | "couldnt": "could not", 5 | "didnt": "did not", 6 | "doesnt": "does not", 7 | "dont": "do not", 8 | "hadnt": "had not", 9 | "hasnt": "has not", 10 | "havent": "have not", 11 | "hed": "he would", 12 | "hell": "he will", 13 | "hes": "he is", 14 | "id": "I had", 15 | "ill": "I will", 16 | "im": "I am", 17 | "isnt": "is not", 18 | "its": "it is", 19 | "itll": "it will", 20 | "ive": "I have", 21 | "lets": "let us", 22 | "mightnt": "might not", 23 | "mustnt": "must not", 24 | "shant": "shall not", 25 | "shed" : "she would", 26 | "shell": "she will", 27 | "shes": "she is", 28 | "shouldnt": "should not", 29 | "thats": "that is", 30 | "theres": "there is", 31 | "theyd": "they would", 32 | "theyll": "they will", 33 | "theyre": "they are", 34 | "theyve": "they have", 35 | "wed": "we 
would", 36 | "were": "we are", 37 | "werent": "were not", 38 | "weve": "we have", 39 | "whatll": "what will", 40 | "whatre": "what are", 41 | "whats": "what is", 42 | "whatve": "what have", 43 | "wheres": "where is", 44 | "whod": "who would", 45 | "wholl": "who will", 46 | "whore": "who are", 47 | "whos": "who is", 48 | "whove": "who have", 49 | "wont": "will not", 50 | "wouldnt": "would not", 51 | "youd": "you would", 52 | "youll": "you will", 53 | "youre": "you are", 54 | "youve": "you have", 55 | "re": "are", 56 | "wasnt": "was not", 57 | "well": "will" 58 | } 59 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .pytest_cache 6 | tests/.cache 7 | 8 | # C extensions 9 | *.so 10 | 11 | # neptune, pycharm 12 | .cache 13 | .cache/ 14 | .idea/ 15 | .idea_modules/ 16 | out/ 17 | output 18 | output/ 19 | *.log 20 | target/ 21 | devbook.ipynb 22 | devbook_local.ipynb 23 | 24 | # Distribution / packaging 25 | .Python 26 | env/ 27 | build/ 28 | develop-eggs/ 29 | dist/ 30 | downloads/ 31 | eggs/ 32 | .eggs/ 33 | lib/ 34 | lib64/ 35 | parts/ 36 | sdist/ 37 | var/ 38 | wheels/ 39 | *.egg-info/ 40 | .installed.cfg 41 | *.egg 42 | 43 | # PyInstaller 44 | # Usually these files are written by a python script from a template 45 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
def from_pil(*images):
    """Convert each argument to a numpy array via ``np.array``.

    Returns the single array when exactly one image is given, otherwise a
    list with one array per argument (an empty list for no arguments).
    """
    arrays = list(map(np.array, images))
    return arrays[0] if len(arrays) == 1 else arrays
def reseed(augmenter, deterministic=True):
    """Give ``augmenter`` and all of its children a fresh random state.

    Args:
        augmenter: object exposing ``random_state``, ``deterministic`` and
            ``get_children_lists()`` (presumably an imgaug augmenter; verify
            against callers).
        deterministic: when true, mark the augmenter and every child as
            deterministic.

    Returns:
        The same ``augmenter`` object, modified in place.
    """
    augmenter.random_state = ia.new_random_state(get_seed())
    if deterministic:
        augmenter.deterministic = True

    for children in augmenter.get_children_lists():
        for child in children:
            # Forward the caller's flag. The recursion used to hard-code
            # deterministic=True, so deterministic=False only affected the
            # top-level augmenter.
            reseed(child, deterministic=deterministic)
    return augmenter
class TextAugmenter(BaseTransformer):
    """
    Augmentations by Thesaurus synonym substitution or typos

    Placeholder: this class does not define any behaviour of its own yet.
    """
    # The description above used to follow a ``pass`` statement, which made it
    # a discarded string expression instead of the class docstring.
    pass
class CatBoost(BaseTransformer):
    """Transformer wrapping a ``ctb.CatBoostClassifier``."""

    def __init__(self, **kwargs):
        """Create the wrapped classifier; ``kwargs`` go to ``CatBoostClassifier``."""
        super().__init__()
        self.estimator = ctb.CatBoostClassifier(**kwargs)

    def fit(self,
            X, y,
            X_valid, y_valid,
            feature_names=None,
            categorical_features=None,
            **kwargs):
        """Fit the classifier on ``(X, y)`` using ``(X_valid, y_valid)`` as eval set.

        ``categorical_features`` are names that are looked up in
        ``feature_names`` to obtain the positional ``cat_features`` argument.
        Extra ``kwargs`` are accepted but not used. Returns ``self``.
        """
        logger.info('Catboost, train data shape {}'.format(X.shape))
        logger.info('Catboost, validation data shape {}'.format(X_valid.shape))
        logger.info('Catboost, train labels shape {}'.format(y.shape))
        logger.info('Catboost, validation labels shape {}'.format(y_valid.shape))

        cat_positions = self._get_categorical_indices(feature_names, categorical_features)
        validation_pair = (X_valid, y_valid)
        self.estimator.fit(X, y, eval_set=validation_pair, cat_features=cat_positions)
        return self

    def transform(self, X, **kwargs):
        """Return ``{'prediction': ...}`` holding column 1 of ``predict_proba(X)``.

        Presumably the positive-class probability of a binary classifier;
        confirm against callers.
        """
        probabilities = self.estimator.predict_proba(X)
        return {'prediction': probabilities[:, 1]}

    def load(self, filepath):
        """Load model state from ``filepath`` into the estimator; returns ``self``."""
        self.estimator.load_model(filepath)
        return self

    def persist(self, filepath):
        """Save the estimator's model to ``filepath``."""
        self.estimator.save_model(filepath)

    def _get_categorical_indices(self, feature_names, categorical_features):
        """Map categorical feature names to their positions in ``feature_names``.

        Returns ``None`` when no categorical features are given.
        """
        if not categorical_features:
            return None
        positions = []
        for feature in categorical_features:
            positions.append(feature_names.index(feature))
        return positions
def get_correct_channel_name(ctx, name):
    """Return a channel name based on ``name`` that is not yet used in ``ctx``.

    Every existing channel whose name contains ``name`` is inspected and the
    run of digits at the end of its name is read as an index (0 when there
    are no trailing digits).

    Args:
        ctx: object whose ``_experiment._channels`` is an iterable of objects
            with a ``name`` attribute.
        name: desired channel name.

    Returns:
        ``name`` itself when no existing channel contains it, otherwise
        ``'<name> <highest index + 1>'``.
    """
    matching = [channel for channel in ctx._experiment._channels if name in channel.name]
    if not matching:
        return name

    # Raw string: '\d' inside a normal literal is an invalid escape sequence.
    # Splitting on non-digits leaves the trailing digit run (or '') last.
    suffixes = (re.split(r'[^\d]', channel.name)[-1] for channel in matching)
    last_id = max(int(suffix) if suffix != '' else 0 for suffix in suffixes)
    return '{} {}'.format(name, last_id + 1)
def pair_loss(y_true, y_pred):
    """Mean sigmoid of (negative score - positive score) over score pairs.

    ``y_pred`` is split into two partitions by ``y_true`` cast to int32, so
    ``y_true`` must only contain the values 0 and 1. Partition 0 is treated
    as negatives and partition 1 as positives. The two are broadcast against
    each other, presumably giving one entry per negative/positive pair;
    confirm the tensor ranks against callers.
    """
    labels = tf.cast(y_true, tf.int32)
    negatives, positives = tf.dynamic_partition(y_pred, labels, 2)
    positives_row = tf.expand_dims(positives, 0)
    negatives_col = tf.expand_dims(negatives, -1)
    return K.mean(K.sigmoid(negatives_col - positives_row))
of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at ml-team@neptune.ml. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 
def score_model(model, loss_function, datagen):
    """Average the weighted losses of ``model`` over at most ``steps`` batches.

    Args:
        model: callable applied to the input batch. With a single loss its
            result is passed to the loss as-is; with several losses it is
            zipped with the losses and the targets, one output per loss.
        loss_function: sequence of ``(name, callable, weight)`` triples. Each
            callable is invoked as ``callable(output, target)`` and its result
            is multiplied by ``weight``.
        datagen: ``(batch_gen, steps)`` pair. Every item yielded by
            ``batch_gen`` is indexed as ``data[0]`` (inputs) and ``data[1:]``
            (targets).

    Returns:
        dict mapping a loss name to its mean over the evaluated batches. The
        ``'sum'`` entry (total weighted loss) is always present; per-loss
        entries are recorded only when there is more than one loss. The dict
        is empty when ``batch_gen`` yields nothing.
    """
    batch_gen, steps = datagen
    use_cuda = torch.cuda.is_available()

    def to_variable(tensor):
        # ``volatile=True`` is kept from the original code so the block still
        # runs on the PyTorch version this file was written for.
        # NOTE(review): newer PyTorch ignores ``volatile`` and expects
        # ``torch.no_grad()`` instead -- confirm the targeted version.
        variable = Variable(tensor, volatile=True)
        return variable.cuda() if use_cuda else variable

    partial_batch_losses = {}
    batches_seen = 0
    for data in batch_gen:
        X = to_variable(data[0])
        targets_var = [to_variable(target_tensor) for target_tensor in data[1:]]

        outputs = model(X)
        if len(loss_function) == 1:
            _, loss_function_one, weight = loss_function[0]
            loss_sum = loss_function_one(outputs, targets_var[0]) * weight
        else:
            batch_losses = []
            for (name, loss_function_one, weight), output, target in zip(loss_function, outputs, targets_var):
                loss = loss_function_one(output, target) * weight
                batch_losses.append(loss)
                partial_batch_losses.setdefault(name, []).append(loss)
            loss_sum = sum(batch_losses)
        partial_batch_losses.setdefault('sum', []).append(loss_sum)

        # Stop once exactly ``steps`` batches were scored; the previous
        # ``batch_id == steps`` check let one extra batch through.
        batches_seen += 1
        if batches_seen >= steps:
            break

    # Divide by the number of batches actually evaluated so the mean stays
    # correct when the generator yields fewer batches than ``steps``.
    return {name: sum(losses) / batches_seen for name, losses in partial_batch_losses.items()}
class MetadataImageDataset(Dataset):
    """Dataset that loads images from file paths, optionally paired with targets.

    Args:
        X: indexable collection of image file paths.
        y: indexable collection of targets aligned with ``X``, or ``None``
            when no targets are available.
        image_transform: callable applied to every (augmented) image, or
            ``None`` to skip it.
        target_transform: callable applied to every target, or ``None``.
        image_augment: callable applied to the raw image before
            ``image_transform``, or ``None``.
    """

    def __init__(self, X, y, image_transform, target_transform, image_augment):
        super().__init__()
        self.X = X
        self.y = y

        self.image_transform = image_transform
        self.image_augment = image_augment
        self.target_transform = target_transform

    def load_image(self, img_filepath):
        """Read the image at ``img_filepath`` and return its pixel array divided by 255."""
        # The context manager closes the underlying file; the division is done
        # inside it so the pixel data is fully read before the handle goes away.
        with Image.open(img_filepath) as image_file:
            return np.asarray(image_file) / 255.0

    def __len__(self):
        # ``len`` agrees with ``shape[0]`` for arrays and also supports plain
        # sequences of paths.
        return len(self.X)

    def __getitem__(self, index):
        """Return ``(image, target)`` when targets were given, otherwise just the image."""
        Xi = self.load_image(self.X[index])

        if self.image_augment is not None:
            Xi = self.image_augment(Xi)
        if self.image_transform is not None:
            Xi = self.image_transform(Xi)

        if self.y is None:
            return Xi

        yi = self.y[index]
        if self.target_transform is not None:
            yi = self.target_transform(yi)
        return Xi, yi
def target_transform(y):
    """Convert a target into a ``torch.LongTensor``.

    Args:
        y: numpy array, numpy scalar or plain Python number/sequence.

    Returns:
        Tensor of type ``torch.LongTensor`` holding the values of ``y``.
    """
    # ``torch.from_numpy`` only accepts ndarrays, while indexing a 1-D label
    # array yields numpy scalars; ``np.asarray`` makes both cases work.
    return torch.from_numpy(np.asarray(y)).type(torch.LongTensor)
class XGBoost(BaseTransformer):
    """
    Accepts four dictionaries that reflect the XGBoost API:
        - dmatrix_parameters -> parameters of the xgboost.DMatrix class.
          See: https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.DMatrix
        - training_parameters -> parameters of the xgboost.train function.
          See: https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.train
        - predict_parameters -> parameters of the xgboost.Booster.predict function.
          See: https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.Booster.predict
        - booster_parameters -> parameters of the Booster.
          See: https://xgboost.readthedocs.io/en/latest/parameter.html

    Each argument may be ``None``, in which case an empty dict is used.

    Raises:
        SteppyToolkitError: if any given argument is not a dict.
    """
    def __init__(self,
                 dmatrix_parameters=None,
                 training_parameters=None,
                 predict_parameters=None,
                 booster_parameters=None):
        super().__init__()
        logger.info('initializing XGBoost transformer')

        # The previous ``isinstance(...), 'message'`` lines were bare tuple
        # expressions and never validated anything; check explicitly instead.
        named_parameters = (('dmatrix_parameters', dmatrix_parameters),
                            ('training_parameters', training_parameters),
                            ('predict_parameters', predict_parameters),
                            ('booster_parameters', booster_parameters))
        for name, value in named_parameters:
            if value is not None and not isinstance(value, dict):
                raise SteppyToolkitError('XGBoost transformer: {} must be dict, '
                                         'got {} instead'.format(name, type(value)))

        self.dmatrix_parameters = dmatrix_parameters or {}
        self.training_parameters = training_parameters or {}
        self.predict_parameters = predict_parameters or {}
        self.booster_parameters = booster_parameters or {}

    def fit(self, X, y, X_valid, y_valid):
        """Train a booster on ``(X, y)`` while evaluating on both train and validation data."""
        logger.info('XGBoost, train data shape {}'.format(X.shape))
        logger.info('XGBoost, validation data shape {}'.format(X_valid.shape))
        logger.info('XGBoost, train labels shape {}'.format(y.shape))
        logger.info('XGBoost, validation labels shape {}'.format(y_valid.shape))

        train = xgb.DMatrix(data=X,
                            label=y,
                            **self.dmatrix_parameters)
        valid = xgb.DMatrix(data=X_valid,
                            label=y_valid,
                            **self.dmatrix_parameters)
        self.estimator = xgb.train(params=self.booster_parameters,
                                   dtrain=train,
                                   evals=[(train, 'train'), (valid, 'valid')],
                                   **self.training_parameters)
        return self

    def transform(self, X, y=None, **kwargs):
        """Predict with the trained booster; returns ``{'prediction': ...}``."""
        X_DMatrix = xgb.DMatrix(X, label=y, **self.dmatrix_parameters)
        prediction = self.estimator.predict(X_DMatrix, **self.predict_parameters)
        return {'prediction': prediction}

    def load(self, filepath):
        """Create a booster from ``booster_parameters`` and load its model from ``filepath``."""
        self.estimator = xgb.Booster(params=self.booster_parameters)
        self.estimator.load_model(filepath)
        return self

    def persist(self, filepath):
        """Save the trained booster to ``filepath``."""
        self.estimator.save_model(filepath)
def load_glove_embeddings(filepath, tokenizer, max_features, embedding_size):
    """Build an embedding matrix for the tokenizer vocabulary from a GloVe text file.

    Every line of the file is expected to be ``word v1 v2 ... vN`` separated
    by single spaces. Rows of the result are indexed by
    ``tokenizer.word_index`` values; rows without a pretrained vector keep a
    random initialisation drawn from a normal distribution with the mean and
    standard deviation of all loaded vectors.

    Args:
        filepath: path to the GloVe text file (read as UTF-8).
        tokenizer: object exposing ``word_index``, a ``{word: index}`` dict.
        max_features: upper bound on the number of rows of the matrix.
        embedding_size: dimensionality of the vectors; lines with a different
            number of values are skipped.

    Returns:
        Array of shape ``(min(max_features, len(word_index) + 1), embedding_size)``.
    """
    embeddings_index = {}
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            # Note: use split(' ') instead of split() if you get an error.
            values = line.rstrip().split(' ')
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            # Blank or malformed lines would otherwise break ``np.stack``.
            if coefs.shape[0] != embedding_size:
                continue
            embeddings_index[word] = coefs

    # ``np.stack`` needs a real sequence; a dict view is rejected by newer NumPy.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()

    word_index = tokenizer.word_index
    # One extra row so the largest index fits when the vocabulary is smaller
    # than ``max_features``; the previous ``len(word_index)`` rows raised
    # IndexError for the last word of a 1-based index.
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embedding_size))
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix
class LightGBM(BaseTransformer):
    """
    Accepts three dictionaries that reflects LightGBM API:
        - booster_parameters  -> parameters of the Booster
          See: https://lightgbm.readthedocs.io/en/latest/Parameters.html
        - dataset_parameters  -> parameters of the lightgbm.Dataset class
          See: https://lightgbm.readthedocs.io/en/latest/Python-API.html#data-structure-api
        - training_parameters -> parameters of the lightgbm.train function
          See: https://lightgbm.readthedocs.io/en/latest/Python-API.html#training-api

    Each argument may be ``None``, in which case an empty dict is used.

    Raises:
        SteppyToolkitError: if any given argument is not a dict.
    """
    def __init__(self,
                 booster_parameters=None,
                 dataset_parameters=None,
                 training_parameters=None):
        super().__init__()
        logger.info('initializing LightGBM transformer')

        # The previous ``isinstance(...), 'message'`` lines were bare tuple
        # expressions and never validated anything; check explicitly instead.
        named_parameters = (('booster_parameters', booster_parameters),
                            ('dataset_parameters', dataset_parameters),
                            ('training_parameters', training_parameters))
        for name, value in named_parameters:
            if value is not None and not isinstance(value, dict):
                raise SteppyToolkitError('LightGBM transformer: {} must be dict, '
                                         'got {} instead'.format(name, type(value)))

        self.booster_parameters = booster_parameters or {}
        self.dataset_parameters = dataset_parameters or {}
        self.training_parameters = training_parameters or {}

    def fit(self, X, y, X_valid, y_valid):
        """Train a booster on ``(X, y)``, evaluating on both the train and validation sets."""
        self._check_target_shape_and_type(y, 'y')
        self._check_target_shape_and_type(y_valid, 'y_valid')
        y = self._format_target(y)
        y_valid = self._format_target(y_valid)

        logger.info('LightGBM transformer, train data shape        {}'.format(X.shape))
        logger.info('LightGBM transformer, validation data shape   {}'.format(X_valid.shape))
        logger.info('LightGBM transformer, train labels shape      {}'.format(y.shape))
        logger.info('LightGBM transformer, validation labels shape {}'.format(y_valid.shape))

        data_train = lgb.Dataset(data=X,
                                 label=y,
                                 **self.dataset_parameters)
        data_valid = lgb.Dataset(data=X_valid,
                                 label=y_valid,
                                 **self.dataset_parameters)
        self.estimator = lgb.train(params=self.booster_parameters,
                                   train_set=data_train,
                                   valid_sets=[data_train, data_valid],
                                   valid_names=['data_train', 'data_valid'],
                                   **self.training_parameters)
        return self

    def transform(self, X, **kwargs):
        """Predict with the trained booster; returns ``{'prediction': ...}``."""
        prediction = self.estimator.predict(X)
        return {'prediction': prediction}

    def load(self, filepath):
        """Restore the estimator previously stored with :meth:`persist`."""
        self.estimator = joblib.load(filepath)
        return self

    def persist(self, filepath):
        """Dump the trained estimator to ``filepath`` with joblib."""
        joblib.dump(self.estimator, filepath)

    def _check_target_shape_and_type(self, target, name):
        """Ensure ``target`` is a Series, ndarray or list and, unless a list, 1-D.

        Raises:
            SteppyToolkitError: on an unsupported type.
            AssertionError: when a Series/ndarray is not one-dimensional.
        """
        if not isinstance(target, (pd.Series, np.ndarray, list)):
            msg = '"target" must be "numpy.ndarray" or "Pandas.Series" or "list", got {} instead.'.format(type(target))
            raise SteppyToolkitError(msg)
        if not isinstance(target, list):
            assert len(target.shape) == 1, '"{}" must be 1-D. It is {}-D instead.'.format(name, len(target.shape))

    def _format_target(self, target):
        """Return ``target`` as a numpy array.

        Raises:
            TypeError: when ``target`` is not a Series, ndarray or list.
        """
        if isinstance(target, pd.Series):
            return target.values
        if isinstance(target, np.ndarray):
            return target
        if isinstance(target, list):
            return np.array(target)
        raise TypeError(
            '"target" must be "numpy.ndarray" or "Pandas.Series" or "list", got {} instead.'.format(
                type(target)))
@property
def estimator(self):
    """Estimator factory called once per label by ``_get_estimators``.

    Subclasses override this property to return the estimator class that is
    instantiated with the keyword arguments given to the constructor.

    Raises:
        NotImplementedError: always, in this base implementation.
    """
    # Returning the exception class (as before) made ``self.estimator(**kwargs)``
    # silently build exception objects instead of failing at this point.
    raise NotImplementedError('subclasses must provide the estimator property')
def predict_proba(self, X):
    """Turn decision-function values into two-column pseudo-probabilities.

    The value of ``self.decision_function(X)`` is passed through
    ``self._platt_func`` to obtain the positive-class column; the
    negative-class column is its complement.

    Args:
        X: samples forwarded to ``self.decision_function``.

    Returns:
        Array of shape ``(n, 2)`` with columns ``[negative, positive]``, where
        ``n`` is the number of decision values.
        NOTE(review): the reshape assumes one decision value per sample
        (binary problem) -- confirm before using with more classes.
    """
    raw_predictions = np.asarray(self.decision_function(X))
    # ``_platt_func`` is built on ``np.exp`` and is applied to the whole array.
    prob_positive = self._platt_func(raw_predictions).reshape(-1, 1)
    # The former normalisation divided each single-column row by its own sum,
    # which forced the positive probability to 1.0 for every sample.
    prob_negative = 1.0 - prob_positive
    return np.hstack([prob_negative, prob_positive])
as np 7 | import pandas as pd 8 | from nltk.corpus import stopwords 9 | from nltk.stem.wordnet import WordNetLemmatizer 10 | from nltk.tokenize import TweetTokenizer 11 | from sklearn.externals import joblib 12 | from steppy.base import BaseTransformer 13 | 14 | lem = WordNetLemmatizer() 15 | tokenizer = TweetTokenizer() 16 | 17 | nltk.download('wordnet') 18 | nltk.download('stopwords') 19 | 20 | eng_stopwords = set(stopwords.words("english")) 21 | with open('steps/resources/apostrophes.json', 'r') as f: 22 | APOSTROPHES_WORDS = json.load(f) 23 | 24 | 25 | class WordListFilter(BaseTransformer): 26 | def __init__(self, word_list_filepath): 27 | super().__init__() 28 | self.word_set = self._read_data(word_list_filepath) 29 | 30 | def transform(self, X): 31 | X = self._transform(X) 32 | return {'X': X} 33 | 34 | def _transform(self, X): 35 | X = pd.DataFrame(X, columns=['text']).astype(str) 36 | X['text'] = X['text'].apply(self._filter_words) 37 | return X['text'].values 38 | 39 | def _filter_words(self, x): 40 | x = x.lower() 41 | x = ' '.join([w for w in x.split() if w in self.word_set]) 42 | return x 43 | 44 | def _read_data(self, filepath): 45 | with open(filepath, 'r+') as f: 46 | data = f.read() 47 | return set(data.split('\n')) 48 | 49 | def load(self, filepath): 50 | return self 51 | 52 | def persist(self, filepath): 53 | joblib.dump({}, filepath) 54 | 55 | 56 | class TextCleaner(BaseTransformer): 57 | def __init__(self, 58 | drop_punctuation, 59 | drop_newline, 60 | drop_multispaces, 61 | all_lower_case, 62 | fill_na_with, 63 | deduplication_threshold, 64 | apostrophes, 65 | use_stopwords): 66 | super().__init__() 67 | self.drop_punctuation = drop_punctuation 68 | self.drop_newline = drop_newline 69 | self.drop_multispaces = drop_multispaces 70 | self.all_lower_case = all_lower_case 71 | self.fill_na_with = fill_na_with 72 | self.deduplication_threshold = deduplication_threshold 73 | self.apostrophes = apostrophes 74 | self.use_stopwords = use_stopwords 75 | 
def transform(self, X):
    """Clean every text of ``X`` with ``self._transform``.

    Args:
        X: data accepted by ``pandas.DataFrame(X, columns=['text'])``.

    Returns:
        ``{'X': array}`` where ``array`` is an object array of cleaned
        strings. When ``self.fill_na_with`` is truthy, missing entries are
        replaced by that value as-is; otherwise they are converted with
        ``str`` and cleaned like any other text.
    """
    column = pd.DataFrame(X, columns=['text'])['text']
    fill_value = self.fill_na_with

    # Missing entries are detected before the string conversion: afterwards
    # they are ordinary strings, so the former ``fillna`` never matched.
    cleaned = [
        fill_value if (fill_value and is_missing) else self._transform(str(value))
        for value, is_missing in zip(column, column.isnull())
    ]
    return {'X': np.array(cleaned, dtype=object)}
load(self, filepath): 141 | params = joblib.load(filepath) 142 | self.drop_punctuation = params['drop_punctuation'] 143 | self.all_lower_case = params['all_lower_case'] 144 | self.fill_na_with = params['fill_na_with'] 145 | return self 146 | 147 | def persist(self, filepath): 148 | params = {'drop_punctuation': self.drop_punctuation, 149 | 'all_lower_case': self.all_lower_case, 150 | 'fill_na_with': self.fill_na_with, 151 | } 152 | joblib.dump(params, filepath) 153 | 154 | 155 | class TextCounter(BaseTransformer): 156 | def transform(self, X): 157 | X = pd.DataFrame(X, columns=['text']).astype(str) 158 | X = X['text'].apply(self._transform) 159 | X['caps_vs_length'] = self._caps_vs_length(X) 160 | X['num_symbols'] = X['text'].apply(lambda comment: sum(comment.count(w) for w in '*&$%')) 161 | X['num_words'] = X['text'].apply(lambda comment: len(comment.split())) 162 | X['num_unique_words'] = X['text'].apply(lambda comment: len(set(w for w in comment.split()))) 163 | X['words_vs_unique'] = self._words_vs_unique(X) 164 | X['mean_word_len'] = X['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()])) 165 | X.drop('text', axis=1, inplace=True) 166 | X.fillna(0.0, inplace=True) 167 | return {'X': X} 168 | 169 | def _transform(self, x): 170 | features = {} 171 | features['text'] = x 172 | features['char_count'] = char_count(x) 173 | features['word_count'] = word_count(x) 174 | features['punctuation_count'] = punctuation_count(x) 175 | features['upper_case_count'] = upper_case_count(x) 176 | features['lower_case_count'] = lower_case_count(x) 177 | features['digit_count'] = digit_count(x) 178 | features['space_count'] = space_count(x) 179 | features['newline_count'] = newline_count(x) 180 | return pd.Series(features) 181 | 182 | def _caps_vs_length(self, X): 183 | try: 184 | return X.apply(lambda row: float(row['upper_case_count']) / float(row['char_count']), axis=1) 185 | except ZeroDivisionError: 186 | return 0 187 | 188 | def _words_vs_unique(self, X): 189 | 
def occurrence(s1, s2):
    """Count the items of ``s1`` that are contained in ``s2``.

    Args:
        s1: iterable whose items are tested (e.g. the characters of a text).
        s2: container checked with ``in`` (e.g. a string of punctuation).

    Returns:
        Number of items of ``s1`` for which ``item in s2`` holds.
    """
    # A generator avoids materialising a throwaway list just to sum it.
    return sum(1 for x in s1 if x in s2)
@property 35 | def output_names(self): 36 | return [name for (name, func, weight) in self.loss_function] 37 | 38 | def _initialize_model_weights(self): 39 | logger.info('initializing model weights...') 40 | weights_init_config = self.architecture_config['weights_init'] 41 | 42 | if weights_init_config['function'] == 'normal': 43 | weights_init_func = partial(init_weights_normal, **weights_init_config['params']) 44 | elif weights_init_config['function'] == 'xavier': 45 | weights_init_func = init_weights_xavier 46 | elif weights_init_config['function'] == 'he': 47 | weights_init_func = init_weights_he 48 | else: 49 | raise NotImplementedError 50 | 51 | self.model.apply(weights_init_func) 52 | 53 | def fit(self, datagen, validation_datagen=None): 54 | self._initialize_model_weights() 55 | 56 | if torch.cuda.is_available(): 57 | self.model = self.model.cuda() 58 | 59 | self.callbacks.set_params(self, validation_datagen=validation_datagen) 60 | self.callbacks.on_train_begin() 61 | 62 | batch_gen, steps = datagen 63 | for epoch_id in range(self.training_config['epochs']): 64 | self.callbacks.on_epoch_begin() 65 | for batch_id, data in enumerate(batch_gen): 66 | self.callbacks.on_batch_begin() 67 | metrics = self._fit_loop(data) 68 | self.callbacks.on_batch_end(metrics=metrics) 69 | if batch_id == steps: 70 | break 71 | self.callbacks.on_epoch_end() 72 | if self.callbacks.training_break(): 73 | break 74 | self.callbacks.on_train_end() 75 | return self 76 | 77 | def _fit_loop(self, data): 78 | X = data[0] 79 | targets_tensors = data[1:] 80 | 81 | if torch.cuda.is_available(): 82 | X = Variable(X).cuda() 83 | targets_var = [] 84 | for target_tensor in targets_tensors: 85 | targets_var.append(Variable(target_tensor).cuda()) 86 | else: 87 | X = Variable(X) 88 | targets_var = [] 89 | for target_tensor in targets_tensors: 90 | targets_var.append(Variable(target_tensor)) 91 | 92 | self.optimizer.zero_grad() 93 | outputs_batch = self.model(X) 94 | partial_batch_losses = {} 95 | 
96 | assert len(targets_tensors) == len(outputs_batch) == len(self.loss_function),\ 97 | '''Number of targets, model outputs and elements of loss function must equal. 98 | You have n_targets={0}, n_model_outputs={1}, n_loss_function_elements={2}. 99 | The order of elements must also be preserved.'''.format(len(targets_tensors), 100 | len(outputs_batch), 101 | len(self.loss_function)) 102 | 103 | if len(self.output_names) == 1: 104 | for (name, loss_function, weight), target in zip(self.loss_function, targets_var): 105 | batch_loss = loss_function(outputs_batch, target) * weight 106 | else: 107 | for (name, loss_function, weight), output, target in zip(self.loss_function, outputs_batch, targets_var): 108 | partial_batch_losses[name] = loss_function(output, target) * weight 109 | batch_loss = sum(partial_batch_losses.values()) 110 | partial_batch_losses['sum'] = batch_loss 111 | batch_loss.backward() 112 | self.optimizer.step() 113 | 114 | return partial_batch_losses 115 | 116 | def _transform(self, datagen, validation_datagen=None): 117 | self.model.eval() 118 | batch_gen, steps = datagen 119 | outputs = {} 120 | for batch_id, data in enumerate(batch_gen): 121 | if isinstance(data, list): 122 | X = data[0] 123 | else: 124 | X = data 125 | 126 | if torch.cuda.is_available(): 127 | X = Variable(X, volatile=True).cuda() 128 | else: 129 | X = Variable(X, volatile=True) 130 | 131 | outputs_batch = self.model(X) 132 | if len(self.output_names) == 1: 133 | outputs.setdefault(self.output_names[0], []).append(outputs_batch.data.cpu().numpy()) 134 | else: 135 | for name, output in zip(self.output_names, outputs_batch): 136 | output_ = output.data.cpu().numpy() 137 | outputs.setdefault(name, []).append(output_) 138 | if batch_id == steps: 139 | break 140 | self.model.train() 141 | outputs = {'{}_prediction'.format(name): np.vstack(outputs_) for name, outputs_ in outputs.items()} 142 | return outputs 143 | 144 | def transform(self, datagen, validation_datagen=None): 145 | 
def init_weights_xavier(model):
    """Apply Xavier-normal initialisation to a ``Conv2d`` module.

    Intended for ``nn.Module.apply``: modules that are not ``nn.Conv2d`` are
    left untouched. The bias, when the layer has one, is set to zero.

    Args:
        model: Module visited by ``apply``.
    """
    if not isinstance(model, nn.Conv2d):
        return

    # Prefer the in-place (underscore) initialisers and fall back to the
    # legacy names on torch releases that predate them.
    xavier_normal = getattr(init, 'xavier_normal_', None) or init.xavier_normal
    constant = getattr(init, 'constant_', None) or init.constant

    xavier_normal(model.weight)
    # Layers built with bias=False have `bias is None`.
    if model.bias is not None:
        constant(model.bias, 0)
class PredictionAverage(BaseTransformer):
    """Combine the predictions of several models along the first axis.

    Args:
        weights: Optional sequence with one weight per model. When given, the
            result is the weighted sum of the predictions (the weights are
            applied as-is, without normalisation); otherwise the plain mean.
    """

    def __init__(self, weights=None):
        super().__init__()
        self.weights = weights

    def transform(self, prediction_proba_list):
        """Blend the stacked predictions.

        Args:
            prediction_proba_list: Array-like whose first axis enumerates the
                models. The input is never modified.

        Returns:
            dict: ``{'prediction_probability': blended}``.
        """
        # asarray accepts plain lists and gives us `.shape`; multiplying into
        # a new array (instead of `*=`) leaves the caller's data untouched and
        # works for integer inputs too.
        predictions = np.asarray(prediction_proba_list)
        if self.weights is not None:
            reshaped_weights = self._reshape_weights(predictions.shape)
            avg_pred = np.sum(predictions * reshaped_weights, axis=0)
        else:
            avg_pred = np.mean(predictions, axis=0)
        return {'prediction_probability': avg_pred}

    def load(self, filepath):
        """Restore ``weights`` from ``filepath`` and return ``self``."""
        params = joblib.load(filepath)
        self.weights = params['weights']
        return self

    def persist(self, filepath):
        """Store ``weights`` at ``filepath``."""
        joblib.dump({'weights': self.weights}, filepath)

    def _reshape_weights(self, prediction_shape):
        """Return the weights shaped ``(-1, 1, ..., 1)`` to broadcast over predictions."""
        dim = len(prediction_shape)
        reshape_dim = (-1,) + (1,) * (dim - 1)
        return np.array(self.weights).reshape(reshape_dim)
class BlendingOptimizer(BaseTransformer):
    """Class for optimizing the weights in blending of different models' predictions.

    Args:
        metric (Callable): Callable metric function to optimize.
        maximize (bool): default True. Boolean indicating whether the `metric` needs to be maximized or minimized.
        power (float): default 1.0. Power to apply on each models' predictions before blending.
    Example:
        >>> from sklearn.metrics import mean_absolute_error
        >>> y = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
        >>> p_model_1 = [0.11, 0.19, 0.25, 0.37, 0.55, 0.62, 0.78, 0.81, 0.94]
        >>> p_model_2 = [0.07, 0.21, 0.29, 0.33, 0.53, 0.54, 0.74, 0.74, 0.91]
        >>> preds = [p_model_1, p_model_2]
        >>> opt = BlendingOptimizer(metric=mean_absolute_error, maximize=False)
        >>> opt.fit(X=preds, y=y)
        >>> y_pred = opt.transform(X=preds)['y_pred']
        >>> print('MAE 1: {:0.3f}'.format(mean_absolute_error(y, p_model_1)))
        >>> print('MAE 2: {:0.3f}'.format(mean_absolute_error(y, p_model_2)))
        >>> print('MAE blended: {:0.3f}'.format(mean_absolute_error(y, y_pred)))
    """

    def __init__(self, metric, maximize=True, power=1.0):
        super().__init__()
        self.metric = metric
        self.maximize = maximize
        self._power = power
        self._score = None
        self._weights = None

    def fit(self, X, y, step_size=0.1, init_weights=None, warm_start: bool=False):
        """Fit the weights on the given predictions.

        Greedy coordinate search: in every round the single weight change of
        ``+step_size`` or ``-step_size`` that improves the metric the most is
        applied; the search stops when no single change is a strict improvement.

        Args:
            X (array-like): Predictions of different models for the labels.
            y (array-like): Labels.
            step_size (float): Step size for optimizing the weights.
                Smaller step sizes most likely improve resulting score but increases training time.
            init_weights (array-like): Initial weights for training. The given object is not modified.
                When `warm_start` is used `init_weights` are ignored.
            warm_start (bool): Continues training. Will only work when `fit` has been called with this object earlier.
                When `warm_start` is used `init_weights` are ignored.
        Returns: self
        """
        assert len(np.shape(X)) == 2, 'X must be 2-dimensional, got {}-D instead.'.format(len(np.shape(X)))
        assert np.shape(X)[0] > 1, 'X must contain predictions from at least two models. ' \
                                   'Got {} instead'.format(np.shape(X)[0])

        assert np.shape(X)[1] == len(y), (
            'BlendingOptimizer: Length of predictions and labels does not match: '
            'preds_len={}, y_len={}'.format(np.shape(X)[1], len(y)))

        # Always work on a private float copy so neither the caller's
        # `init_weights` nor a previous fit result is mutated while searching.
        if warm_start:
            assert self._weights is not None, 'Optimizer has to be fitted before `warm_start` can be used.'
            weights = np.array(self._weights, dtype=float)
        elif init_weights is None:
            weights = np.ones(len(X), dtype=float)
        else:
            assert (len(init_weights) == np.shape(X)[0]), (
                'BlendingOptimizer: Number of models to blend its predictions and weights does not match: '
                'n_models={}, weights_len={}'.format(np.shape(X)[0], len(init_weights)))
            weights = np.array(init_weights, dtype=float)

        # The powered predictions do not depend on the weights: compute once.
        powered = np.power(X, self._power)
        inverse_power = 1.0 / self._power

        def _blend_score(candidate_weights):
            blended = np.average(powered, weights=candidate_weights, axis=0) ** inverse_power
            return self.metric(y, blended)

        def _is_better(candidate, reference):
            # Strict comparison in both directions, so that ties never count
            # as progress and the search is guaranteed to stop on a plateau.
            return candidate > reference if self.maximize else candidate < reference

        n_models = len(weights)
        best_score = _blend_score(weights)
        while True:
            best_index, best_step = -1, 0.0
            for j in range(n_models):
                delta = np.zeros(n_models)
                delta[j] = step_size

                s = _blend_score(weights + delta)
                if _is_better(s, best_score):
                    best_index, best_score, best_step = j, s, step_size
                    continue

                lowered = weights - delta
                # Weights must stay non-negative and np.average needs a
                # non-zero weight sum.
                if lowered[j] >= 0 and lowered.sum() > 0:
                    s = _blend_score(lowered)
                    if _is_better(s, best_score):
                        best_index, best_score, best_step = j, s, -step_size

            if best_index < 0:
                break
            weights[best_index] += best_step

        self._weights = weights
        self._score = best_score

        return self

    def transform(self, X):
        """Performs predictions blending using the trained weights.

        Args:
            X (array-like): Predictions of different models.
        Returns: dict with blended predictions (key is 'y_pred').
        """
        assert self._weights is not None, 'BlendingOptimizer: `fit` or `load` has to be called before `transform`.'
        assert np.shape(X)[0] == len(self._weights), (
            'BlendingOptimizer: Number of models to blend its predictions and weights does not match: '
            'n_models={}, weights_len={}'.format(np.shape(X)[0], len(self._weights)))
        blended_predictions = np.average(np.power(X, self._power),
                                         weights=self._weights,
                                         axis=0) ** (1.0 / self._power)

        return {'y_pred': blended_predictions}

    def fit_transform(self, X, y, step_size=0.1, init_weights=None, warm_start=False):
        """Fit optimizer to X, then transforms X. See `fit` and `transform` for further explanation."""
        self.fit(X=X, y=y, step_size=step_size, init_weights=init_weights, warm_start=warm_start)

        return self.transform(X=X)

    def load(self, filepath):
        """Restore metric, direction, power, score and weights from ``filepath``."""
        params = joblib.load(filepath)
        self.metric = params['metric']
        self.maximize = params['maximize']
        self._power = params['power']
        self._score = params['score']
        self._weights = params['weights']
        return self

    def persist(self, filepath):
        """Store metric, direction, power, score and weights at ``filepath``."""
        joblib.dump({'metric': self.metric,
                     'maximize': self.maximize,
                     'power': self._power,
                     'score': self._score,
                     'weights': self._weights},
                    filepath)
y_columns 17 | self.columns_to_get = None 18 | self.target_columns = None 19 | 20 | def transform(self, meta, train_mode): 21 | X = meta[self.x_columns].values 22 | if train_mode: 23 | y = meta[self.y_columns].values 24 | else: 25 | y = None 26 | 27 | return {'X': X, 28 | 'y': y} 29 | 30 | def load(self, filepath): 31 | params = joblib.load(filepath) 32 | self.columns_to_get = params['x_columns'] 33 | self.target_columns = params['y_columns'] 34 | return self 35 | 36 | def persist(self, filepath): 37 | params = {'x_columns': self.x_columns, 38 | 'y_columns': self.y_columns 39 | } 40 | joblib.dump(params, filepath) 41 | 42 | 43 | class TfIdfVectorizer(BaseTransformer): 44 | def __init__(self, **kwargs): 45 | super().__init__() 46 | self.vectorizer = text.TfidfVectorizer(**kwargs) 47 | 48 | def fit(self, text): 49 | self.vectorizer.fit(text) 50 | return self 51 | 52 | def transform(self, text): 53 | return {'features': self.vectorizer.transform(text)} 54 | 55 | def load(self, filepath): 56 | self.vectorizer = joblib.load(filepath) 57 | return self 58 | 59 | def persist(self, filepath): 60 | joblib.dump(self.vectorizer, filepath) 61 | 62 | 63 | class TruncatedSVD(BaseTransformer): 64 | def __init__(self, **kwargs): 65 | super().__init__() 66 | self.truncated_svd = decomposition.TruncatedSVD(**kwargs) 67 | 68 | def fit(self, features): 69 | self.truncated_svd.fit(features) 70 | return self 71 | 72 | def transform(self, features): 73 | return {'features': self.truncated_svd.transform(features)} 74 | 75 | def load(self, filepath): 76 | self.truncated_svd = joblib.load(filepath) 77 | return self 78 | 79 | def persist(self, filepath): 80 | joblib.dump(self.truncated_svd, filepath) 81 | 82 | 83 | class Steppy_Normalizer(BaseTransformer): 84 | def __init__(self): 85 | super().__init__() 86 | self.normalizer = Normalizer() 87 | 88 | def fit(self, X): 89 | self.normalizer.fit(X) 90 | return self 91 | 92 | def transform(self, X): 93 | X = self.normalizer.transform(X) 94 | 
class MinMaxScalerMultilabel(BaseTransformer):
    """Min-max scale every slice ``X[:, i, :]`` with its own scaler."""

    def __init__(self):
        super().__init__()
        self.minmax_scalers = []

    def fit(self, X):
        """Fit one ``Steppy_MinMaxScaler`` per index along axis 1 of ``X``.

        Args:
            X: Array indexed as ``X[:, i, :]``.

        Returns:
            self
        """
        # Start from an empty list so that refitting replaces the scalers
        # instead of appending a second set behind the first one.
        self.minmax_scalers = []
        for i in range(X.shape[1]):
            minmax_scaler = Steppy_MinMaxScaler()
            minmax_scaler.fit(X[:, i, :])
            self.minmax_scalers.append(minmax_scaler)
        return self

    def transform(self, X):
        """Scale every slice with the scaler fitted for it.

        Args:
            X: Array indexed as ``X[:, i, :]``; it is not modified.

        Returns:
            dict: ``{'X': scaled}`` where ``scaled`` is a float copy of ``X``.
        """
        scaled = np.array(X, dtype=float)
        for i, minmax_scaler in enumerate(self.minmax_scalers):
            # Steppy_MinMaxScaler.transform returns {'X': array}; unwrap it
            # before writing into the slice.
            scaled[:, i, :] = minmax_scaler.transform(X[:, i, :])['X']
        return {'X': scaled}

    def load(self, filepath):
        """Restore the list of fitted scalers from ``filepath``."""
        self.minmax_scalers = joblib.load(filepath)
        return self

    def persist(self, filepath):
        """Store the list of fitted scalers at ``filepath``."""
        joblib.dump(self.minmax_scalers, filepath)
class CategoricalEncoder(BaseTransformer):
    """Encode all columns of a frame with ``ce.OrdinalEncoder``."""

    def __init__(self):
        """Encode features to categorical type"""
        super().__init__()
        self.categorical_encoder = None
        self.encoder_class = ce.OrdinalEncoder

    def fit(self, X):
        """Create a fresh encoder over every column of ``X`` and fit it.

        Args:
            X: DataFrame of categorical features to encode

        Returns:
            self
        """
        column_names = list(X)
        self.categorical_encoder = self.encoder_class(cols=column_names)
        self.categorical_encoder.fit(X)
        return self

    def transform(self, X):
        """Encode ``X`` with the fitted encoder.

        Args:
            X: DataFrame of categorical features to encode

        Returns:
            Dictionary with one key - 'categorical_features' - holding the encoded features
        """
        encoded = self.categorical_encoder.transform(X)
        return dict(categorical_features=encoded)

    def load(self, filepath):
        """Replace the encoder with the one stored at ``filepath``."""
        restored = joblib.load(filepath)
        self.categorical_encoder = restored
        return self

    def persist(self, filepath):
        """Dump the current encoder to ``filepath``."""
        encoder = self.categorical_encoder
        joblib.dump(encoder, filepath)
class FeatureJoiner(BaseTransformer):
    """Concatenate all features to one DataFrame of given id_column

    Args:
        id_column: Column with id's which will be preprocessed
    """

    def __init__(self, id_column):
        super().__init__()
        self.id_column = id_column

    def transform(self, numerical_feature_list, categorical_feature_list):
        """
        Args:
            numerical_feature_list: list of numerical features
            categorical_feature_list: list of categorical features

        The inputs are not modified, so the same frames can be joined again.

        Returns:
            Dictionary with following keys:
                features: DataFrame with concatenated features
                feature_names: list of features names
                categorical_features: list of categorical feature names
        """
        numerical = [self._index_by_id(feature) for feature in numerical_feature_list]
        categorical = [self._index_by_id(feature) for feature in categorical_feature_list]

        features = pd.concat(numerical + categorical, axis=1).astype(np.float32).reset_index()

        outputs = dict()
        outputs['features'] = features
        outputs['feature_names'] = list(features.columns)
        # Names are read from the id-indexed frames so that the id column is
        # not reported as a categorical feature.
        outputs['categorical_features'] = self._get_feature_names(categorical)
        return outputs

    def _index_by_id(self, feature):
        """Return ``feature`` as a new DataFrame indexed by ``id_column``."""
        # set_index without inplace returns a new frame; the caller's object
        # keeps its id column.
        return self._format_target(feature).set_index(self.id_column, drop=True)

    def _get_feature_names(self, dataframes):
        """Collect column names of frames; objects without columns contribute ``.name``."""
        feature_names = []
        for dataframe in dataframes:
            try:
                feature_names.extend(list(dataframe.columns))
            except AttributeError:
                feature_names.append(dataframe.name)

        return feature_names

    def _format_target(self, target):
        """Wrap a Series into a single-column DataFrame; pass anything else through."""
        if isinstance(target, pd.Series):
            return pd.DataFrame(target)
        return target
callbacks_config): 12 | super().__init__() 13 | self.architecture_config = architecture_config 14 | self.training_config = training_config 15 | self.callbacks_config = callbacks_config 16 | 17 | def reset(self): 18 | self.model = self._build_model(**self.architecture_config) 19 | 20 | def _compile_model(self, model_params, optimizer_params): 21 | model = self._build_model(**model_params) 22 | optimizer = self._build_optimizer(**optimizer_params) 23 | loss = self._build_loss() 24 | model.compile(optimizer=optimizer, loss=loss) 25 | return model 26 | 27 | def _create_callbacks(self, **kwargs): 28 | raise NotImplementedError 29 | 30 | def _build_model(self, **kwargs): 31 | raise NotImplementedError 32 | 33 | def _build_optimizer(self, **kwargs): 34 | raise NotImplementedError 35 | 36 | def _build_loss(self, **kwargs): 37 | raise NotImplementedError 38 | 39 | def persist(self, filepath): 40 | checkpoint_callback = self.callbacks_config.get('model_checkpoint') 41 | if checkpoint_callback: 42 | checkpoint_filepath = checkpoint_callback['filepath'] 43 | shutil.copyfile(checkpoint_filepath, filepath) 44 | else: 45 | self.model.save(filepath) 46 | 47 | def load(self, filepath): 48 | self.model = load_model(filepath, 49 | custom_objects={'AttentionWeightedAverage': AttentionWeightedAverage}) 50 | return self 51 | 52 | 53 | class ClassifierXY(KerasModelTransformer): 54 | def fit(self, X, y, validation_data, *args, **kwargs): 55 | self.callbacks = self._create_callbacks(**self.callbacks_config) 56 | self.model = self._compile_model(**self.architecture_config) 57 | 58 | self.model.fit(X, y, 59 | validation_data=validation_data, 60 | callbacks=self.callbacks, 61 | verbose=1, 62 | **self.training_config) 63 | return self 64 | 65 | def transform(self, X, y=None, validation_data=None, *args, **kwargs): 66 | predictions = self.model.predict(X, verbose=1) 67 | return {'prediction_probability': predictions} 68 | 69 | 70 | class ClassifierGenerator(KerasModelTransformer): 71 | def 
fit(self, datagen, X, y, datagen_valid=None, X_valid=None, y_valid=None, *args, **kwargs): 72 | self.callbacks = self._create_callbacks(**self.callbacks_config) 73 | self.model = self._compile_model(**self.architecture_config) 74 | 75 | fit_args = self.training_config['fit_args'] 76 | flow_args = self.training_config['flow_args'] 77 | batch_size = flow_args['batch_size'] 78 | if X_valid is None: 79 | self.model.fit_generator( 80 | datagen.flow(X, y, **flow_args), 81 | steps_per_epoch=len(X) // batch_size, 82 | callbacks=self.callbacks, 83 | **fit_args) 84 | return self 85 | else: 86 | if datagen_valid is None: 87 | datagen_valid = datagen 88 | self.model.fit_generator( 89 | datagen.flow(X, y, **flow_args), 90 | steps_per_epoch=len(X) // batch_size, 91 | validation_data=datagen_valid.flow(X_valid, y_valid, **flow_args), 92 | validation_steps=len(X_valid) // batch_size, 93 | callbacks=self.callbacks, 94 | **fit_args) 95 | return self 96 | 97 | def transform(self, datagen, X, datagen_valid=None, X_valid=None, *args, **kwargs): 98 | flow_args = self.training_config['flow_args'] 99 | y_proba_train = self.model.predict_generator( 100 | datagen.flow(X, shuffle=False, **flow_args)) 101 | result = dict(output=y_proba_train) 102 | if X_valid is not None: 103 | if datagen_valid is None: 104 | datagen_valid = datagen 105 | y_proba_valid = self.model.predict_generator( 106 | datagen_valid.flow(X_valid, shuffle=False, **flow_args)) 107 | result.update(dict(output_valid=y_proba_valid)) 108 | return result 109 | 110 | 111 | class PretrainedEmbeddingModel(ClassifierXY): 112 | def fit(self, X, y, validation_data, embedding_matrix): 113 | X_valid, y_valid = validation_data 114 | self.callbacks = self._create_callbacks(**self.callbacks_config) 115 | self.architecture_config['model_params']['embedding_matrix'] = embedding_matrix 116 | self.model = self._compile_model(**self.architecture_config) 117 | self.model.fit(X, y, 118 | validation_data=[X_valid, y_valid], 119 | 
class CharVDCNNTransformer(ClassifierXY):
    """Classifier whose network is produced by ``vdcnn``."""

    def _build_model(self, embedding_size, maxlen, max_features,
                     filter_nr, kernel_size, repeat_block,
                     dense_size, repeat_dense, output_size, output_activation,
                     max_pooling, mean_pooling, weighted_average_attention, concat_mode,
                     dropout_embedding, conv_dropout, dense_dropout, dropout_mode,
                     conv_kernel_reg_l2, conv_bias_reg_l2,
                     dense_kernel_reg_l2, dense_bias_reg_l2,
                     use_prelu, use_batch_norm, batch_norm_first):
        """Hand all hyper-parameters to ``vdcnn`` positionally, in signature order."""
        embedding = (embedding_size, maxlen, max_features)
        convolution = (filter_nr, kernel_size, repeat_block)
        head = (dense_size, repeat_dense, output_size, output_activation)
        pooling = (max_pooling, mean_pooling, weighted_average_attention, concat_mode)
        dropout = (dropout_embedding, conv_dropout, dense_dropout, dropout_mode)
        regularization = (conv_kernel_reg_l2, conv_bias_reg_l2,
                          dense_kernel_reg_l2, dense_bias_reg_l2)
        normalization = (use_prelu, use_batch_norm, batch_norm_first)
        return vdcnn(*embedding, *convolution, *head, *pooling,
                     *dropout, *regularization, *normalization)
class WordDPCNNTransformer(PretrainedEmbeddingModel):
    """Pretrained-embedding classifier whose network is produced by ``dpcnn``."""

    def _build_model(self, embedding_matrix, embedding_size, trainable_embedding, maxlen, max_features,
                     filter_nr, kernel_size, repeat_block,
                     dense_size, repeat_dense, output_size, output_activation,
                     max_pooling, mean_pooling, weighted_average_attention, concat_mode,
                     dropout_embedding, conv_dropout, dense_dropout, dropout_mode,
                     conv_kernel_reg_l2, conv_bias_reg_l2,
                     dense_kernel_reg_l2, dense_bias_reg_l2,
                     use_prelu, use_batch_norm, batch_norm_first):
        """
        Implementation of http://ai.tencent.com/ailab/media/publications/ACL3-Brady.pdf

        All hyper-parameters are handed to ``dpcnn`` positionally, in signature order.
        """
        embedding = (embedding_matrix, embedding_size, trainable_embedding, maxlen, max_features)
        convolution = (filter_nr, kernel_size, repeat_block)
        head = (dense_size, repeat_dense, output_size, output_activation)
        pooling = (max_pooling, mean_pooling, weighted_average_attention, concat_mode)
        dropout = (dropout_embedding, conv_dropout, dense_dropout, dropout_mode)
        regularization = (conv_kernel_reg_l2, conv_bias_reg_l2,
                          dense_kernel_reg_l2, dense_bias_reg_l2)
        normalization = (use_prelu, use_batch_norm, batch_norm_first)
        return dpcnn(*embedding, *convolution, *head, *pooling,
                     *dropout, *regularization, *normalization)
dense_bias_reg_l2, 198 | use_prelu, use_batch_norm, batch_norm_first): 199 | return cudnn_lstm(embedding_matrix, embedding_size, trainable_embedding, 200 | maxlen, max_features, 201 | unit_nr, repeat_block, 202 | dense_size, repeat_dense, output_size, output_activation, 203 | max_pooling, mean_pooling, weighted_average_attention, concat_mode, 204 | dropout_embedding, rnn_dropout, dense_dropout, dropout_mode, 205 | rnn_kernel_reg_l2, rnn_recurrent_reg_l2, rnn_bias_reg_l2, 206 | dense_kernel_reg_l2, dense_bias_reg_l2, 207 | use_prelu, use_batch_norm, batch_norm_first) 208 | 209 | 210 | class WordCuDNNGRUTransformer(PretrainedEmbeddingModel): 211 | def _build_model(self, embedding_matrix, embedding_size, trainable_embedding, 212 | maxlen, max_features, 213 | unit_nr, repeat_block, 214 | dense_size, repeat_dense, output_size, output_activation, 215 | max_pooling, mean_pooling, weighted_average_attention, concat_mode, 216 | dropout_embedding, rnn_dropout, dense_dropout, dropout_mode, 217 | rnn_kernel_reg_l2, rnn_recurrent_reg_l2, rnn_bias_reg_l2, 218 | dense_kernel_reg_l2, dense_bias_reg_l2, 219 | use_prelu, use_batch_norm, batch_norm_first): 220 | return cudnn_gru(embedding_matrix, embedding_size, trainable_embedding, 221 | maxlen, max_features, 222 | unit_nr, repeat_block, 223 | dense_size, repeat_dense, output_size, output_activation, 224 | max_pooling, mean_pooling, weighted_average_attention, concat_mode, 225 | dropout_embedding, rnn_dropout, dense_dropout, dropout_mode, 226 | rnn_kernel_reg_l2, rnn_recurrent_reg_l2, rnn_bias_reg_l2, 227 | dense_kernel_reg_l2, dense_bias_reg_l2, 228 | use_prelu, use_batch_norm, batch_norm_first) 229 | -------------------------------------------------------------------------------- /toolkit/pytorch_transformers/callbacks.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, timedelta 3 | 4 | from deepsense import neptune 5 | from steppy.utils import 
get_logger
from torch.optim.lr_scheduler import ExponentialLR

from toolkit.pytorch_transformers.utils import Averager, persist_torch_model
from toolkit.pytorch_transformers.validation import score_model

logger = get_logger()


class Callback:
    """Base class for training-loop callbacks.

    Holds references to the objects of the transformer being trained and
    keeps ``epoch_id`` / ``batch_id`` counters that the default
    ``on_epoch_end`` / ``on_batch_end`` hooks increment.
    """

    def __init__(self):
        self.epoch_id = None
        self.batch_id = None

        self.model = None
        self.optimizer = None
        self.loss_function = None
        self.output_names = None
        self.validation_datagen = None
        # Declared here so the attribute exists before set_params() is called.
        self.validation_loss = None
        self.lr_scheduler = None

    def set_params(self, transformer, validation_datagen):
        """Copy model, optimizer, loss, output names and the validation-loss
        cache from ``transformer`` and remember ``validation_datagen``."""
        self.model = transformer.model
        self.optimizer = transformer.optimizer
        self.loss_function = transformer.loss_function
        self.output_names = transformer.output_names
        self.validation_datagen = validation_datagen
        self.validation_loss = transformer.validation_loss

    def on_train_begin(self, *args, **kwargs):
        """Reset the epoch and batch counters to zero."""
        self.epoch_id = 0
        self.batch_id = 0

    def on_train_end(self, *args, **kwargs):
        pass

    def on_epoch_begin(self, *args, **kwargs):
        pass

    def on_epoch_end(self, *args, **kwargs):
        """Advance the epoch counter."""
        self.epoch_id += 1

    def on_batch_begin(self, *args, **kwargs):
        pass

    def on_batch_end(self, *args, **kwargs):
        """Advance the batch counter."""
        self.batch_id += 1

    def training_break(self, *args, **kwargs):
        """Return True to request that training stops; the base never does."""
        return False

    def get_validation_loss(self):
        """Return the validation loss for the current epoch.

        The result of ``score_model`` is stored in ``self.validation_loss``
        under ``self.epoch_id`` and reused on later calls for that epoch.
        That mapping is the transformer's own object (see ``set_params``).
        """
        if self.epoch_id not in self.validation_loss:
            self.validation_loss[self.epoch_id] = score_model(self.model,
                                                              self.loss_function,
                                                              self.validation_datagen)
        return self.validation_loss[self.epoch_id]


class CallbackList:
    """Fan every hook call out to a collection of callbacks."""

    def __init__(self, callbacks=None):
        """Accept ``None`` (no callbacks), a single ``Callback`` or a collection."""
        if callbacks is None:
            self.callbacks = []
        elif isinstance(callbacks, Callback):
            self.callbacks = [callbacks]
        else:
            self.callbacks = callbacks

    def __len__(self):
        return len(self.callbacks)

    def set_params(self, *args, **kwargs):
        for callback in self.callbacks:
            callback.set_params(*args, **kwargs)

    def on_train_begin(self, *args, **kwargs):
        for callback in self.callbacks:
            callback.on_train_begin(*args, **kwargs)

    def on_train_end(self, *args, **kwargs):
        for callback in self.callbacks:
            callback.on_train_end(*args, **kwargs)

    def on_epoch_begin(self, *args, **kwargs):
        for callback in self.callbacks:
            callback.on_epoch_begin(*args, **kwargs)

    def on_epoch_end(self, *args, **kwargs):
        for callback in self.callbacks:
            callback.on_epoch_end(*args, **kwargs)

    def on_batch_begin(self, *args, **kwargs):
        for callback in self.callbacks:
            callback.on_batch_begin(*args, **kwargs)

    def on_batch_end(self, *args, **kwargs):
        for callback in self.callbacks:
            callback.on_batch_end(*args, **kwargs)

    def training_break(self, *args, **kwargs):
        """Return True when at least one callback requests a stop.

        A list (not a generator) is built on purpose so that every
        callback's ``training_break`` is always invoked.
        """
        callback_out = [callback.training_break(*args, **kwargs) for callback in self.callbacks]
        return any(callback_out)


class TrainingMonitor(Callback):
    """Log training losses every ``batch_every`` batches and the averaged
    loss every ``epoch_every`` epochs."""

    def __init__(self, epoch_every=None, batch_every=None):
        """A frequency of 0 is stored as False; ``None`` also disables logging."""
        super().__init__()
        self.epoch_loss_averagers = {}
        self.epoch_every = False if epoch_every == 0 else epoch_every
        self.batch_every = False if batch_every == 0 else batch_every

    def on_train_begin(self, *args, **kwargs):
        """Drop accumulated averagers and reset both counters."""
        self.epoch_loss_averagers = {}
        self.epoch_id = 0
        self.batch_id = 0

    def on_epoch_end(self, *args, **kwargs):
        """Log (when due) and reset the per-loss running averages."""
        for name, averager in self.epoch_loss_averagers.items():
            epoch_avg_loss = averager.value
            averager.reset()
            if self.epoch_every and ((self.epoch_id % self.epoch_every) == 0):
                logger.info('epoch {0} {1}: {2:.5f}'.format(self.epoch_id, name, epoch_avg_loss))
        self.epoch_id += 1

135 | def on_batch_end(self, metrics, *args, **kwargs): 136 | for name, loss in metrics.items(): 137 | loss = loss.data.cpu().numpy()[0] 138 | if name in self.epoch_loss_averagers.keys(): 139 | self.epoch_loss_averagers[name].send(loss) 140 | else: 141 | self.epoch_loss_averagers[name] = Averager() 142 | self.epoch_loss_averagers[name].send(loss) 143 | 144 | if self.batch_every and ((self.batch_id % self.batch_every) == 0): 145 | logger.info('epoch {0} batch {1} {2}: {3:.5f}'.format(self.epoch_id, self.batch_id, name, loss)) 146 | self.batch_id += 1 147 | 148 | 149 | class ValidationMonitor(Callback): 150 | def __init__(self, epoch_every=None, batch_every=None): 151 | super().__init__() 152 | if epoch_every == 0: 153 | self.epoch_every = False 154 | else: 155 | self.epoch_every = epoch_every 156 | if batch_every == 0: 157 | self.batch_every = False 158 | else: 159 | self.batch_every = batch_every 160 | 161 | def on_epoch_end(self, *args, **kwargs): 162 | if self.epoch_every and ((self.epoch_id % self.epoch_every) == 0): 163 | self.model.eval() 164 | val_loss = self.get_validation_loss() 165 | self.model.train() 166 | for name, loss in val_loss.items(): 167 | loss = loss.data.cpu().numpy()[0] 168 | logger.info('epoch {0} validation {1}: {2:.5f}'.format(self.epoch_id, name, loss)) 169 | self.epoch_id += 1 170 | 171 | 172 | class EarlyStopping(Callback): 173 | def __init__(self, patience, minimize=True): 174 | super().__init__() 175 | self.patience = patience 176 | self.minimize = minimize 177 | self.best_score = None 178 | self.epoch_since_best = 0 179 | self._training_break = False 180 | 181 | def on_epoch_end(self, *args, **kwargs): 182 | self.model.eval() 183 | val_loss = self.get_validation_loss() 184 | loss_sum = val_loss['sum'] 185 | loss_sum = loss_sum.data.cpu().numpy()[0] 186 | 187 | self.model.train() 188 | 189 | if not self.best_score: 190 | self.best_score = loss_sum 191 | 192 | if (self.minimize and loss_sum < self.best_score) or (not self.minimize and 
loss_sum > self.best_score): 193 | self.best_score = loss_sum 194 | self.epoch_since_best = 0 195 | else: 196 | self.epoch_since_best += 1 197 | 198 | if self.epoch_since_best > self.patience: 199 | self._training_break = True 200 | 201 | self.epoch_id += 1 202 | 203 | def training_break(self, *args, **kwargs): 204 | return self._training_break 205 | 206 | 207 | class ExponentialLRScheduler(Callback): 208 | def __init__(self, gamma, epoch_every=1, batch_every=None): 209 | super().__init__() 210 | self.gamma = gamma 211 | if epoch_every == 0: 212 | self.epoch_every = False 213 | else: 214 | self.epoch_every = epoch_every 215 | if batch_every == 0: 216 | self.batch_every = False 217 | else: 218 | self.batch_every = batch_every 219 | 220 | def set_params(self, transformer, validation_datagen): 221 | self.validation_datagen = validation_datagen 222 | self.model = transformer.model 223 | self.optimizer = transformer.optimizer 224 | self.loss_function = transformer.loss_function 225 | self.lr_scheduler = ExponentialLR(self.optimizer, self.gamma, last_epoch=-1) 226 | 227 | def on_train_begin(self, *args, **kwargs): 228 | self.epoch_id = 0 229 | self.batch_id = 0 230 | logger.info('initial lr: {0}'.format(self.optimizer.state_dict()['param_groups'][0]['initial_lr'])) 231 | 232 | def on_epoch_end(self, *args, **kwargs): 233 | if self.epoch_every and (((self.epoch_id + 1) % self.epoch_every) == 0): 234 | self.lr_scheduler.step() 235 | logger.info('epoch {0} current lr: {1}'.format(self.epoch_id + 1, 236 | self.optimizer.state_dict()['param_groups'][0]['lr'])) 237 | self.epoch_id += 1 238 | 239 | def on_batch_end(self, *args, **kwargs): 240 | if self.batch_every and ((self.batch_id % self.batch_every) == 0): 241 | self.lr_scheduler.step() 242 | logger.info('epoch {0} batch {1} current lr: {2}'.format( 243 | self.epoch_id + 1, self.batch_id + 1, self.optimizer.state_dict()['param_groups'][0]['lr'])) 244 | self.batch_id += 1 245 | 246 | 247 | class ModelCheckpoint(Callback): 
248 | def __init__(self, filepath, epoch_every=1, minimize=True): 249 | super().__init__() 250 | self.filepath = filepath 251 | self.minimize = minimize 252 | self.best_score = None 253 | 254 | if epoch_every == 0: 255 | self.epoch_every = False 256 | else: 257 | self.epoch_every = epoch_every 258 | 259 | def on_train_begin(self, *args, **kwargs): 260 | self.epoch_id = 0 261 | self.batch_id = 0 262 | os.makedirs(os.path.dirname(self.filepath), exist_ok=True) 263 | 264 | def on_epoch_end(self, *args, **kwargs): 265 | if self.epoch_every and ((self.epoch_id % self.epoch_every) == 0): 266 | self.model.eval() 267 | val_loss = self.get_validation_loss() 268 | loss_sum = val_loss['sum'] 269 | loss_sum = loss_sum.data.cpu().numpy()[0] 270 | 271 | self.model.train() 272 | 273 | if not self.best_score: 274 | self.best_score = loss_sum 275 | 276 | if (self.minimize and loss_sum < self.best_score) or (not self.minimize and loss_sum > self.best_score) or ( 277 | self.epoch_id == 0): 278 | self.best_score = loss_sum 279 | persist_torch_model(self.model, self.filepath) 280 | logger.info('epoch {0} model persisted to {1}'.format(self.epoch_id, self.filepath)) 281 | 282 | self.epoch_id += 1 283 | 284 | 285 | class NeptuneMonitor(Callback): 286 | def __init__(self, model_name): 287 | super().__init__() 288 | self.model_name = model_name 289 | self.ctx = neptune.Context() 290 | self.epoch_loss_averager = Averager() 291 | 292 | def on_train_begin(self, *args, **kwargs): 293 | self.epoch_loss_averagers = {} 294 | self.epoch_id = 0 295 | self.batch_id = 0 296 | 297 | def on_batch_end(self, metrics, *args, **kwargs): 298 | for name, loss in metrics.items(): 299 | loss = loss.data.cpu().numpy()[0] 300 | 301 | if name in self.epoch_loss_averagers.keys(): 302 | self.epoch_loss_averagers[name].send(loss) 303 | else: 304 | self.epoch_loss_averagers[name] = Averager() 305 | self.epoch_loss_averagers[name].send(loss) 306 | 307 | self.ctx.channel_send('{} batch {} loss'.format(self.model_name, 
name), x=self.batch_id, y=loss) 308 | 309 | self.batch_id += 1 310 | 311 | def on_epoch_end(self, *args, **kwargs): 312 | self._send_numeric_channels() 313 | self.epoch_id += 1 314 | 315 | def _send_numeric_channels(self, *args, **kwargs): 316 | for name, averager in self.epoch_loss_averagers.items(): 317 | epoch_avg_loss = averager.value 318 | averager.reset() 319 | self.ctx.channel_send('{} epoch {} loss'.format(self.model_name, name), x=self.epoch_id, y=epoch_avg_loss) 320 | 321 | self.model.eval() 322 | val_loss = self.get_validation_loss() 323 | self.model.train() 324 | for name, loss in val_loss.items(): 325 | loss = loss.data.cpu().numpy()[0] 326 | self.ctx.channel_send('{} epoch_val {} loss'.format(self.model_name, name), x=self.epoch_id, y=loss) 327 | 328 | 329 | class ExperimentTiming(Callback): 330 | def __init__(self, epoch_every=None, batch_every=None): 331 | super().__init__() 332 | if epoch_every == 0: 333 | self.epoch_every = False 334 | else: 335 | self.epoch_every = epoch_every 336 | if batch_every == 0: 337 | self.batch_every = False 338 | else: 339 | self.batch_every = batch_every 340 | self.batch_start = None 341 | self.epoch_start = None 342 | self.current_sum = None 343 | self.current_mean = None 344 | 345 | def on_train_begin(self, *args, **kwargs): 346 | self.epoch_id = 0 347 | self.batch_id = 0 348 | logger.info('starting training...') 349 | 350 | def on_train_end(self, *args, **kwargs): 351 | logger.info('training finished') 352 | 353 | def on_epoch_begin(self, *args, **kwargs): 354 | if self.epoch_id > 0: 355 | epoch_time = datetime.now() - self.epoch_start 356 | if self.epoch_every: 357 | if (self.epoch_id % self.epoch_every) == 0: 358 | logger.info('epoch {0} time {1}'.format(self.epoch_id - 1, str(epoch_time)[:-7])) 359 | self.epoch_start = datetime.now() 360 | self.current_sum = timedelta() 361 | self.current_mean = timedelta() 362 | logger.info('epoch {0} ...'.format(self.epoch_id)) 363 | 364 | def on_batch_begin(self, *args, 
**kwargs): 365 | if self.batch_id > 0: 366 | current_delta = datetime.now() - self.batch_start 367 | self.current_sum += current_delta 368 | self.current_mean = self.current_sum / self.batch_id 369 | if self.batch_every: 370 | if self.batch_id > 0 and (((self.batch_id - 1) % self.batch_every) == 0): 371 | logger.info('epoch {0} average batch time: {1}'.format(self.epoch_id, str(self.current_mean)[:-5])) 372 | if self.batch_every: 373 | if self.batch_id == 0 or self.batch_id % self.batch_every == 0: 374 | logger.info('epoch {0} batch {1} ...'.format(self.epoch_id, self.batch_id)) 375 | self.batch_start = datetime.now() 376 | 377 | 378 | class ReduceLROnPlateau(Callback): # thank you keras 379 | def __init__(self): 380 | super().__init__() 381 | pass 382 | -------------------------------------------------------------------------------- /toolkit/pytorch_transformers/architectures/unet.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from toolkit.pytorch_transformers.architectures.utils import get_downsample_pad, get_upsample_pad 7 | 8 | 9 | class UNet(nn.Module): 10 | def __init__(self, conv_kernel=3, 11 | pool_kernel=3, pool_stride=2, 12 | repeat_blocks=2, n_filters=8, 13 | batch_norm=True, dropout=0.1, 14 | in_channels=3, out_channels=2, 15 | kernel_scale=3, 16 | **kwargs): 17 | 18 | assert conv_kernel % 2 == 1, "Size of convolution kernel has to be an odd number. " \ 19 | "Otherwise convolution layer will not keep image size" 20 | assert pool_stride > 1 or pool_kernel % 2 == 1, "Pooling layer stride has to be greater than one or" \ 21 | "kernel of pooling layer has to be an odd number." 
22 | warnings.warn("Please make sure, that your input tensor's dimensions are divisible by " 23 | "(pool_stride ** repeat_blocks)") 24 | 25 | super(UNet, self).__init__() 26 | 27 | self.conv_kernel = conv_kernel 28 | self.conv_stride = 1 29 | self.pool_kernel = pool_kernel 30 | self.pool_stride = pool_stride 31 | self.repeat_blocks = repeat_blocks 32 | self.n_filters = n_filters 33 | self.batch_norm = batch_norm 34 | self.dropout = dropout 35 | self.in_channels = in_channels 36 | self.out_channels = out_channels 37 | self.kernel_scale = kernel_scale 38 | 39 | self.input_block = self._input_block() 40 | self.down_convs = self._down_convs() 41 | self.down_pools = self._down_pools() 42 | self.floor_block = self._floor_block() 43 | self.up_convs = self._up_convs() 44 | self.up_samples = self._up_samples() 45 | self.classification_block = self._classification_block() 46 | self.output_layer = self._output_layer() 47 | 48 | def _down_convs(self): 49 | down_convs = [] 50 | for i in range(self.repeat_blocks): 51 | in_channels = int(self.n_filters * 2 ** i) 52 | down_convs.append(DownConv(in_channels, self.conv_kernel, self.batch_norm, self.dropout)) 53 | return nn.ModuleList(down_convs) 54 | 55 | def _up_convs(self): 56 | up_convs = [] 57 | for i in range(self.repeat_blocks): 58 | in_channels = int(self.n_filters * 2 ** (i + 2)) 59 | up_convs.append(UpConv(in_channels, self.conv_kernel, self.batch_norm, self.dropout)) 60 | return nn.ModuleList(up_convs) 61 | 62 | def _down_pools(self): 63 | down_pools = [] 64 | padding = get_downsample_pad(stride=self.pool_stride, kernel=self.pool_kernel) 65 | for _ in range(self.repeat_blocks): 66 | down_pools.append(nn.MaxPool2d(kernel_size=self.pool_kernel, 67 | stride=self.pool_stride, 68 | padding=padding)) 69 | return nn.ModuleList(down_pools) 70 | 71 | def _up_samples(self): 72 | up_samples = [] 73 | kernel_scale = self.kernel_scale 74 | stride = self.pool_stride 75 | kernel_size = kernel_scale * stride 76 | padding, output_padding = 
get_upsample_pad(stride=stride, kernel=kernel_size) 77 | for i in range(self.repeat_blocks): 78 | in_channels = int(self.n_filters * 2 ** (i + 2)) 79 | out_channels = int(self.n_filters * 2 ** (i + 1)) 80 | up_samples.append(nn.ConvTranspose2d(in_channels=in_channels, 81 | out_channels=out_channels, 82 | kernel_size=kernel_size, 83 | stride=stride, 84 | padding=padding, 85 | output_padding=output_padding, 86 | bias=False 87 | )) 88 | return nn.ModuleList(up_samples) 89 | 90 | def _input_block(self): 91 | stride = self.conv_stride 92 | padding = get_downsample_pad(stride=stride, kernel=self.conv_kernel) 93 | if self.batch_norm: 94 | input_block = nn.Sequential(nn.Conv2d(in_channels=self.in_channels, out_channels=self.n_filters, 95 | kernel_size=(self.conv_kernel, self.conv_kernel), 96 | stride=stride, padding=padding), 97 | nn.BatchNorm2d(num_features=self.n_filters), 98 | nn.ReLU(), 99 | 100 | nn.Conv2d(in_channels=self.n_filters, out_channels=self.n_filters, 101 | kernel_size=(self.conv_kernel, self.conv_kernel), 102 | stride=stride, padding=padding), 103 | nn.BatchNorm2d(num_features=self.n_filters), 104 | nn.ReLU(), 105 | 106 | nn.Dropout(self.dropout), 107 | ) 108 | else: 109 | input_block = nn.Sequential(nn.Conv2d(in_channels=self.in_channels, out_channels=self.n_filters, 110 | kernel_size=(self.conv_kernel, self.conv_kernel), 111 | stride=stride, padding=padding), 112 | nn.ReLU(), 113 | 114 | nn.Conv2d(in_channels=self.n_filters, out_channels=self.n_filters, 115 | kernel_size=(self.conv_kernel, self.conv_kernel), 116 | stride=stride, padding=padding), 117 | nn.ReLU(), 118 | 119 | nn.Dropout(self.dropout), 120 | ) 121 | return input_block 122 | 123 | def _floor_block(self): 124 | in_channels = int(self.n_filters * 2 ** self.repeat_blocks) 125 | return nn.Sequential(DownConv(in_channels, self.conv_kernel, self.batch_norm, self.dropout), 126 | ) 127 | 128 | def _classification_block(self): 129 | in_block = int(2 * self.n_filters) 130 | stride = self.conv_stride 
131 | padding = get_downsample_pad(stride=stride, kernel=self.conv_kernel) 132 | 133 | if self.batch_norm: 134 | classification_block = nn.Sequential(nn.Conv2d(in_channels=in_block, out_channels=self.n_filters, 135 | kernel_size=(self.conv_kernel, self.conv_kernel), 136 | stride=stride, padding=padding), 137 | nn.BatchNorm2d(num_features=self.n_filters), 138 | nn.ReLU(), 139 | nn.Dropout(self.dropout), 140 | 141 | nn.Conv2d(in_channels=self.n_filters, out_channels=self.n_filters, 142 | kernel_size=(self.conv_kernel, self.conv_kernel), 143 | stride=stride, padding=padding), 144 | nn.BatchNorm2d(num_features=self.n_filters), 145 | nn.ReLU(), 146 | ) 147 | else: 148 | classification_block = nn.Sequential(nn.Conv2d(in_channels=in_block, out_channels=self.n_filters, 149 | kernel_size=(self.conv_kernel, self.conv_kernel), 150 | stride=stride, padding=padding), 151 | nn.ReLU(), 152 | nn.Dropout(self.dropout), 153 | 154 | nn.Conv2d(in_channels=self.n_filters, out_channels=self.n_filters, 155 | kernel_size=(self.conv_kernel, self.conv_kernel), 156 | stride=stride, padding=padding), 157 | nn.ReLU(), 158 | ) 159 | return classification_block 160 | 161 | def _output_layer(self): 162 | return nn.Conv2d(in_channels=self.n_filters, out_channels=self.out_channels, 163 | kernel_size=(1, 1), stride=1, padding=0) 164 | 165 | def forward(self, x): 166 | x = self.input_block(x) 167 | 168 | down_convs_outputs = [] 169 | for block, down_pool in zip(self.down_convs, self.down_pools): 170 | x = block(x) 171 | down_convs_outputs.append(x) 172 | x = down_pool(x) 173 | x = self.floor_block(x) 174 | 175 | for down_conv_output, block, up_sample in zip(reversed(down_convs_outputs), 176 | reversed(self.up_convs), 177 | reversed(self.up_samples)): 178 | x = up_sample(x) 179 | x = torch.cat((down_conv_output, x), dim=1) 180 | 181 | x = block(x) 182 | 183 | x = self.classification_block(x) 184 | x = self.output_layer(x) 185 | return x 186 | 187 | 188 | class UNetMultitask(UNet): 189 | def 
__init__(self, 190 | conv_kernel, 191 | pool_kernel, 192 | pool_stride, 193 | repeat_blocks, 194 | n_filters, 195 | batch_norm, 196 | dropout, 197 | in_channels, 198 | out_channels, 199 | nr_outputs): 200 | super(UNetMultitask, self).__init__(conv_kernel, 201 | pool_kernel, 202 | pool_stride, 203 | repeat_blocks, 204 | n_filters, 205 | batch_norm, 206 | dropout, 207 | in_channels, 208 | out_channels) 209 | self.nr_outputs = nr_outputs 210 | output_legs = [] 211 | for i in range(self.nr_outputs): 212 | output_legs.append(self._output_layer()) 213 | self.output_legs = nn.ModuleList(output_legs) 214 | 215 | def forward(self, x): 216 | x = self.input_block(x) 217 | 218 | down_convs_outputs = [] 219 | for block, down_pool in zip(self.down_convs, self.down_pools): 220 | x = block(x) 221 | down_convs_outputs.append(x) 222 | x = down_pool(x) 223 | x = self.floor_block(x) 224 | 225 | for down_conv_output, block, up_sample in zip(reversed(down_convs_outputs), 226 | reversed(self.up_convs), 227 | reversed(self.up_samples)): 228 | x = up_sample(x) 229 | x = torch.cat((down_conv_output, x), dim=1) 230 | 231 | x = block(x) 232 | 233 | x = self.classification_block(x) 234 | 235 | outputs = [output_leg(x) for output_leg in self.output_legs] 236 | return outputs 237 | 238 | 239 | class DownConv(nn.Module): 240 | def __init__(self, in_channels, kernel_size, batch_norm, dropout): 241 | super(DownConv, self).__init__() 242 | self.in_channels = in_channels 243 | self.block_channels = int(in_channels * 2.) 
244 | self.kernel_size = kernel_size 245 | self.batch_norm = batch_norm 246 | self.dropout = dropout 247 | self.conv_stride = 1 248 | 249 | self.down_conv = self._down_conv() 250 | 251 | def _down_conv(self): 252 | stride = self.conv_stride 253 | padding = get_downsample_pad(stride=stride, kernel=self.kernel_size) 254 | if self.batch_norm: 255 | down_conv = nn.Sequential(nn.Conv2d(in_channels=self.in_channels, out_channels=self.block_channels, 256 | kernel_size=(self.kernel_size, self.kernel_size), 257 | stride=stride, padding=padding), 258 | nn.BatchNorm2d(num_features=self.block_channels), 259 | nn.ReLU(), 260 | 261 | nn.Conv2d(in_channels=self.block_channels, out_channels=self.block_channels, 262 | kernel_size=(self.kernel_size, self.kernel_size), 263 | stride=stride, padding=padding), 264 | nn.BatchNorm2d(num_features=self.block_channels), 265 | nn.ReLU(), 266 | 267 | nn.Dropout(self.dropout), 268 | ) 269 | else: 270 | down_conv = nn.Sequential(nn.Conv2d(in_channels=self.in_channels, out_channels=self.block_channels, 271 | kernel_size=(self.kernel_size, self.kernel_size), 272 | stride=stride, padding=padding), 273 | nn.ReLU(), 274 | 275 | nn.Conv2d(in_channels=self.block_channels, out_channels=self.block_channels, 276 | kernel_size=(self.kernel_size, self.kernel_size), 277 | stride=stride, padding=padding), 278 | nn.ReLU(), 279 | 280 | nn.Dropout(self.dropout), 281 | ) 282 | return down_conv 283 | 284 | def forward(self, x): 285 | return self.down_conv(x) 286 | 287 | 288 | class UpConv(nn.Module): 289 | def __init__(self, in_channels, kernel_size, batch_norm, dropout): 290 | super(UpConv, self).__init__() 291 | self.in_channels = in_channels 292 | self.block_channels = int(in_channels / 2.) 
293 | self.kernel_size = kernel_size 294 | self.batch_norm = batch_norm 295 | self.dropout = dropout 296 | self.conv_stride = 1 297 | 298 | self.up_conv = self._up_conv() 299 | 300 | def _up_conv(self): 301 | stride = self.conv_stride 302 | padding = get_downsample_pad(stride=stride, kernel=self.kernel_size) 303 | if self.batch_norm: 304 | up_conv = nn.Sequential(nn.Conv2d(in_channels=self.in_channels, out_channels=self.block_channels, 305 | kernel_size=(self.kernel_size, self.kernel_size), 306 | stride=stride, padding=padding), 307 | 308 | nn.BatchNorm2d(num_features=self.block_channels), 309 | nn.ReLU(), 310 | 311 | nn.Conv2d(in_channels=self.block_channels, out_channels=self.block_channels, 312 | kernel_size=(self.kernel_size, self.kernel_size), 313 | stride=stride, padding=padding), 314 | nn.BatchNorm2d(num_features=self.block_channels), 315 | nn.ReLU(), 316 | 317 | nn.Dropout(self.dropout) 318 | ) 319 | else: 320 | up_conv = nn.Sequential(nn.Conv2d(in_channels=self.in_channels, out_channels=self.block_channels, 321 | kernel_size=(self.kernel_size, self.kernel_size), 322 | stride=stride, padding=padding), 323 | nn.ReLU(), 324 | 325 | nn.Conv2d(in_channels=self.block_channels, out_channels=self.block_channels, 326 | kernel_size=(self.kernel_size, self.kernel_size), 327 | stride=stride, padding=padding), 328 | nn.ReLU(), 329 | 330 | nn.Dropout(self.dropout) 331 | ) 332 | return up_conv 333 | 334 | def forward(self, x): 335 | return self.up_conv(x) 336 | -------------------------------------------------------------------------------- /toolkit/keras_transformers/architectures.py: -------------------------------------------------------------------------------- 1 | from keras import regularizers 2 | from keras.activations import relu 3 | from keras.layers import Input, Embedding, PReLU, Bidirectional, Lambda, \ 4 | CuDNNLSTM, CuDNNGRU, Conv1D, Dense, BatchNormalization, Dropout, SpatialDropout1D, \ 5 | GlobalMaxPool1D, GlobalAveragePooling1D, MaxPooling1D 6 | from 
keras.layers.merge import add, concatenate 7 | from keras.models import Model 8 | 9 | from toolkit.keras_transformers.contrib import AttentionWeightedAverage 10 | 11 | 12 | def scnn(embedding_matrix, embedding_size, trainable_embedding, maxlen, max_features, 13 | filter_nr, kernel_size, repeat_block, dense_size, repeat_dense, output_size, output_activation, 14 | max_pooling, mean_pooling, weighted_average_attention, concat_mode, 15 | dropout_embedding, conv_dropout, dense_dropout, dropout_mode, 16 | conv_kernel_reg_l2, conv_bias_reg_l2, 17 | dense_kernel_reg_l2, dense_bias_reg_l2, 18 | use_prelu, use_batch_norm, batch_norm_first): 19 | input_text = Input(shape=(maxlen,)) 20 | x = Embedding(max_features, embedding_size, weights=[embedding_matrix], trainable=trainable_embedding)( 21 | input_text) 22 | 23 | x = dropout_block(dropout_embedding, dropout_mode)(x) 24 | 25 | for _ in range(repeat_block): 26 | x = convolutional_block(filter_nr, kernel_size, use_batch_norm, use_prelu, conv_dropout, dropout_mode, 27 | conv_kernel_reg_l2, conv_bias_reg_l2, batch_norm_first)(x) 28 | 29 | predictions = classification_block(dense_size=dense_size, repeat_dense=repeat_dense, 30 | output_size=output_size, output_activation=output_activation, 31 | max_pooling=max_pooling, 32 | mean_pooling=mean_pooling, 33 | weighted_average_attention=weighted_average_attention, 34 | concat_mode=concat_mode, 35 | dropout=dense_dropout, 36 | kernel_reg_l2=dense_kernel_reg_l2, bias_reg_l2=dense_bias_reg_l2, 37 | use_prelu=use_prelu, use_batch_norm=use_batch_norm, 38 | batch_norm_first=batch_norm_first)(x) 39 | model = Model(inputs=input_text, outputs=predictions) 40 | return model 41 | 42 | 43 | def dpcnn(embedding_matrix, embedding_size, trainable_embedding, maxlen, max_features, 44 | filter_nr, kernel_size, repeat_block, dense_size, repeat_dense, output_size, output_activation, 45 | max_pooling, mean_pooling, weighted_average_attention, concat_mode, 46 | dropout_embedding, conv_dropout, 
dense_dropout, dropout_mode, 47 | conv_kernel_reg_l2, conv_bias_reg_l2, 48 | dense_kernel_reg_l2, dense_bias_reg_l2, 49 | use_prelu, use_batch_norm, batch_norm_first): 50 | """ 51 | Note: 52 | Implementation of http://ai.tencent.com/ailab/media/publications/ACL3-Brady.pdf 53 | post activation is used instead of pre-activation, could be worth exploring 54 | """ 55 | 56 | input_text = Input(shape=(maxlen,)) 57 | if embedding_matrix is not None: 58 | embedding = Embedding(max_features, embedding_size, 59 | weights=[embedding_matrix], trainable=trainable_embedding)(input_text) 60 | else: 61 | embedding = Embedding(max_features, embedding_size)(input_text) 62 | 63 | embedding = dropout_block(dropout_embedding, dropout_mode)(embedding) 64 | 65 | x = convolutional_block(filter_nr, kernel_size, use_batch_norm, use_prelu, conv_dropout, dropout_mode, 66 | conv_kernel_reg_l2, conv_bias_reg_l2, batch_norm_first)(embedding) 67 | x = convolutional_block(filter_nr, kernel_size, conv_bias_reg_l2, use_prelu, conv_dropout, dropout_mode, 68 | conv_kernel_reg_l2, conv_bias_reg_l2, batch_norm_first)(x) 69 | if embedding_size == filter_nr: 70 | x = add([embedding, x]) 71 | else: 72 | embedding_resized = shape_matching_layer(filter_nr, use_prelu, conv_kernel_reg_l2, conv_bias_reg_l2)(embedding) 73 | x = add([embedding_resized, x]) 74 | for _ in range(repeat_block): 75 | x = dpcnn_block(filter_nr, kernel_size, use_batch_norm, use_prelu, conv_dropout, dropout_mode, 76 | conv_kernel_reg_l2, conv_bias_reg_l2, batch_norm_first)(x) 77 | 78 | predictions = classification_block(dense_size=dense_size, repeat_dense=repeat_dense, 79 | output_size=output_size, output_activation=output_activation, 80 | max_pooling=max_pooling, 81 | mean_pooling=mean_pooling, 82 | weighted_average_attention=weighted_average_attention, 83 | concat_mode=concat_mode, 84 | dropout=dense_dropout, 85 | kernel_reg_l2=dense_kernel_reg_l2, bias_reg_l2=dense_bias_reg_l2, 86 | use_prelu=use_prelu, use_batch_norm=use_batch_norm, 
87 | batch_norm_first=batch_norm_first)(x) 88 | model = Model(inputs=input_text, outputs=predictions) 89 | return model 90 | 91 | 92 | def cudnn_lstm(embedding_matrix, embedding_size, trainable_embedding, 93 | maxlen, max_features, 94 | unit_nr, repeat_block, 95 | dense_size, repeat_dense, output_size, output_activation, 96 | max_pooling, mean_pooling, weighted_average_attention, concat_mode, 97 | dropout_embedding, rnn_dropout, dense_dropout, dropout_mode, 98 | rnn_kernel_reg_l2, rnn_recurrent_reg_l2, rnn_bias_reg_l2, 99 | dense_kernel_reg_l2, dense_bias_reg_l2, 100 | use_prelu, use_batch_norm, batch_norm_first): 101 | input_text = Input(shape=(maxlen,)) 102 | if embedding_matrix is not None: 103 | x = Embedding(max_features, 104 | embedding_size, 105 | weights=[embedding_matrix], 106 | trainable=trainable_embedding)(input_text) 107 | else: 108 | x = Embedding(max_features, 109 | embedding_size)(input_text) 110 | 111 | x = dropout_block(dropout_embedding, dropout_mode)(x) 112 | 113 | for _ in range(repeat_block): 114 | x = cudnn_lstm_block(unit_nr=unit_nr, return_sequences=True, bidirectional=True, 115 | kernel_reg_l2=rnn_kernel_reg_l2, 116 | recurrent_reg_l2=rnn_recurrent_reg_l2, 117 | bias_reg_l2=rnn_bias_reg_l2, 118 | use_batch_norm=use_batch_norm, batch_norm_first=batch_norm_first, 119 | dropout=rnn_dropout, dropout_mode=dropout_mode, use_prelu=use_prelu)(x) 120 | 121 | predictions = classification_block(dense_size=dense_size, repeat_dense=repeat_dense, 122 | output_size=output_size, output_activation=output_activation, 123 | max_pooling=max_pooling, 124 | mean_pooling=mean_pooling, 125 | weighted_average_attention=weighted_average_attention, 126 | concat_mode=concat_mode, 127 | dropout=dense_dropout, 128 | kernel_reg_l2=dense_kernel_reg_l2, bias_reg_l2=dense_bias_reg_l2, 129 | use_prelu=use_prelu, use_batch_norm=use_batch_norm, 130 | batch_norm_first=batch_norm_first)(x) 131 | model = Model(inputs=input_text, outputs=predictions) 132 | return model 133 | 134 
def cudnn_gru(embedding_matrix, embedding_size, trainable_embedding,
              maxlen, max_features,
              unit_nr, repeat_block,
              dense_size, repeat_dense, output_size, output_activation,
              max_pooling, mean_pooling, weighted_average_attention, concat_mode,
              dropout_embedding, rnn_dropout, dense_dropout, dropout_mode,
              rnn_kernel_reg_l2, rnn_recurrent_reg_l2, rnn_bias_reg_l2,
              dense_kernel_reg_l2, dense_bias_reg_l2,
              use_prelu, use_batch_norm, batch_norm_first):
    """Assemble a text classifier from stacked bidirectional CuDNN GRU blocks.

    The input is a sequence of ``maxlen`` token ids. It is embedded (seeded
    with ``embedding_matrix`` when one is supplied), run through embedding
    dropout and ``repeat_block`` recurrent blocks, and finished with
    ``classification_block``.

    Returns:
        The assembled ``Model``; it is not compiled here.
    """
    tokens = Input(shape=(maxlen,))

    if embedding_matrix is None:
        embedding = Embedding(max_features,
                              embedding_size)
    else:
        embedding = Embedding(max_features,
                              embedding_size,
                              weights=[embedding_matrix],
                              trainable=trainable_embedding)
    features = embedding(tokens)

    features = dropout_block(dropout_embedding, dropout_mode)(features)

    # The factory returns a closure that instantiates new layers each time it
    # is applied, so one closure can be reused for every stacked block.
    recurrent_stage = cudnn_gru_block(unit_nr=unit_nr, return_sequences=True, bidirectional=True,
                                      kernel_reg_l2=rnn_kernel_reg_l2,
                                      recurrent_reg_l2=rnn_recurrent_reg_l2,
                                      bias_reg_l2=rnn_bias_reg_l2,
                                      use_batch_norm=use_batch_norm, batch_norm_first=batch_norm_first,
                                      dropout=rnn_dropout, dropout_mode=dropout_mode, use_prelu=use_prelu)
    for _ in range(repeat_block):
        features = recurrent_stage(features)

    head = classification_block(dense_size=dense_size, repeat_dense=repeat_dense,
                                output_size=output_size, output_activation=output_activation,
                                max_pooling=max_pooling,
                                mean_pooling=mean_pooling,
                                weighted_average_attention=weighted_average_attention,
                                concat_mode=concat_mode,
                                dropout=dense_dropout,
                                kernel_reg_l2=dense_kernel_reg_l2, bias_reg_l2=dense_bias_reg_l2,
                                use_prelu=use_prelu, use_batch_norm=use_batch_norm,
                                batch_norm_first=batch_norm_first)
    return Model(inputs=tokens, outputs=head(features))


def vdcnn(embedding_size, maxlen, max_features,
          filter_nr, kernel_size, repeat_block, dense_size, repeat_dense, output_size, output_activation,
          max_pooling, mean_pooling, weighted_average_attention, concat_mode,
          dropout_embedding, conv_dropout, dense_dropout, dropout_mode,
          conv_kernel_reg_l2, conv_bias_reg_l2,
          dense_kernel_reg_l2, dense_bias_reg_l2,
          use_prelu, use_batch_norm, batch_norm_first):
    """Assemble a VDCNN-style convolutional text classifier.

    Note:
        Implementation of http://www.aclweb.org/anthology/E17-1104
        k-max pooling is not used; GlobalMaxPool1D is applied at the end
        instead, and k-max pooling was not explored in the intermediate layers.

    Returns:
        The assembled ``Model``; it is not compiled here.
    """
    tokens = Input(shape=(maxlen,))
    features = Embedding(input_dim=max_features, output_dim=embedding_size)(tokens)

    features = dropout_block(dropout_embedding, dropout_mode)(features)

    # Positional arguments shared by the stem convolution and every VDCNN block.
    conv_args = (filter_nr, kernel_size, use_batch_norm, use_prelu, conv_dropout, dropout_mode,
                 conv_kernel_reg_l2, conv_bias_reg_l2, batch_norm_first)

    features = convolutional_block(*conv_args)(features)

    for block_idx in range(repeat_block):
        # Only the final block skips the trailing max-pooling step.
        is_last = block_idx == repeat_block - 1
        features = vdcnn_block(*conv_args, last_block=is_last)(features)

    head = classification_block(dense_size=dense_size, repeat_dense=repeat_dense,
                                output_size=output_size, output_activation=output_activation,
                                max_pooling=max_pooling,
                                mean_pooling=mean_pooling,
                                weighted_average_attention=weighted_average_attention,
                                concat_mode=concat_mode,
                                dropout=dense_dropout,
                                kernel_reg_l2=dense_kernel_reg_l2, bias_reg_l2=dense_bias_reg_l2,
                                use_prelu=use_prelu, use_batch_norm=use_batch_norm,
                                batch_norm_first=batch_norm_first)
    return Model(inputs=tokens, outputs=head(features))
def classification_block(dense_size, repeat_dense, output_size, output_activation,
                         max_pooling, mean_pooling, weighted_average_attention, concat_mode,
                         dropout,
                         kernel_reg_l2, bias_reg_l2,
                         use_prelu, use_batch_norm, batch_norm_first):
    """Return a closure that pools a sequence tensor and applies the dense head.

    The closure applies each enabled pooling (global max, global average,
    attention-weighted average) to its input, concatenates the results when
    more than one is enabled, runs ``repeat_dense`` dense blocks and ends with
    a ``Dense(output_size, activation=output_activation)`` layer.

    Raises (when the returned closure is called):
        ValueError: if no pooling option is enabled.
        NotImplementedError: if several poolings are enabled and
            ``concat_mode`` is not ``'concat'``.
    """
    def f(x):
        pooled = []
        if max_pooling:
            pooled.append(GlobalMaxPool1D()(x))
        if mean_pooling:
            pooled.append(GlobalAveragePooling1D()(x))
        if weighted_average_attention:
            pooled.append(AttentionWeightedAverage()(x))

        if not pooled:
            raise ValueError('at least one of max_pooling, mean_pooling, '
                             'weighted_average_attention must be enabled')

        if len(pooled) == 1:
            x = pooled[0]
        elif concat_mode == 'concat':
            x = concatenate(pooled, axis=-1)
        else:
            # Must actually be raised: merely constructing the exception would
            # let a plain list of tensors fall through to the dense layers.
            raise NotImplementedError('only mode concat for now')

        for _ in range(repeat_dense):
            x = dense_block(dense_size=dense_size,
                            use_batch_norm=use_batch_norm,
                            use_prelu=use_prelu,
                            dropout=dropout,
                            kernel_reg_l2=kernel_reg_l2,
                            bias_reg_l2=bias_reg_l2,
                            batch_norm_first=batch_norm_first)(x)

        x = Dense(output_size, activation=output_activation)(x)
        return x

    return f


def dropout_block(dropout, dropout_mode):
    """Return a closure applying ``SpatialDropout1D`` ('spatial') or ``Dropout`` ('simple').

    Raises (when the returned closure is called):
        NotImplementedError: for any other ``dropout_mode``.
    """
    def f(x):
        if dropout_mode == 'spatial':
            x = SpatialDropout1D(dropout)(x)
        elif dropout_mode == 'simple':
            x = Dropout(dropout)(x)
        else:
            raise NotImplementedError('spatial and simple modes are supported')
        return x

    return f


def prelu_block(use_prelu):
    """Return a closure applying ``PReLU`` when ``use_prelu`` is truthy, else ``relu`` via ``Lambda``."""
    def f(x):
        if use_prelu:
            x = PReLU()(x)
        else:
            x = Lambda(relu)(x)
        return x

    return f


def bn_relu_dropout_block(use_batch_norm, use_prelu, dropout, dropout_mode, batch_norm_first):
    """Return a closure applying activation and dropout with optional batch normalization.

    When ``use_batch_norm`` is set, ``BatchNormalization`` is placed before the
    activation if ``batch_norm_first`` is truthy and after the dropout otherwise.
    """
    def f(x):
        if use_batch_norm and batch_norm_first:
            x = BatchNormalization()(x)

        x = prelu_block(use_prelu)(x)
        x = dropout_block(dropout, dropout_mode)(x)

        if use_batch_norm and not batch_norm_first:
            x = BatchNormalization()(x)
        return x

    return f


def convolutional_block(filter_nr, kernel_size, use_batch_norm, use_prelu, dropout, dropout_mode,
                        kernel_reg_l2, bias_reg_l2, batch_norm_first):
    """Return a closure: linear 'same'-padded ``Conv1D`` with L2 regularizers, then ``bn_relu_dropout_block``."""
    def f(x):
        x = Conv1D(filter_nr, kernel_size=kernel_size, padding='same', activation='linear',
                   kernel_regularizer=regularizers.l2(kernel_reg_l2),
                   bias_regularizer=regularizers.l2(bias_reg_l2))(x)
        x = bn_relu_dropout_block(use_batch_norm=use_batch_norm,
                                  batch_norm_first=batch_norm_first,
                                  dropout=dropout,
                                  dropout_mode=dropout_mode,
                                  use_prelu=use_prelu)(x)
        return x

    return f


def shape_matching_layer(filter_nr, use_prelu, kernel_reg_l2, bias_reg_l2):
    """Return a closure: kernel-size-1 ``Conv1D`` to ``filter_nr`` channels followed by the activation."""
    def f(x):
        x = Conv1D(filter_nr, kernel_size=1, padding='same', activation='linear',
                   kernel_regularizer=regularizers.l2(kernel_reg_l2),
                   bias_regularizer=regularizers.l2(bias_reg_l2))(x)
        x = prelu_block(use_prelu)(x)
        return x

    return f


def cudnn_lstm_block(unit_nr, return_sequences, bidirectional,
                     kernel_reg_l2, recurrent_reg_l2, bias_reg_l2,
                     use_batch_norm, batch_norm_first,
                     dropout, dropout_mode, use_prelu):
    """Return a closure: (optionally bidirectional) ``CuDNNLSTM`` followed by ``bn_relu_dropout_block``."""
    def f(x):
        # The layer size keyword is ``units``; a misspelt keyword is rejected
        # by the layer constructor.
        lstm_layer = CuDNNLSTM(units=unit_nr, return_sequences=return_sequences,
                               kernel_regularizer=regularizers.l2(kernel_reg_l2),
                               recurrent_regularizer=regularizers.l2(recurrent_reg_l2),
                               bias_regularizer=regularizers.l2(bias_reg_l2)
                               )
        if bidirectional:
            x = Bidirectional(lstm_layer)(x)
        else:
            x = lstm_layer(x)
        x = bn_relu_dropout_block(use_batch_norm=use_batch_norm, batch_norm_first=batch_norm_first,
                                  dropout=dropout, dropout_mode=dropout_mode,
                                  use_prelu=use_prelu)(x)
        return x

    return f


def cudnn_gru_block(unit_nr, return_sequences, bidirectional,
                    kernel_reg_l2, recurrent_reg_l2, bias_reg_l2,
                    use_batch_norm, batch_norm_first,
                    dropout, dropout_mode, use_prelu):
    """Return a closure: (optionally bidirectional) ``CuDNNGRU`` followed by ``bn_relu_dropout_block``."""
    def f(x):
        gru_layer = CuDNNGRU(units=unit_nr, return_sequences=return_sequences,
                             kernel_regularizer=regularizers.l2(kernel_reg_l2),
                             recurrent_regularizer=regularizers.l2(recurrent_reg_l2),
                             bias_regularizer=regularizers.l2(bias_reg_l2)
                             )
        if bidirectional:
            x = Bidirectional(gru_layer)(x)
        else:
            x = gru_layer(x)
        x = bn_relu_dropout_block(use_batch_norm=use_batch_norm, batch_norm_first=batch_norm_first,
                                  dropout=dropout, dropout_mode=dropout_mode,
                                  use_prelu=use_prelu)(x)
        return x

    return f


def dense_block(dense_size, use_batch_norm, use_prelu, dropout, kernel_reg_l2, bias_reg_l2,
                batch_norm_first):
    """Return a closure: linear ``Dense`` with L2 regularizers, then ``bn_relu_dropout_block``.

    Dropout after a dense layer always uses the 'simple' mode.
    """
    def f(x):
        x = Dense(dense_size, activation='linear',
                  kernel_regularizer=regularizers.l2(kernel_reg_l2),
                  bias_regularizer=regularizers.l2(bias_reg_l2))(x)

        x = bn_relu_dropout_block(use_batch_norm=use_batch_norm,
                                  use_prelu=use_prelu,
                                  dropout=dropout,
                                  dropout_mode='simple',
                                  batch_norm_first=batch_norm_first)(x)
        return x

    return f


def dpcnn_block(filter_nr, kernel_size, use_batch_norm, use_prelu, dropout, dropout_mode,
                kernel_reg_l2, bias_reg_l2, batch_norm_first):
    """Return a closure: max-pool (size 3, stride 2), two convolutional blocks and a residual add."""
    def f(x):
        x = MaxPooling1D(pool_size=3, strides=2)(x)
        main = convolutional_block(filter_nr, kernel_size, use_batch_norm, use_prelu, dropout, dropout_mode,
                                   kernel_reg_l2, bias_reg_l2, batch_norm_first)(x)
        main = convolutional_block(filter_nr, kernel_size, use_batch_norm, use_prelu, dropout, dropout_mode,
                                   kernel_reg_l2, bias_reg_l2, batch_norm_first)(main)
        x = add([main, x])
        return x

    return f


def vdcnn_block(filter_nr, kernel_size, use_batch_norm, use_prelu, dropout, dropout_mode,
                kernel_reg_l2, bias_reg_l2, batch_norm_first, last_block):
    """Return a closure: two residual convolutional blocks, max-pooled unless ``last_block``."""
    def f(x):
        main = convolutional_block(filter_nr, kernel_size, use_batch_norm, use_prelu, dropout, dropout_mode,
                                   kernel_reg_l2, bias_reg_l2, batch_norm_first)(x)
        x = add([main, x])
        main = convolutional_block(filter_nr, kernel_size, use_batch_norm, use_prelu, dropout, dropout_mode,
                                   kernel_reg_l2, bias_reg_l2, batch_norm_first)(x)
        x = add([main, x])
        if not last_block:
            x = MaxPooling1D(pool_size=3, strides=2)(x)
        return x

    return f

# --------------------------------------------------------------------------------
# /toolkit/pytorch_transformers/loaders/segmentation.py:
# --------------------------------------------------------------------------------
import json
import multiprocessing as mp
from collections import OrderedDict
from functools import partial
from itertools import product
from multiprocessing.pool import ThreadPool

import numpy as np
import torch
import torchvision.transforms as transforms
from PIL import Image
from attrdict import AttrDict
from imgaug import augmenters as iaa
from scipy.stats import gmean
from sklearn.externals import joblib
from steppy.base import BaseTransformer
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

from toolkit.utils import from_pil, to_pil, binary_from_rle, ImgAug


class ImageReader(BaseTransformer):
    """Load every image (and, in train mode, every target) referenced by a metadata table into memory."""

    def __init__(self, train_mode, x_columns, y_columns, target_format='png'):
        # Initialise the base transformer like the other transformers in this module do.
        super().__init__()
        self.train_mode = train_mode
        self.x_columns = x_columns
        self.y_columns = y_columns
        self.target_format = target_format

    def transform(self, meta):
        """Return ``{'X': ..., 'y': ...}``; ``y`` is ``None`` outside train mode.

        Both values are lists with one inner list per selected column.
        """
        X_ = meta[self.x_columns].values

        X = self.load_images(X_, filetype='png', grayscale=False)
        if self.train_mode:
            y_ = meta[self.y_columns].values
            y = self.load_images(y_, filetype=self.target_format, grayscale=True)
        else:
            y = None

        return {'X': X,
                'y': y}

    def load_images(self, filepaths, filetype, grayscale=False):
        """Load all files of a 2-D array of paths, column by column.

        Raises:
            Exception: if ``filetype`` is neither ``'png'`` nor ``'json'``.
        """
        X = []
        for i in range(filepaths.shape[1]):
            column_data = []
            for filepath in tqdm(filepaths[:, i]):
                if filetype == 'png':
                    data = self.load_image(filepath, grayscale=grayscale)
                elif filetype == 'json':
                    data = self.read_json(filepath)
                else:
                    raise Exception('files must be png or json')
                column_data.append(data)
            X.append(column_data)
        return X

    def load_image(self, img_filepath, grayscale):
        """Open an image as RGB, or as a 1-bit image thresholded at 128 when ``grayscale``."""
        image = Image.open(img_filepath, 'r')
        if not grayscale:
            image = image.convert('RGB')
        else:
            image = image.convert('L').point(lambda x: 0 if x < 128 else 255, '1')
        return image

    def read_json(self, path):
        """Read a JSON list of RLE entries and decode each one into a PIL mask."""
        with open(path, 'r') as file:
            data = json.load(file)
        masks = [to_pil(binary_from_rle(rle)) for rle in data]
        return masks


class XYSplit(BaseTransformer):
    """Extract the first x column (and, in train mode, the first y column) as arrays."""

    def __init__(self, train_mode, x_columns, y_columns):
        super().__init__()
        self.train_mode = train_mode
        self.x_columns = x_columns
        self.y_columns = y_columns
        self.columns_to_get = None
        self.target_columns = None

    def transform(self, meta):
        """Return ``{'X': ..., 'y': ...}``; ``y`` is ``None`` outside train mode."""
        X = meta[self.x_columns[0]].values
        if self.train_mode:
            y = meta[self.y_columns[0]].values
        else:
            y = None

        return {'X': X,
                'y': y}


class ImageSegmentationBaseDataset(Dataset):
    """Dataset yielding transformed images and, when targets are given, stacked masks.

    ``image_source`` selects how items are fetched: ``'memory'`` indexes
    preloaded data as ``data[0][index]``, ``'disk'`` treats the data as an
    array of file paths. Subclasses implement ``load_target``.
    """

    def __init__(self, X, y, train_mode,
                 image_transform, image_augment_with_target,
                 mask_transform, image_augment,
                 image_source='memory'):
        super().__init__()
        self.X = X
        self.y = y

        self.train_mode = train_mode
        self.image_transform = image_transform
        self.mask_transform = mask_transform
        # Missing augmenters are replaced by a no-op so __getitem__ can call them unconditionally.
        self.image_augment = image_augment if image_augment is not None else ImgAug(iaa.Noop())
        self.image_augment_with_target = image_augment_with_target if image_augment_with_target is not None else ImgAug(
            iaa.Noop())

        self.image_source = image_source

    def __len__(self):
        if self.image_source == 'memory':
            return len(self.X[0])
        elif self.image_source == 'disk':
            return self.X.shape[0]
        # Fail with the same message as item access instead of returning None.
        raise NotImplementedError("Possible loading options: 'memory' and 'disk'!")

    def _get_load_func(self):
        """Return the loader bound method that matches ``image_source``."""
        if self.image_source == 'memory':
            return self.load_from_memory
        elif self.image_source == 'disk':
            return self.load_from_disk
        raise NotImplementedError("Possible loading options: 'memory' and 'disk'!")

    def __getitem__(self, index):
        """Return ``(image, masks)`` when targets are present, otherwise just the image."""
        load_func = self._get_load_func()

        Xi = load_func(self.X, index, filetype='png', grayscale=False)

        if self.y is None:
            Xi = from_pil(Xi)
            Xi = self.image_augment(Xi)
            Xi = to_pil(Xi)

            if self.image_transform is not None:
                Xi = self.image_transform(Xi)
            return Xi

        Mi = self.load_target(self.y, index, load_func)
        Xi, *Mi = from_pil(Xi, *Mi)
        # Joint augmentation receives image and masks together; the second
        # augmenter only ever sees the image.
        Xi, *Mi = self.image_augment_with_target(Xi, *Mi)
        Xi = self.image_augment(Xi)
        Xi, *Mi = to_pil(Xi, *Mi)

        if self.mask_transform is not None:
            Mi = [self.mask_transform(m) for m in Mi]

        if self.image_transform is not None:
            Xi = self.image_transform(Xi)

        Mi = torch.cat(Mi, dim=0)
        return Xi, Mi

    def load_from_memory(self, data_source, index, **kwargs):
        """Return item ``index`` of the first column of preloaded data; extra keywords are ignored."""
        return data_source[0][index]

    def load_from_disk(self, data_source, index, *, filetype, grayscale=False):
        """Load the file whose path is ``data_source[index]``.

        Raises:
            Exception: if ``filetype`` is neither ``'png'`` nor ``'json'``.
        """
        if filetype == 'png':
            img_filepath = data_source[index]
            return self.load_image(img_filepath, grayscale=grayscale)
        elif filetype == 'json':
            json_filepath = data_source[index]
            return self.read_json(json_filepath)
        else:
            raise Exception('files must be png or json')

    def load_image(self, img_filepath, grayscale):
        """Open an image as RGB, or as an 'L' image thresholded at 128 to values 0/1 when ``grayscale``."""
        # NOTE(review): ImageReader.load_image thresholds to 0/255 in mode '1',
        # this one to 0/1 in mode 'L' -- confirm the difference is intentional.
        image = Image.open(img_filepath, 'r')
        if not grayscale:
            image = image.convert('RGB')
        else:
            image = image.convert('L').point(lambda x: 0 if x < 128 else 1)
        return image

    def read_json(self, path):
        """Read a JSON list of RLE entries and decode each one into a PIL mask."""
        with open(path, 'r') as file:
            data = json.load(file)
        masks = [to_pil(binary_from_rle(rle)) for rle in data]
        return masks

    def load_target(self, data_source, index, load_func):
        """Return the list of target masks for ``index``; implemented by subclasses."""
        raise NotImplementedError


class ImageSegmentationJsonDataset(ImageSegmentationBaseDataset):
    """Dataset whose targets are requested from the loader with ``filetype='json'``."""

    def load_target(self, data_source, index, load_func):
        Mi = load_func(data_source, index, filetype='json')
        return Mi


class ImageSegmentationPngDataset(ImageSegmentationBaseDataset):
    """Dataset whose targets are PNG masks, split into one mask per class value 0 and 1."""

    def load_target(self, data_source, index, load_func):
        Mi = load_func(data_source, index, filetype='png', grayscale=True)
        Mi = from_pil(Mi)
        target = [to_pil(Mi == class_nr) for class_nr in [0, 1]]
        return target


class ImageSegmentationTTADataset(ImageSegmentationBaseDataset):
    """Inference dataset applying a per-item test-time-augmentation transform."""

    def __init__(self, tta_params, tta_transform, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.tta_params = tta_params
        self.tta_transform = tta_transform

    def __getitem__(self, index):
        """Return the augmented, TTA-transformed and image-transformed item ``index``."""
        load_func = self._get_load_func()

        Xi = load_func(self.X, index, filetype='png', grayscale=False)
        Xi = from_pil(Xi)

        if self.image_augment is not None:
            Xi = self.image_augment(Xi)

        if self.tta_params is not None:
            tta_transform_specs = self.tta_params[index]
            Xi = self.tta_transform(Xi, tta_transform_specs)
        Xi = to_pil(Xi)

        if self.image_transform is not None:
            Xi = self.image_transform(Xi)

        return Xi


def _build_image_transform(dataset_params, resize=False):
    """Compose the image pipeline: optional resize, 3-channel grayscale, tensor, normalization."""
    steps = []
    if resize:
        steps.append(transforms.Resize((dataset_params.h, dataset_params.w)))
    steps.extend([transforms.Grayscale(num_output_channels=3),
                  transforms.ToTensor(),
                  transforms.Normalize(mean=dataset_params.MEAN,
                                       std=dataset_params.STD),
                  ])
    return transforms.Compose(steps)


def _build_mask_transform(dataset_params, resize=False):
    """Compose the mask pipeline: optional resize, then ``to_array`` and ``to_tensor``."""
    steps = []
    if resize:
        # interpolation=0 is PIL's nearest-neighbour filter.
        steps.append(transforms.Resize((dataset_params.h, dataset_params.w),
                                       interpolation=0))
    steps.extend([transforms.Lambda(to_array),
                  transforms.Lambda(to_tensor),
                  ])
    return transforms.Compose(steps)


def _dataset_class_for(target_format):
    """Map a target format ('png' or 'json') to its dataset class; raise ``Exception`` otherwise."""
    if target_format == 'png':
        return ImageSegmentationPngDataset
    elif target_format == 'json':
        return ImageSegmentationJsonDataset
    raise Exception('files must be png or json')


class ImageSegmentationLoaderBasic(BaseTransformer):
    """Base transformer turning (X, y) into ``DataLoader`` objects.

    Subclasses are expected to set ``dataset``, the transforms and the
    augmenters; this class leaves them as ``None``.
    """

    def __init__(self, train_mode, loader_params, dataset_params, augmentation_params):
        super().__init__()
        self.train_mode = train_mode
        self.loader_params = AttrDict(loader_params)
        self.dataset_params = AttrDict(dataset_params)
        self.augmentation_params = AttrDict(augmentation_params)

        self.mask_transform = None
        self.image_transform = None

        self.image_augment_train = None
        self.image_augment_inference = None
        self.image_augment_with_target_train = None
        self.image_augment_with_target_inference = None

        self.dataset = None

    def transform(self, X, y, X_valid=None, y_valid=None, **kwargs):
        """Return ``{'datagen': (loader, steps), 'validation_datagen': (loader, steps)}``.

        The validation entry is ``(None, None)`` unless both ``X_valid`` and
        ``y_valid`` are given.
        """
        if self.train_mode and y is not None:
            flow, steps = self.get_datagen(X, y, True, self.loader_params.training)
        else:
            flow, steps = self.get_datagen(X, None, False, self.loader_params.inference)

        if X_valid is not None and y_valid is not None:
            valid_flow, valid_steps = self.get_datagen(X_valid, y_valid, False, self.loader_params.inference)
        else:
            valid_flow = None
            valid_steps = None

        return {'datagen': (flow, steps),
                'validation_datagen': (valid_flow, valid_steps)}

    def get_datagen(self, X, y, train_mode, loader_params):
        """Build the dataset with train or inference augmenters and wrap it in a ``DataLoader``.

        Returns:
            ``(datagen, steps)`` where ``steps`` is ``len(datagen)``.
        """
        if train_mode:
            image_augment = self.image_augment_train
            image_augment_with_target = self.image_augment_with_target_train
        else:
            image_augment = self.image_augment_inference
            image_augment_with_target = self.image_augment_with_target_inference

        dataset = self.dataset(X, y,
                               train_mode=bool(train_mode),
                               image_augment=image_augment,
                               image_augment_with_target=image_augment_with_target,
                               mask_transform=self.mask_transform,
                               image_transform=self.image_transform,
                               image_source=self.dataset_params.image_source)

        datagen = DataLoader(dataset, **loader_params)
        steps = len(datagen)
        return datagen, steps

    def load(self, filepath):
        """Restore ``loader_params`` from a joblib file and return ``self``."""
        params = joblib.load(filepath)
        self.loader_params = params['loader_params']
        return self

    def save(self, filepath):
        """Persist ``loader_params`` to a joblib file."""
        params = {'loader_params': self.loader_params}
        joblib.dump(params, filepath)


class ImageSegmentationLoaderBasicTTA(ImageSegmentationLoaderBasic):
    """Inference-only loader base for test-time augmentation."""

    def __init__(self, loader_params, dataset_params, augmentation_params):
        # Reuse the parent initialiser instead of duplicating it; TTA loaders
        # never train, so train_mode is fixed to False.
        super().__init__(train_mode=False,
                         loader_params=loader_params,
                         dataset_params=dataset_params,
                         augmentation_params=augmentation_params)

    def transform(self, X, tta_params, **kwargs):
        """Return the inference loader under ``'datagen'``; validation is always ``(None, None)``."""
        flow, steps = self.get_datagen(X, tta_params, self.loader_params.inference)
        valid_flow = None
        valid_steps = None
        return {'datagen': (flow, steps),
                'validation_datagen': (valid_flow, valid_steps)}

    def get_datagen(self, X, tta_params, loader_params):
        """Build the TTA dataset (no targets) and wrap it in a ``DataLoader``."""
        dataset = self.dataset(tta_params=tta_params,
                               tta_transform=self.augmentation_params.tta_transform,
                               X=X,
                               y=None,
                               train_mode=False,
                               image_augment=self.image_augment_inference,
                               image_augment_with_target=self.image_augment_with_target_inference,
                               mask_transform=self.mask_transform,
                               image_transform=self.image_transform,
                               image_source=self.dataset_params.image_source)

        datagen = DataLoader(dataset, **loader_params)
        steps = len(datagen)
        return datagen, steps


class ImageSegmentationLoaderCropPad(ImageSegmentationLoaderBasic):
    """Loader without resizing; train and inference augmenters come from ``augmentation_params``."""

    def __init__(self, train_mode, loader_params, dataset_params, augmentation_params):
        super().__init__(train_mode, loader_params, dataset_params, augmentation_params)

        self.image_transform = _build_image_transform(self.dataset_params)
        self.mask_transform = _build_mask_transform(self.dataset_params)

        self.image_augment_train = ImgAug(self.augmentation_params['image_augment_train'])
        self.image_augment_with_target_train = ImgAug(self.augmentation_params['image_augment_with_target_train'])
        self.image_augment_inference = ImgAug(self.augmentation_params['image_augment_inference'])
        self.image_augment_with_target_inference = ImgAug(
            self.augmentation_params['image_augment_with_target_inference'])

        self.dataset = _dataset_class_for(self.dataset_params.target_format)


class ImageSegmentationLoaderCropPadTTA(ImageSegmentationLoaderBasicTTA):
    """TTA loader without resizing; inference augmenters come from ``augmentation_params``."""

    def __init__(self, loader_params, dataset_params, augmentation_params):
        super().__init__(loader_params, dataset_params, augmentation_params)

        self.image_transform = _build_image_transform(self.dataset_params)
        self.mask_transform = _build_mask_transform(self.dataset_params)

        self.image_augment_inference = ImgAug(self.augmentation_params['image_augment_inference'])
        self.image_augment_with_target_inference = ImgAug(
            self.augmentation_params['image_augment_with_target_inference'])
        self.dataset = ImageSegmentationTTADataset


class ImageSegmentationLoaderResize(ImageSegmentationLoaderBasic):
    """Loader resizing images and masks to ``(dataset_params.h, dataset_params.w)``.

    Only train augmenters are configured; inference augmenters stay ``None``.
    """

    def __init__(self, train_mode, loader_params, dataset_params, augmentation_params):
        super().__init__(train_mode, loader_params, dataset_params, augmentation_params)

        self.image_transform = _build_image_transform(self.dataset_params, resize=True)
        self.mask_transform = _build_mask_transform(self.dataset_params, resize=True)

        self.image_augment_train = ImgAug(self.augmentation_params['image_augment_train'])
        self.image_augment_with_target_train = ImgAug(self.augmentation_params['image_augment_with_target_train'])

        self.dataset = _dataset_class_for(self.dataset_params.target_format)


class ImageSegmentationLoaderResizeTTA(ImageSegmentationLoaderBasicTTA):
    """TTA loader resizing images to ``(dataset_params.h, dataset_params.w)``; no augmenters configured."""

    def __init__(self, loader_params, dataset_params, augmentation_params):
        super().__init__(loader_params, dataset_params, augmentation_params)

        self.image_transform = _build_image_transform(self.dataset_params, resize=True)
        self.mask_transform = _build_mask_transform(self.dataset_params, resize=True)

        self.dataset = ImageSegmentationTTADataset


def _build_tta_specs(tta_transformations):
    """Return the list of TTA spec dicts enabled by ``tta_transformations``.

    The un-augmented spec always comes first. It is followed by every
    combination of the enabled up/down flip, left/right flip, 90-degree
    rotations and colour-shift runs, skipping the combination that equals the
    un-augmented spec.
    """
    original_specs = {'ud_flip': False, 'lr_flip': False, 'rotation': 0, 'color_shift': False}
    tta_specs = [original_specs]

    ud_options = [True, False] if tta_transformations.flip_ud else [False]
    lr_options = [True, False] if tta_transformations.flip_lr else [False]
    rot_options = [0, 90, 180, 270] if tta_transformations.rotation else [0]
    if tta_transformations.color_shift_runs:
        color_shift_options = list(range(1, tta_transformations.color_shift_runs + 1, 1))
    else:
        color_shift_options = [False]

    for ud, lr, rot, color in product(ud_options, lr_options, rot_options, color_shift_options):
        if ud is False and lr is False and rot == 0 and color is False:
            continue  # identical to the un-augmented spec that is already first
        tta_specs.append({'ud_flip': ud, 'lr_flip': lr, 'rotation': rot, 'color_shift': color})
    return tta_specs


class MetaTestTimeAugmentationGenerator(BaseTransformer):
    """Repeat every row of ``X`` once per TTA spec and return the rows as a numpy array."""

    def __init__(self, **kwargs):
        super().__init__()
        self.tta_transformations = AttrDict(kwargs)

    def transform(self, X, **kwargs):
        """Return ``{'X_tta': ndarray, 'tta_params': list, 'img_ids': list}``, aligned item by item."""
        X_tta_rows, tta_params, img_ids = [], [], []
        for i in range(len(X)):
            rows, params, ids = self._get_tta_data(i, X[i])
            tta_params.extend(params)
            img_ids.extend(ids)
            X_tta_rows.extend(rows)
        X_tta = np.array(X_tta_rows)
        return {'X_tta': X_tta, 'tta_params': tta_params, 'img_ids': img_ids}

    def _get_tta_data(self, i, row):
        """Return ``(rows, specs, ids)``: ``row`` and ``i`` repeated once per TTA spec."""
        tta_specs = _build_tta_specs(self.tta_transformations)
        img_ids = [i] * len(tta_specs)
        X_rows = [row] * len(tta_specs)
        return X_rows, tta_specs, img_ids


class TestTimeAugmentationGenerator(BaseTransformer):
    """Repeat every item of ``X[0]`` once per TTA spec, keeping the one-column list layout."""

    def __init__(self, **kwargs):
        super().__init__()
        self.tta_transformations = AttrDict(kwargs)

    def transform(self, X, **kwargs):
        """Return ``{'X_tta': [list], 'tta_params': list, 'img_ids': list}``, aligned item by item."""
        X_tta, tta_params, img_ids = [], [], []
        X = X[0]
        for i in range(len(X)):
            images, params, ids = self._get_tta_data(i, X[i])
            tta_params.extend(params)
            img_ids.extend(ids)
            X_tta.extend(images)
        return {'X_tta': [X_tta], 'tta_params': tta_params, 'img_ids': img_ids}

    def _get_tta_data(self, i, row):
        """Return ``(rows, specs, ids)``: ``row`` and ``i`` repeated once per TTA spec."""
        tta_specs = _build_tta_specs(self.tta_transformations)
        img_ids = [i] * len(tta_specs)
        X_rows = [row] * len(tta_specs)
        return X_rows, tta_specs, img_ids


class TestTimeAugmentationAggregator(BaseTransformer):
    """Undo the TTA transform on every prediction and aggregate the predictions per image id."""

    def __init__(self, tta_inverse_transform, method, nthreads):
        super().__init__()
        self.tta_inverse_transform = tta_inverse_transform
        self.method = method
        self.nthreads = nthreads

    @property
    def agg_method(self):
        """Aggregation callable for ``method`` ('mean', 'max', 'min' or 'gmean'), bound to the last axis."""
        methods = {'mean': np.mean,
                   'max': np.max,
                   'min': np.min,
                   'gmean': gmean
                   }
        return partial(methods[self.method], axis=-1)

    def transform(self, images, tta_params, img_ids, **kwargs):
        """Return ``{'aggregated_prediction': list}`` with one aggregated array per image id.

        Results are ordered by the first appearance of each id in ``img_ids``.
        """
        # Group predictions by id in a single pass (instead of re-scanning all
        # predictions for every id), remembering first-appearance order.
        groups = OrderedDict()
        for image, tta_param, img_id in zip(images, tta_params, img_ids):
            group_images, group_params = groups.setdefault(img_id, ([], []))
            group_images.append(image)
            group_params.append(tta_param)

        if not groups:
            # A thread pool cannot be created with zero workers.
            return {'aggregated_prediction': []}

        agg_method = self.agg_method
        tta_inverse_transform = self.tta_inverse_transform

        def _aggregate_group(item):
            img_id, (group_images, group_params) = item
            return aggregate_augmentations(img_id,
                                           images=group_images,
                                           tta_params=group_params,
                                           tta_inverse_transform=tta_inverse_transform,
                                           img_ids=[img_id] * len(group_images),
                                           agg_method=agg_method)

        threads = min(self.nthreads, len(groups))
        # ThreadPool is imported explicitly: ``import multiprocessing`` alone
        # does not guarantee that the ``pool`` submodule is loaded.
        with ThreadPool(threads) as executor:
            averages_images = executor.map(_aggregate_group, list(groups.items()))
        return {'aggregated_prediction': averages_images}


def aggregate_augmentations(img_id, images, tta_params, tta_inverse_transform, img_ids, agg_method):
    """Aggregate all predictions that belong to ``img_id``.

    Every ``(image, tta_param)`` pair whose id equals ``img_id`` is passed
    through ``tta_inverse_transform``; the results are stacked on a new last
    axis and reduced with ``agg_method``.

    Raises:
        ValueError: if no prediction belongs to ``img_id``.
    """
    tta_predictions_for_id = [tta_inverse_transform(image, tta_param)
                              for image, tta_param, ids in zip(images, tta_params, img_ids)
                              if ids == img_id]
    if not tta_predictions_for_id:
        raise ValueError('no predictions found for image id {!r}'.format(img_id))
    tta_averaged = agg_method(np.stack(tta_predictions_for_id, axis=-1))
    return tta_averaged


def to_array(x):
    """Convert a PIL image to a float32 numpy array of its single-channel ('L') version."""
    x_ = x.convert('L')  # convert image to monochrome
    x_ = np.array(x_)
    x_ = x_.astype(np.float32)
    return x_


def to_tensor(x):
    """Add a leading axis to a numpy array and wrap it as a torch tensor."""
    x_ = np.expand_dims(x, axis=0)
    x_ = torch.from_numpy(x_)
    return x_

# --------------------------------------------------------------------------------