├── toolkit
    ├── __init__.py
    ├── preprocessing
    │   ├── __init__.py
    │   ├── text.py
    │   └── misc.py
    ├── keras_transformers
    │   ├── __init__.py
    │   ├── loaders.py
    │   ├── callbacks.py
    │   ├── contrib.py
    │   ├── embeddings.py
    │   ├── models.py
    │   └── architectures.py
    ├── catboost_transformers
    │   ├── __init__.py
    │   ├── requirements.txt
    │   └── models.py
    ├── lightgbm_transformers
    │   ├── __init__.py
    │   ├── requirements.txt
    │   └── models.py
    ├── pytorch_transformers
    │   ├── __init__.py
    │   ├── loaders
    │   │   ├── __init__.py
    │   │   ├── classification.py
    │   │   └── segmentation.py
    │   ├── architectures
    │   │   ├── __init__.py
    │   │   ├── utils.py
    │   │   └── unet.py
    │   ├── utils.py
    │   ├── validation.py
    │   ├── models.py
    │   └── callbacks.py
    ├── sklearn_transformers
    │   ├── __init__.py
    │   └── models.py
    ├── xgboost_transformers
    │   ├── __init__.py
    │   ├── requirements.txt
    │   └── models.py
    ├── toolkit_base.py
    ├── resources
    │   └── apostrophes.json
    ├── utils.py
    └── postprocessing.py
├── setup.cfg
├── requirements.txt
├── .github
    └── ISSUE_TEMPLATE
    │   ├── everything-else.md
    │   └── bug.md
├── README.md
├── PULL_REQUEST_TEMPLATE.md
├── CONTRIBUTING.md
├── LICENSE
├── setup.py
├── .gitignore
└── CODE_OF_CONDUCT.md


/toolkit/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/toolkit/preprocessing/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/toolkit/keras_transformers/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/toolkit/catboost_transformers/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/toolkit/lightgbm_transformers/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/toolkit/pytorch_transformers/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/toolkit/sklearn_transformers/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/toolkit/xgboost_transformers/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/toolkit/pytorch_transformers/loaders/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description_file = README.md
3 | 


--------------------------------------------------------------------------------
/toolkit/pytorch_transformers/architectures/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/toolkit/catboost_transformers/requirements.txt:
--------------------------------------------------------------------------------
1 | catboost
2 | steppy
3 | 


--------------------------------------------------------------------------------
/toolkit/toolkit_base.py:
--------------------------------------------------------------------------------
class SteppyToolkitError(Exception):
    """Exception type used by steppy-toolkit modules to report toolkit-specific failures."""
3 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | neptune-cli>=2.8.0
2 | setuptools>=39.2.0
3 | steppy>=0.1.9
4 | 


--------------------------------------------------------------------------------
/toolkit/xgboost_transformers/requirements.txt:
--------------------------------------------------------------------------------
1 | attrdict
2 | steppy
3 | xgboost
4 | 


--------------------------------------------------------------------------------
/toolkit/lightgbm_transformers/requirements.txt:
--------------------------------------------------------------------------------
1 | attrdict
2 | lightgbm
3 | numpy
4 | pandas
5 | scikit-learn
6 | steppy
7 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/everything-else.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: everything else
3 | about: Suggest an idea for this project
4 | 
5 | ---
6 | 
7 | 
8 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Steppy-toolkit
2 | 
3 | A set of tools to make your work with Steppy faster and more effective. [Steppy](https://github.com/minerva-ml/steppy) is a lightweight, open-source Python library for fast and reproducible experimentation.
4 | 


--------------------------------------------------------------------------------
/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
 1 | ## Pull Request template
 2 | 
 3 | ### Code contributions
 4 | The major - and most appreciated - contribution is a pull request with a feature or bug fix. Each pull request initiates a discussion about your code contribution.
 5 | 
 6 | Each pull request should include a minimal description of its contents.
 7 | #
 8 | 
 9 | Thanks!
10 | 
11 | Jakub & Kamil,
12 | 
13 | _core contributors to the minerva.ml_
14 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: bug
 3 | about: Create bug report
 4 | 
 5 | ---
 6 | 
 7 | There are two things that will make the processing of your issue faster:
 8 | 1. Make sure that you are using the latest version of the code,
 9 | 1. When reporting a bug, please provide technical details such as the execution command, the error message, or a script that reproduces the bug.
10 | #
11 | 
12 | Thanks!
13 | 
14 | Kamil & Jakub,
15 | 
16 | *core contributors to the [minerva.ml](https://minerva.ml)*
17 | 


--------------------------------------------------------------------------------
/toolkit/pytorch_transformers/architectures/utils.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | 
 3 | import torch.nn as nn
 4 | 
 5 | 
 6 | class Reshape(nn.Module):
 7 |     def __init__(self, *shape):
 8 |         super(Reshape, self).__init__()
 9 |         self.shape = shape
10 | 
11 |     def forward(self, x):
12 |         return x.view(*self.shape)
13 | 
14 | 
def get_downsample_pad(stride, kernel, dilation=1):
    """Return the integer padding for a strided (optionally dilated) convolution.

    A dilated kernel spans ``dilation * (kernel - 1) + 1`` input positions, so the
    total padding needed is ``dilation * (kernel - 1) + 1 - stride``; half of that,
    rounded up, is returned.

    The previous expression multiplied ``dilation`` by ``kernel`` instead of
    ``kernel - 1``, which over-padded whenever ``dilation != 1``. Results for the
    default ``dilation=1`` are unchanged.

    Args:
        stride: convolution stride.
        kernel: kernel size along the padded dimension.
        dilation: spacing between kernel elements (default 1).

    Returns:
        int: padding value.
    """
    effective_kernel = dilation * (kernel - 1) + 1
    return int(math.ceil((effective_kernel - stride) / 2))
17 | 
18 | 
def get_upsample_pad(stride, kernel, dilation=1):
    """Return a 2-tuple of padding values derived from ``kernel - stride``.

    Presumably the tuple is ``(padding, output_padding)`` for a transposed
    convolution - confirm against the caller. ``dilation`` is accepted but not used.

    * kernel smaller than stride -> ``(0, stride - kernel)``
    * even non-negative difference -> ``(difference / 2, 0)``
    * odd positive difference -> ``(ceil(difference / 2), 1)``
    """
    excess = kernel - stride
    if excess < 0:
        return (0, stride - kernel)
    if excess % 2 == 0:
        return (int(excess / 2), 0)
    return (int(math.ceil(excess / 2)), 1)
26 | 


--------------------------------------------------------------------------------
/toolkit/pytorch_transformers/utils.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | 
 4 | def persist_torch_model(model, path):
 5 |     model.eval()
 6 |     if torch.cuda.is_available():
 7 |         model.cpu()
 8 |         torch.save(model.state_dict(), path)
 9 |         model.cuda()
10 |     else:
11 |         torch.save(model.state_dict(), path)
12 |     model.train()
13 | 
14 | 
class Averager:
    """Running mean of the values passed to ``send`` since construction or the last ``reset``."""

    def __init__(self):
        self.current_total, self.iterations = 0.0, 0.0

    def send(self, value):
        """Add ``value`` to the running total and count one more observation."""
        self.current_total += value
        self.iterations += 1

    @property
    def value(self):
        """Mean of the sent values, or ``0`` when nothing has been sent yet."""
        if self.iterations:
            return 1.0 * self.current_total / self.iterations
        return 0

    def reset(self):
        """Forget all values sent so far."""
        self.current_total, self.iterations = 0.0, 0.0
34 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing to the Steppy-toolkit
 2 | 
 3 | Here at [minerva.ml](https://minerva.ml), we are creating Steppy - a lightweight, open-source Python library for fast and reproducible experimentation.
 4 | 
 5 | ### Get involved
 6 | You are welcome to contribute to the Steppy-toolkit. To get started:
 7 | 1. Check [steppy core library kanban board](https://github.com/minerva-ml/steppy/projects/1) to see what we are working on right now.
 8 | 1. Express your interest in a particular [issue](https://github.com/minerva-ml/steppy/issues) by submitting a comment or,
 9 |     * submit your own [issue](https://github.com/minerva-ml/steppy/issues).
10 | 1. We will get back to you in order to start working together.
11 | 
12 | ### Code contributions
13 | The major - and most appreciated - contribution is a [pull request](https://github.com/minerva-ml/steppy-toolkit/pulls) with a feature or bug fix.
14 | 
15 | ### Remarks
16 | In case of custom ideas, please contact core contributors directly at ml-team@neptune.ml.
17 | #
18 | 
19 | Thanks!
20 | 
21 | Jakub & Kamil,
22 | 
23 | *core contributors to the [minerva.ml](https://minerva.ml)*
24 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 neptune.ml
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
# Long description passed to setup() below as the package's long_description.
long_description = '''
Steppy-toolkit is complementary to the steppy library.

The goal of this package is to provide data scientist
with curated collection of highly parameterizable implementations of neural networks
together with a number of pre- and post-processing routines.

Steppy-toolkit offers implementations in popular frameworks, such as PyTorch, Keras and scikit-learn.

Steppy-toolkit is compatible with Python>=3.5
and is distributed under the MIT license.
'''

# Package metadata. The version string appears twice (``version`` and the
# archive name in ``download_url``); keep both in sync when releasing.
setup(name='steppy-toolkit',
      # find_packages() discovers every directory containing an __init__.py.
      packages=find_packages(),
      version='0.1.14',
      description='Set of tools to make your work with steppy faster and more effective.',
      long_description=long_description,
      url='https://github.com/minerva-ml/steppy-toolkit',
      download_url='https://github.com/minerva-ml/steppy-toolkit/archive/0.1.14.tar.gz',
      author='Kamil A. Kaczmarek, Jakub Czakon',
      author_email='kamil.kaczmarek@neptune.ml, jakub.czakon@neptune.ml',
      keywords=['machine-learning', 'reproducibility', 'pipeline', 'tools'],
      license='MIT',
      # NOTE(review): the repository-level requirements.txt lists lower minimum
      # versions (neptune-cli>=2.8.0, steppy>=0.1.9) - confirm which is intended.
      install_requires=[
          'neptune-cli>=2.8.17',
          'setuptools>=39.2.0',
          'steppy>=0.1.15'],
      zip_safe=False,
      classifiers=[])
33 | 


--------------------------------------------------------------------------------
/toolkit/resources/apostrophes.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "arent": "are not",
 3 |   "cant": "cannot",
 4 |   "couldnt": "could not",
 5 |   "didnt": "did not",
 6 |   "doesnt": "does not",
 7 |   "dont": "do not",
 8 |   "hadnt": "had not",
 9 |   "hasnt": "has not",
10 |   "havent": "have not",
11 |   "hed": "he would",
12 |   "hell": "he will",
13 |   "hes": "he is",
14 |   "id": "I had",
15 |   "ill": "I will",
16 |   "im": "I am",
17 |   "isnt": "is not",
18 |   "its": "it is",
19 |   "itll": "it will",
20 |   "ive": "I have",
21 |   "lets": "let us",
22 |   "mightnt": "might not",
23 |   "mustnt": "must not",
24 |   "shant": "shall not",
25 |   "shed" : "she would",
26 |   "shell": "she will",
27 |   "shes": "she is",
28 |   "shouldnt": "should not",
29 |   "thats": "that is",
30 |   "theres": "there is",
31 |   "theyd": "they would",
32 |   "theyll": "they will",
33 |   "theyre": "they are",
34 |   "theyve": "they have",
35 |   "wed": "we would",
36 |   "were": "we are",
37 |   "werent": "were not",
38 |   "weve": "we have",
39 |   "whatll": "what will",
40 |   "whatre": "what are",
41 |   "whats": "what is",
42 |   "whatve": "what have",
43 |   "wheres": "where is",
44 |   "whod": "who would",
45 |   "wholl": "who will",
46 |   "whore": "who are",
47 |   "whos": "who is",
48 |   "whove": "who have",
49 |   "wont": "will not",
50 |   "wouldnt": "would not",
51 |   "youd": "you would",
52 |   "youll": "you will",
53 |   "youre": "you are",
54 |   "youve": "you have",
55 |   "re":  "are",
56 |   "wasnt": "was not",
57 |   "well":  "will"
58 | }
59 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | .pytest_cache
  6 | tests/.cache
  7 | 
  8 | # C extensions
  9 | *.so
 10 | 
 11 | # neptune, pycharm
 12 | .cache
 13 | .cache/
 14 | .idea/
 15 | .idea_modules/
 16 | out/
 17 | output
 18 | output/
 19 | *.log
 20 | target/
 21 | devbook.ipynb
 22 | devbook_local.ipynb
 23 | 
 24 | # Distribution / packaging
 25 | .Python
 26 | env/
 27 | build/
 28 | develop-eggs/
 29 | dist/
 30 | downloads/
 31 | eggs/
 32 | .eggs/
 33 | lib/
 34 | lib64/
 35 | parts/
 36 | sdist/
 37 | var/
 38 | wheels/
 39 | *.egg-info/
 40 | .installed.cfg
 41 | *.egg
 42 | 
 43 | # PyInstaller
 44 | #  Usually these files are written by a python script from a template
 45 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 46 | *.manifest
 47 | *.spec
 48 | 
 49 | # Installer logs
 50 | pip-log.txt
 51 | pip-delete-this-directory.txt
 52 | 
 53 | # Unit test / coverage reports
 54 | htmlcov/
 55 | .tox/
 56 | .coverage
 57 | .coverage.*
 58 | nosetests.xml
 59 | coverage.xml
 60 | *.cover
 61 | .hypothesis/
 62 | 
 63 | # Translations
 64 | *.mo
 65 | *.pot
 66 | 
 67 | # Django stuff:
 68 | local_settings.py
 69 | 
 70 | # Flask stuff:
 71 | instance/
 72 | .webassets-cache
 73 | 
 74 | # Scrapy stuff:
 75 | .scrapy
 76 | 
 77 | # Sphinx documentation
 78 | docs/_build/
 79 | 
 80 | # Jupyter Notebook
 81 | Untitled*.ipynb
 82 | .ipynb_checkpoints
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # celery beat schedule file
 88 | celerybeat-schedule
 89 | 
 90 | # SageMath parsed files
 91 | *.sage.py
 92 | 
 93 | # dotenv
 94 | .env
 95 | 
 96 | # virtualenv
 97 | .venv
 98 | venv/
 99 | ENV/
100 | 
101 | # Spyder project settings
102 | .spyderproject
103 | .spyproject
104 | 
105 | # Rope project settings
106 | .ropeproject
107 | 
108 | # mkdocs documentation
109 | /site
110 | 
111 | # mypy
112 | .mypy_cache/
113 | 
114 | # Working directories
115 | examples/cache/
116 | 


--------------------------------------------------------------------------------
/toolkit/utils.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import time
 3 | 
 4 | import imgaug as ia
 5 | import numpy as np
 6 | from PIL import Image
 7 | from imgaug import augmenters as iaa
 8 | from pycocotools import mask as cocomask
 9 | 
10 | 
def from_pil(*images):
    """Convert each argument with ``np.array``.

    Returns the single array when exactly one image is given, otherwise a list
    of arrays (an empty list when called without arguments).
    """
    arrays = list(map(np.array, images))
    return arrays[0] if len(arrays) == 1 else arrays
17 | 
18 | 
def to_pil(*images):
    """Cast each array to ``uint8`` and wrap it with ``Image.fromarray``.

    Returns the single PIL image when exactly one array is given, otherwise a
    list of images (an empty list when called without arguments).
    """
    converted = []
    for image in images:
        converted.append(Image.fromarray(image.astype(np.uint8)))
    if len(converted) != 1:
        return converted
    return converted[0]
25 | 
26 | 
def rle_from_binary(prediction):
    """Encode ``prediction`` with pycocotools' mask encoder.

    The array is converted to Fortran (column-major) order before being handed
    to ``cocomask.encode``.
    """
    fortran_ordered = np.asfortranarray(prediction)
    encoded = cocomask.encode(fortran_ordered)
    return encoded
30 | 
31 | 
def binary_from_rle(rle):
    """Decode ``rle`` with pycocotools' ``cocomask.decode``; the inverse of ``rle_from_binary``."""
    decoded = cocomask.decode(rle)
    return decoded
34 | 
35 | 
class ImgAug:
    """Callable that applies one freshly reseeded imgaug sequence to every image passed in a call."""

    def __init__(self, augmenters):
        # A single augmenter is wrapped into a one-element list; a list is kept as is.
        self.augmenters = augmenters if isinstance(augmenters, list) else [augmenters]
        self.seq_det = None

    def _pre_call_hook(self):
        """Build a new ``iaa.Sequential`` from the augmenters and reseed it deterministically."""
        self.seq_det = reseed(iaa.Sequential(self.augmenters), deterministic=True)

    def transform(self, *images):
        """Augment each image with the current sequence.

        Returns the single result for one image, otherwise a list of results.
        """
        augmented = [self.seq_det.augment_image(image) for image in images]
        return augmented[0] if len(augmented) == 1 else augmented

    def __call__(self, *args):
        self._pre_call_hook()
        return self.transform(*args)
58 | 
59 | 
def get_seed():
    """Return an integer seed: the current Unix time in whole seconds plus the process id."""
    return int(time.time()) + int(os.getpid())
63 | 
64 | 
def reseed(augmenter, deterministic=True):
    """Give ``augmenter`` and all of its children a new random state.

    Each augmenter receives ``ia.new_random_state(get_seed())`` and, when
    ``deterministic`` is true, has its ``deterministic`` attribute set to True.

    The flag is now forwarded to child augmenters. Previously the recursion
    always passed ``deterministic=True``, so children were forced deterministic
    even when the caller asked for ``deterministic=False``. Behaviour with the
    default ``deterministic=True`` is unchanged.

    Args:
        augmenter: imgaug augmenter, modified in place.
        deterministic: whether to mark the augmenter tree as deterministic.

    Returns:
        The same ``augmenter`` object.
    """
    augmenter.random_state = ia.new_random_state(get_seed())
    if deterministic:
        augmenter.deterministic = True

    for children in augmenter.get_children_lists():
        for child in children:
            # Mutates the child in place; the return value is the child itself.
            reseed(child, deterministic=deterministic)
    return augmenter
74 | 


--------------------------------------------------------------------------------
/toolkit/keras_transformers/loaders.py:
--------------------------------------------------------------------------------
 1 | from keras.preprocessing import text, sequence
 2 | from sklearn.externals import joblib
 3 | 
 4 | from steppy.base import BaseTransformer
 5 | 
 6 | 
 7 | class Tokenizer(BaseTransformer):
 8 |     def __init__(self, char_level, maxlen, num_words):
 9 |         super().__init__()
10 |         self.char_level = char_level
11 |         self.maxlen = maxlen
12 |         self.num_words = num_words
13 | 
14 |         self.tokenizer = text.Tokenizer(char_level=self.char_level, num_words=self.num_words)
15 | 
16 |     def fit(self, X, X_valid=None, train_mode=True):
17 |         self.tokenizer.fit_on_texts(X)
18 |         return self
19 | 
20 |     def transform(self, X, X_valid=None, train_mode=True):
21 |         X_tokenized = self._transform(X)
22 | 
23 |         if X_valid is not None:
24 |             X_valid_tokenized = self._transform(X_valid)
25 |         else:
26 |             X_valid_tokenized = None
27 |         return {'X': X_tokenized,
28 |                 'X_valid': X_valid_tokenized,
29 |                 'tokenizer': self.tokenizer}
30 | 
31 |     def _transform(self, X):
32 |         list_tokenized = self.tokenizer.texts_to_sequences(list(X))
33 |         X_tokenized = sequence.pad_sequences(list_tokenized, maxlen=self.maxlen)
34 |         return X_tokenized
35 | 
36 |     def load(self, filepath):
37 |         object_pickle = joblib.load(filepath)
38 |         self.char_level = object_pickle['char_level']
39 |         self.maxlen = object_pickle['maxlen']
40 |         self.num_words = object_pickle['num_words']
41 |         self.tokenizer = object_pickle['tokenizer']
42 |         return self
43 | 
44 |     def persist(self, filepath):
45 |         object_pickle = {'char_level': self.char_level,
46 |                          'maxlen': self.maxlen,
47 |                          'num_words': self.num_words,
48 |                          'tokenizer': self.tokenizer}
49 |         joblib.dump(object_pickle, filepath)
50 | 
51 | 
class TextAugmenter(BaseTransformer):
    """
    Augmentations by Thesaurus synonym substitution or typos.

    Placeholder: the class defines nothing of its own yet.
    """
57 | 


--------------------------------------------------------------------------------
/toolkit/catboost_transformers/models.py:
--------------------------------------------------------------------------------
 1 | from toolkit.toolkit_base import SteppyToolkitError
 2 | 
 3 | try:
 4 |     import catboost as ctb
 5 |     from catboost import CatBoostClassifier
 6 |     from steppy.base import BaseTransformer
 7 |     from steppy.utils import get_logger
 8 | 
 9 |     from toolkit.sklearn_transformers.models import MultilabelEstimators
10 | except ImportError as e:
11 |     msg = 'SteppyToolkitError: you have missing modules. Install requirements specific to catboost_transformers.' \
12 |           'Use this file: toolkit/catboost_transformers/requirements.txt'
13 |     raise SteppyToolkitError(msg) from e
14 | 
15 | logger = get_logger()
16 | 
17 | 
class CatboostClassifierMultilabel(MultilabelEstimators):
    """Multilabel estimator wrapper that uses ``CatBoostClassifier`` as its estimator class."""

    @property
    def estimator(self):
        # The class itself is returned, not an instance.
        estimator_class = CatBoostClassifier
        return estimator_class
22 | 
23 | 
class CatBoost(BaseTransformer):
    """Steppy transformer around ``catboost.CatBoostClassifier``."""

    def __init__(self, **kwargs):
        super().__init__()
        # All keyword arguments are forwarded to the CatBoost classifier.
        self.estimator = ctb.CatBoostClassifier(**kwargs)

    def fit(self,
            X, y,
            X_valid, y_valid,
            feature_names=None,
            categorical_features=None,
            **kwargs):
        """Fit the classifier on ``(X, y)`` with ``(X_valid, y_valid)`` as the eval set.

        ``categorical_features`` are names looked up in ``feature_names`` to get
        the column indices passed to CatBoost. Extra keyword arguments are ignored.
        """
        shape_reports = (
            ('Catboost, train data shape        {}', X),
            ('Catboost, validation data shape   {}', X_valid),
            ('Catboost, train labels shape      {}', y),
            ('Catboost, validation labels shape {}', y_valid),
        )
        for template, data in shape_reports:
            logger.info(template.format(data.shape))

        cat_feature_indices = self._get_categorical_indices(feature_names, categorical_features)
        self.estimator.fit(X, y,
                           eval_set=(X_valid, y_valid),
                           cat_features=cat_feature_indices)
        return self

    def transform(self, X, **kwargs):
        """Return column 1 of ``predict_proba(X)`` under the key ``'prediction'``."""
        probabilities = self.estimator.predict_proba(X)
        return {'prediction': probabilities[:, 1]}

    def load(self, filepath):
        """Load a saved CatBoost model from ``filepath`` into the estimator."""
        self.estimator.load_model(filepath)
        return self

    def persist(self, filepath):
        """Save the CatBoost model to ``filepath``."""
        self.estimator.save_model(filepath)

    def _get_categorical_indices(self, feature_names, categorical_features):
        """Map categorical feature names to their positions in ``feature_names``.

        Returns ``None`` when ``categorical_features`` is empty or ``None``.
        """
        if not categorical_features:
            return None
        return [feature_names.index(feature) for feature in categorical_features]
63 | 


--------------------------------------------------------------------------------
/toolkit/keras_transformers/callbacks.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | from deepsense import neptune
 4 | from keras import backend as K
 5 | from keras.callbacks import Callback
 6 | 
 7 | 
 8 | class NeptuneMonitor(Callback):
 9 |     def __init__(self, model_name):
10 |         super().__init__()
11 |         self.model_name = model_name
12 |         self.ctx = neptune.Context()
13 |         self.batch_loss_channel_name = get_correct_channel_name(self.ctx,
14 |                                                                 '{} Batch Log-loss training'.format(self.model_name))
15 |         self.epoch_loss_channel_name = get_correct_channel_name(self.ctx,
16 |                                                                 '{} Log-loss training'.format(self.model_name))
17 |         self.epoch_val_loss_channel_name = get_correct_channel_name(self.ctx,
18 |                                                                     '{} Log-loss validation'.format(self.model_name))
19 | 
20 |         self.epoch_id = 0
21 |         self.batch_id = 0
22 | 
23 |     def on_batch_end(self, batch, logs={}):
24 |         self.batch_id += 1
25 |         self.ctx.channel_send(self.batch_loss_channel_name, self.batch_id, logs['loss'])
26 | 
27 |     def on_epoch_end(self, epoch, logs={}):
28 |         self.epoch_id += 1
29 |         self.ctx.channel_send(self.epoch_loss_channel_name, self.epoch_id, logs['loss'])
30 |         self.ctx.channel_send(self.epoch_val_loss_channel_name, self.epoch_id, logs['val_loss'])
31 | 
32 | 
class ReduceLR(Callback):
    """Keras callback that multiplies the optimizer learning rate by ``gamma`` after every epoch.

    Args:
        gamma: multiplicative factor applied to the learning rate; ``None``
            turns the callback into a no-op.
    """

    def __init__(self, gamma):
        # Initialise the Callback base class, as NeptuneMonitor in this module
        # does; the original skipped this call.
        super().__init__()
        self.gamma = gamma

    def on_epoch_end(self, epoch, logs={}):
        if self.gamma is None:
            return
        learning_rate = self.model.optimizer.lr
        K.set_value(learning_rate, self.gamma * K.get_value(learning_rate))
40 | 
41 | 
class UnfreezeLayers(Callback):
    """Keras callback that marks a range of model layers as trainable at a given epoch.

    Args:
        unfreeze_on_epoch: value of the internal epoch counter (starting at 0)
            at which the layers are unfrozen.
        from_layer: index of the first layer to unfreeze (inclusive).
        to_layer: index of the last layer to unfreeze (inclusive).
    """

    def __init__(self, unfreeze_on_epoch, from_layer=0, to_layer=1):
        # Initialise the Callback base class, as NeptuneMonitor in this module
        # does; the original skipped this call.
        super().__init__()
        self.unfreeze_on_epoch = unfreeze_on_epoch
        self.from_layer = from_layer
        self.to_layer = to_layer

        self.epoch_id = 0
        self.batch_id = 0

    def on_epoch_end(self, epoch, logs={}):
        # The callback keeps its own epoch counter and ignores the ``epoch`` argument.
        if self.epoch_id == self.unfreeze_on_epoch:
            # NOTE(review): Keras may need the model to be recompiled before a
            # changed ``trainable`` flag takes effect - confirm for the Keras version in use.
            for i, layer in enumerate(self.model.layers):
                if self.from_layer <= i <= self.to_layer:
                    layer.trainable = True
        self.epoch_id += 1
57 | 
58 | 
def get_correct_channel_name(ctx, name):
    """Return a channel name based on ``name`` that does not clash with existing channels.

    Channels of the context's experiment whose name contains ``name`` are
    collected. If there are none, ``name`` is returned unchanged. Otherwise the
    trailing run of digits of each matching channel name is read as an id (no
    trailing digits counts as 0) and ``'<name> <max id + 1>'`` is returned.

    Args:
        ctx: Neptune context; ``ctx._experiment._channels`` is read.
        name: desired channel name.

    Returns:
        str: ``name`` itself or ``name`` followed by a space and the next free id.
    """
    matching = [channel for channel in ctx._experiment._channels if name in channel.name]
    if not matching:
        return name
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    suffixes = (re.split(r'[^\d]', channel.name)[-1] for channel in matching)
    last_id = max(int(suffix) if suffix else 0 for suffix in suffixes)
    return '{} {}'.format(name, last_id + 1)
69 | 


--------------------------------------------------------------------------------
/toolkit/keras_transformers/contrib.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from __future__ import absolute_import, division
 3 | 
 4 | import sys
 5 | from os.path import dirname
 6 | 
 7 | sys.path.append(dirname(dirname(__file__)))
 8 | from keras import initializers
 9 | from keras.engine import InputSpec, Layer
10 | from keras import backend as K
11 | import tensorflow as tf
12 | 
13 | 
class AttentionWeightedAverage(Layer):
    """
    Computes a weighted average of the different channels across timesteps.
    Uses 1 parameter per channel to compute the attention value for a single timestep.

    With ``return_attention=True`` the layer returns ``[result, att_weights]``
    instead of only ``result``.
    """

    def __init__(self, return_attention=False, **kwargs):
        # Attributes are set before the base-class constructor runs; remaining
        # keyword arguments are forwarded to ``Layer.__init__``.
        self.init = initializers.get('uniform')
        self.supports_masking = True
        self.return_attention = return_attention
        super(AttentionWeightedAverage, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the attention weight ``W`` of shape ``(input_shape[2], 1)``."""
        # Only rank-3 inputs are accepted; presumably (batch, timesteps, channels) - TODO confirm.
        self.input_spec = [InputSpec(ndim=3)]
        assert len(input_shape) == 3

        self.W = self.add_weight(shape=(input_shape[2], 1),
                                 name='{}_W'.format(self.name),
                                 initializer=self.init)
        # NOTE(review): assigning ``trainable_weights`` directly may not be
        # allowed in newer Keras versions - confirm for the version in use.
        self.trainable_weights = [self.W]
        super(AttentionWeightedAverage, self).build(input_shape)

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1 (and the weights if requested)."""
        # computes a probability distribution over the timesteps
        # uses 'max trick' for numerical stability
        # reshape is done to avoid issue with Tensorflow
        # and 1-dimensional weights
        logits = K.dot(x, self.W)
        x_shape = K.shape(x)
        logits = K.reshape(logits, (x_shape[0], x_shape[1]))
        ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True))

        # masked timesteps have zero weight
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            ai = ai * mask
        # Normalise so the weights sum to (almost) one along axis 1; epsilon guards against division by zero.
        att_weights = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon())
        weighted_input = x * K.expand_dims(att_weights)
        result = K.sum(weighted_input, axis=1)
        if self.return_attention:
            return [result, att_weights]
        return result

    def get_output_shape_for(self, input_shape):
        # Alias that delegates to compute_output_shape; presumably kept for older Keras APIs - TODO confirm.
        return self.compute_output_shape(input_shape)

    def compute_output_shape(self, input_shape):
        """Return ``(input_shape[0], input_shape[2])``, plus ``(input_shape[0], input_shape[1])`` for the weights if requested."""
        output_len = input_shape[2]
        if self.return_attention:
            return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
        return (input_shape[0], output_len)

    def compute_mask(self, input, input_mask=None):
        """Return no mask: ``None``, or a list of ``None`` matching a list-valued ``input_mask``."""
        if isinstance(input_mask, list):
            return [None] * len(input_mask)
        else:
            return None
71 | 
72 | 
def pair_loss(y_true, y_pred):
    """Mean of ``sigmoid(negative - positive)`` over all (negative, positive) prediction pairs.

    ``y_pred`` is split into two partitions by ``y_true`` cast to int32
    (partition 0 = negatives, partition 1 = positives). The pairwise differences
    come from broadcasting a column of negatives against a row of positives -
    this assumes the partitions are 1-D; TODO confirm.
    """
    labels = tf.cast(y_true, tf.int32)
    negatives, positives = tf.dynamic_partition(y_pred, labels, 2)
    positives_row = tf.expand_dims(positives, 0)
    negatives_col = tf.expand_dims(negatives, -1)
    return K.mean(K.sigmoid(negatives_col - positives_row))
82 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Contributor Covenant Code of Conduct
 2 | 
 3 | ## Our Pledge
 4 | 
 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation.
 6 | 
 7 | ## Our Standards
 8 | 
 9 | Examples of behavior that contributes to creating a positive environment include:
10 | 
11 | * Using welcoming and inclusive language
12 | * Being respectful of differing viewpoints and experiences
13 | * Gracefully accepting constructive criticism
14 | * Focusing on what is best for the community
15 | * Showing empathy towards other community members
16 | 
17 | Examples of unacceptable behavior by participants include:
18 | 
19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances
20 | * Trolling, insulting/derogatory comments, and personal or political attacks
21 | * Public or private harassment
22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission
23 | * Other conduct which could reasonably be considered inappropriate in a professional setting
24 | 
25 | ## Our Responsibilities
26 | 
27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
28 | 
29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
30 | 
31 | ## Scope
32 | 
33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.
34 | 
35 | ## Enforcement
36 | 
37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at ml-team@neptune.ml. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.
38 | 
39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.
40 | 
41 | ## Attribution
42 | 
43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version]
44 | 
45 | [homepage]: http://contributor-covenant.org
46 | [version]: http://contributor-covenant.org/version/1/4/
47 | 


--------------------------------------------------------------------------------
/toolkit/pytorch_transformers/validation.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | from sklearn.metrics import accuracy_score
  5 | from torch.autograd import Variable
  6 | 
  7 | 
  8 | class DiceLoss(nn.Module):
  9 |     def __init__(self):
 10 |         super(DiceLoss, self).__init__()
 11 |         self.sigmoid = nn.Sigmoid()
 12 | 
 13 |     def forward(self, output, target):
 14 |         prediction = self.sigmoid(output)
 15 |         return 1 - 2 * torch.sum(prediction * target) / (torch.sum(prediction) + torch.sum(target) + 1e-7)
 16 | 
 17 | 
 18 | def segmentation_loss(output, target, weight_bce=1.0, weight_dice=1.0):
 19 |     bce = nn.BCEWithLogitsLoss()
 20 |     dice = DiceLoss()
 21 |     return weight_bce*bce(output, target) + weight_dice*dice(output, target)
 22 | 
 23 | 
 24 | def cross_entropy(output, target, squeeze=False):
 25 |     if squeeze:
 26 |         target = target.squeeze(1)
 27 |     return F.nll_loss(output, target)
 28 | 
 29 | 
 30 | def mse(output, target, squeeze=False):
 31 |     if squeeze:
 32 |         target = target.squeeze(1)
 33 |     return F.mse_loss(output, target)
 34 | 
 35 | 
 36 | def multi_output_cross_entropy(outputs, targets):
 37 |     losses = []
 38 |     for output, target in zip(outputs, targets):
 39 |         loss = cross_entropy(output, target)
 40 |         losses.append(loss)
 41 |     return sum(losses) / len(losses)
 42 | 
 43 | 
 44 | def score_model(model, loss_function, datagen):
 45 |     batch_gen, steps = datagen
 46 |     partial_batch_losses = {}
 47 |     for batch_id, data in enumerate(batch_gen):
 48 |         X = data[0]
 49 |         targets_tensors = data[1:]
 50 | 
 51 |         if torch.cuda.is_available():
 52 |             X = Variable(X, volatile=True).cuda()
 53 |             targets_var = []
 54 |             for target_tensor in targets_tensors:
 55 |                 targets_var.append(Variable(target_tensor, volatile=True).cuda())
 56 |         else:
 57 |             X = Variable(X, volatile=True)
 58 |             targets_var = []
 59 |             for target_tensor in targets_tensors:
 60 |                 targets_var.append(Variable(target_tensor, volatile=True))
 61 | 
 62 |         outputs = model(X)
 63 |         if len(loss_function) == 1:
 64 |             for (name, loss_function_one, weight), target in zip(loss_function, targets_var):
 65 |                 loss_sum = loss_function_one(outputs, target) * weight
 66 |         else:
 67 |             batch_losses = []
 68 |             for (name, loss_function_one, weight), output, target in zip(loss_function, outputs, targets_var):
 69 |                 loss = loss_function_one(output, target) * weight
 70 |                 batch_losses.append(loss)
 71 |                 partial_batch_losses.setdefault(name, []).append(loss)
 72 |             loss_sum = sum(batch_losses)
 73 |         partial_batch_losses.setdefault('sum', []).append(loss_sum)
 74 |         if batch_id == steps:
 75 |             break
 76 |     average_losses = {name: sum(losses) / steps for name, losses in partial_batch_losses.items()}
 77 |     return average_losses
 78 | 
 79 | 
 80 | def torch_acc_score(output, target):
 81 |     output = output.data.cpu().numpy()
 82 |     y_true = target.numpy()
 83 |     y_pred = output.argmax(axis=1)
 84 |     return accuracy_score(y_true, y_pred)
 85 | 
 86 | 
 87 | def torch_acc_score_multi_output(outputs, targets, take_first=None):
 88 |     accuracies = []
 89 |     for i, (output, target) in enumerate(zip(outputs, targets)):
 90 |         if i == take_first:
 91 |             break
 92 |         accuracy = torch_acc_score(output, target)
 93 |         accuracies.append(accuracy)
 94 |     avg_accuracy = sum(accuracies) / len(accuracies)
 95 |     return avg_accuracy
 96 | 
 97 | 
 98 | def multiclass_segmentation_loss(output, target):
 99 |     cross_entropy = nn.CrossEntropyLoss()
100 |     return cross_entropy(output, target)
101 | 


--------------------------------------------------------------------------------
/toolkit/pytorch_transformers/loaders/classification.py:
--------------------------------------------------------------------------------
  1 | from math import ceil
  2 | 
  3 | import numpy as np
  4 | import torch
  5 | import torchvision.transforms as transforms
  6 | from PIL import Image
  7 | from sklearn.externals import joblib
  8 | from steppy.base import BaseTransformer
  9 | from torch.utils.data import Dataset, DataLoader
 10 | 
 11 | 
 12 | class MetadataImageDataset(Dataset):
 13 |     def __init__(self, X, y, image_transform, target_transform, image_augment):
 14 |         super().__init__()
 15 |         self.X = X
 16 |         if y is not None:
 17 |             self.y = y
 18 |         else:
 19 |             self.y = None
 20 | 
 21 |         self.image_transform = image_transform
 22 |         self.image_augment = image_augment
 23 |         self.target_transform = target_transform
 24 | 
 25 |     def load_image(self, img_filepath):
 26 |         image = np.asarray(Image.open(img_filepath))
 27 |         image = image / 255.0
 28 |         return image
 29 | 
 30 |     def __len__(self):
 31 |         return self.X.shape[0]
 32 | 
 33 |     def __getitem__(self, index):
 34 |         img_filepath = self.X[index]
 35 | 
 36 |         Xi = self.load_image(img_filepath)
 37 | 
 38 |         if self.image_augment is not None:
 39 |             Xi = self.image_augment(Xi)
 40 | 
 41 |         if self.image_transform is not None:
 42 |             Xi = self.image_transform(Xi)
 43 |         if self.y is not None:
 44 |             yi = self.y[index]
 45 |             if self.target_transform is not None:
 46 |                 yi = self.target_transform(yi)
 47 |             return Xi, yi
 48 |         else:
 49 |             return Xi
 50 | 
 51 | 
 52 | class MetadataImageLoader(BaseTransformer):
 53 |     def __init__(self, loader_params):
 54 |         super().__init__()
 55 |         self.loader_params = loader_params
 56 | 
 57 |         self.dataset = MetadataImageDataset
 58 |         self.image_transform = transforms.ToTensor()
 59 |         self.target_transform = target_transform
 60 |         self.image_augment = None
 61 | 
 62 |     def transform(self, X, y, validation_data, train_mode):
 63 |         if train_mode:
 64 |             flow, steps = self.get_datagen(X, y, train_mode, self.loader_params['training'])
 65 |         else:
 66 |             flow, steps = self.get_datagen(X, y, train_mode, self.loader_params['inference'])
 67 | 
 68 |         if validation_data is not None:
 69 |             X_valid, y_valid = validation_data
 70 |             valid_flow, valid_steps = self.get_datagen(X_valid, y_valid, False, self.loader_params['inference'])
 71 |         else:
 72 |             valid_flow = None
 73 |             valid_steps = None
 74 | 
 75 |         return {'datagen': (flow, steps),
 76 |                 'validation_datagen': (valid_flow, valid_steps)}
 77 | 
 78 |     def get_datagen(self, X, y, train_mode, loader_params):
 79 |         if train_mode:
 80 |             dataset = self.dataset(X, y,
 81 |                                    image_augment=self.image_augment,
 82 |                                    image_transform=self.image_transform,
 83 |                                    target_transform=self.target_transform)
 84 | 
 85 |         else:
 86 |             dataset = self.dataset(X, y,
 87 |                                    image_augment=None,
 88 |                                    image_transform=self.image_transform,
 89 |                                    target_transform=self.target_transform)
 90 |         datagen = DataLoader(dataset, **loader_params)
 91 |         steps = ceil(X.shape[0] / loader_params['batch_size'])
 92 |         return datagen, steps
 93 | 
 94 |     def load(self, filepath):
 95 |         params = joblib.load(filepath)
 96 |         self.loader_params = params['loader_params']
 97 |         return self
 98 | 
 99 |     def persist(self, filepath):
100 |         params = {'loader_params': self.loader_params}
101 |         joblib.dump(params, filepath)
102 | 
103 | 
def target_transform(y):
    """Convert the numpy array ``y`` into a ``torch.LongTensor``."""
    tensor = torch.from_numpy(y)
    return tensor.type(torch.LongTensor)
106 | 


--------------------------------------------------------------------------------
/toolkit/xgboost_transformers/models.py:
--------------------------------------------------------------------------------
 1 | from toolkit.toolkit_base import SteppyToolkitError
 2 | 
 3 | try:
 4 |     import xgboost as xgb
 5 |     from attrdict import AttrDict
 6 |     from steppy.base import BaseTransformer
 7 |     from steppy.utils import get_logger
 8 |     from xgboost import XGBClassifier
 9 | 
10 |     from toolkit.sklearn_transformers.models import MultilabelEstimators
11 | except ImportError as e:
12 |     msg = 'SteppyToolkitError: you have missing modules. Install requirements specific to xgboost_transformers.' \
13 |           'Use this file: toolkit/xgboost_transformers/requirements.txt'
14 |     raise SteppyToolkitError(msg) from e
15 | 
16 | logger = get_logger()
17 | 
18 | 
class XGBoostClassifierMultilabel(MultilabelEstimators):
    """``MultilabelEstimators`` variant that uses ``XGBClassifier`` as its estimator class."""

    @property
    def estimator(self):
        # Returns the estimator class itself, not an instance.
        return XGBClassifier
23 | 
24 | 
class XGBoost(BaseTransformer):
    """
    Accepts four dictionaries that reflect the XGBoost API:
        - dmatrix_parameters  -> parameters of the xgboost.DMatrix class.
          See: https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.DMatrix
        - training_parameters -> parameters of the xgboost.train function.
          See: https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.train
        - predict_parameters -> parameters of the xgboost.Booster.predict function.
          See: https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.Booster.predict
        - booster_parameters  -> parameters of the Booster.
          See: https://xgboost.readthedocs.io/en/latest/parameter.html

    Every dictionary is optional; ``None`` is replaced by an empty dict.

    Raises:
        SteppyToolkitError: if a provided parameter set is not a dict.
    """
    def __init__(self,
                 dmatrix_parameters=None,
                 training_parameters=None,
                 predict_parameters=None,
                 booster_parameters=None):
        super().__init__()
        logger.info('initializing XGBoost transformer')

        # Validate eagerly so a wrong argument type fails here with a clear
        # message instead of later, when the dicts are unpacked with ``**``.
        provided = (('dmatrix_parameters', dmatrix_parameters),
                    ('training_parameters', training_parameters),
                    ('predict_parameters', predict_parameters),
                    ('booster_parameters', booster_parameters))
        for name, value in provided:
            if value is not None and not isinstance(value, dict):
                raise SteppyToolkitError('XGBoost transformer: {} must be dict, '
                                         'got {} instead'.format(name, type(value)))

        self.dmatrix_parameters = dmatrix_parameters or {}
        self.training_parameters = training_parameters or {}
        self.predict_parameters = predict_parameters or {}
        self.booster_parameters = booster_parameters or {}

    def fit(self, X, y, X_valid, y_valid):
        """Train a booster on ``X``/``y``, evaluating on both the train and validation sets."""
        logger.info('XGBoost, train data shape        {}'.format(X.shape))
        logger.info('XGBoost, validation data shape   {}'.format(X_valid.shape))
        logger.info('XGBoost, train labels shape      {}'.format(y.shape))
        logger.info('XGBoost, validation labels shape {}'.format(y_valid.shape))

        train = xgb.DMatrix(data=X,
                            label=y,
                            **self.dmatrix_parameters)
        valid = xgb.DMatrix(data=X_valid,
                            label=y_valid,
                            **self.dmatrix_parameters)
        self.estimator = xgb.train(params=self.booster_parameters,
                                   dtrain=train,
                                   evals=[(train, 'train'), (valid, 'valid')],
                                   **self.training_parameters)
        return self

    def transform(self, X, y=None, **kwargs):
        """Return ``{'prediction': ...}`` with the fitted booster's predictions for ``X``."""
        X_DMatrix = xgb.DMatrix(X, label=y, **self.dmatrix_parameters)
        prediction = self.estimator.predict(X_DMatrix, **self.predict_parameters)
        return {'prediction': prediction}

    def load(self, filepath):
        """Create a booster from ``booster_parameters`` and load the model stored at ``filepath``."""
        self.estimator = xgb.Booster(params=self.booster_parameters)
        self.estimator.load_model(filepath)
        return self

    def persist(self, filepath):
        """Save the fitted booster to ``filepath``."""
        self.estimator.save_model(filepath)
92 | 


--------------------------------------------------------------------------------
/toolkit/keras_transformers/embeddings.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | 
  3 | from gensim.models import KeyedVectors
  4 | from sklearn.externals import joblib
  5 | 
  6 | from steppy.base import BaseTransformer
  7 | 
  8 | 
  9 | class EmbeddingsMatrix(BaseTransformer):
 10 |     def __init__(self, pretrained_filepath, max_features, embedding_size):
 11 |         super().__init__()
 12 |         self.pretrained_filepath = pretrained_filepath
 13 |         self.max_features = max_features
 14 |         self.embedding_size = embedding_size
 15 | 
 16 |     def fit(self, tokenizer):
 17 |         self.embedding_matrix = self._get_embedding_matrix(tokenizer)
 18 |         return self
 19 | 
 20 |     def transform(self, tokenizer):
 21 |         return {'embeddings_matrix': self.embedding_matrix}
 22 | 
 23 |     def _get_embedding_matrix(self, tokenizer):
 24 |         return NotImplementedError
 25 | 
 26 |     def persist(self, filepath):
 27 |         joblib.dump(self.embedding_matrix, filepath)
 28 | 
 29 |     def load(self, filepath):
 30 |         self.embedding_matrix = joblib.load(filepath)
 31 |         return self
 32 | 
 33 | 
 34 | class GloveEmbeddingsMatrix(EmbeddingsMatrix):
 35 |     def _get_embedding_matrix(self, tokenizer):
 36 |         return load_glove_embeddings(self.pretrained_filepath,
 37 |                                      tokenizer,
 38 |                                      self.max_features,
 39 |                                      self.embedding_size)
 40 | 
 41 | 
 42 | class Word2VecEmbeddingsMatrix(EmbeddingsMatrix):
 43 |     def _get_embedding_matrix(self, tokenizer):
 44 |         return load_word2vec_embeddings(self.pretrained_filepath,
 45 |                                         tokenizer,
 46 |                                         self.max_features,
 47 |                                         self.embedding_size)
 48 | 
 49 | 
 50 | class FastTextEmbeddingsMatrix(EmbeddingsMatrix):
 51 |     def _get_embedding_matrix(self, tokenizer):
 52 |         return load_fasttext_embeddings(self.pretrained_filepath,
 53 |                                         tokenizer,
 54 |                                         self.max_features,
 55 |                                         self.embedding_size)
 56 | 
 57 | 
 58 | def load_glove_embeddings(filepath, tokenizer, max_features, embedding_size):
 59 |     embeddings_index = dict()
 60 |     with open(filepath) as f:
 61 |         for line in f:
 62 |             # Note: use split(' ') instead of split() if you get an error.
 63 |             values = line.split(' ')
 64 |             word = values[0]
 65 |             coefs = np.asarray(values[1:], dtype='float32')
 66 |             embeddings_index[word] = coefs
 67 | 
 68 |     all_embs = np.stack(embeddings_index.values())
 69 |     emb_mean, emb_std = all_embs.mean(), all_embs.std()
 70 | 
 71 |     word_index = tokenizer.word_index
 72 |     nb_words = min(max_features, len(word_index))
 73 |     embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embedding_size))
 74 |     for word, i in word_index.items():
 75 |         if i >= max_features:
 76 |             continue
 77 |         embedding_vector = embeddings_index.get(word)
 78 |         if embedding_vector is not None:
 79 |             embedding_matrix[i] = embedding_vector
 80 |     return embedding_matrix
 81 | 
 82 | 
 83 | def load_word2vec_embeddings(filepath, tokenizer, max_features, embedding_size):
 84 |     model = KeyedVectors.load_word2vec_format(filepath, binary=True)
 85 | 
 86 |     emb_mean, emb_std = model.wv.syn0.mean(), model.wv.syn0.std()
 87 | 
 88 |     word_index = tokenizer.word_index
 89 |     nb_words = min(max_features, len(word_index))
 90 |     embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embedding_size))
 91 |     for word, i in word_index.items():
 92 |         if i >= max_features:
 93 |             continue
 94 |         try:
 95 |             embedding_vector = model[word]
 96 |             embedding_matrix[i] = embedding_vector
 97 |         except KeyError:
 98 |             continue
 99 |     return embedding_matrix
100 | 
101 | 
def load_fasttext_embeddings(filepath, tokenizer, max_features, embedding_size):
    """Build an embedding matrix for ``tokenizer.word_index`` from a fastText text file.

    The first line of the file is skipped, and so is every entry whose vector
    length differs from ``embedding_size``.

    Args:
        filepath: path to a text file with one ``word v1 v2 ...`` entry per line.
        tokenizer: object exposing ``word_index``, a mapping word -> integer index.
        max_features: upper bound on the number of matrix rows.
        embedding_size: expected vector length and number of matrix columns.

    Returns:
        Array of shape ``(min(max_features, len(word_index)), embedding_size)``.
        Rows of words found in the file hold the pretrained vector; all other
        rows are drawn from a normal distribution with the mean and standard
        deviation of the pretrained values.
    """
    embeddings_index = {}
    with open(filepath, encoding='utf-8') as f:
        for line_number, line in enumerate(f):
            if line_number == 0:
                continue
            values = line.strip().split(' ')
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            if coefs.shape[0] != embedding_size:
                continue
            embeddings_index[word] = coefs

    # np.stack needs a real sequence; a dict view is not accepted by current NumPy.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()

    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embedding_size))
    for word, i in word_index.items():
        # nb_words <= max_features, so this also enforces the max_features cap
        # and keeps indices equal to len(word_index) from overflowing the matrix.
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix
129 | 


--------------------------------------------------------------------------------
/toolkit/lightgbm_transformers/models.py:
--------------------------------------------------------------------------------
  1 | from toolkit.toolkit_base import SteppyToolkitError
  2 | 
  3 | try:
  4 |     import lightgbm as lgb
  5 |     import numpy as np
  6 |     import pandas as pd
  7 |     from sklearn.externals import joblib
  8 |     from steppy.base import BaseTransformer
  9 |     from steppy.utils import get_logger
 10 | except ImportError as e:
 11 |     msg = 'SteppyToolkitError: you have missing modules. Install requirements specific to lightgbm_transformers.' \
 12 |           'Use this file: toolkit/lightgbm_transformers/requirements.txt'
 13 |     raise SteppyToolkitError(msg) from e
 14 | 
 15 | logger = get_logger()
 16 | 
 17 | 
 18 | class LightGBM(BaseTransformer):
 19 |     """
 20 |     Accepts three dictionaries that reflects LightGBM API:
 21 |         - booster_parameters  -> parameters of the Booster
 22 |           See: https://lightgbm.readthedocs.io/en/latest/Parameters.html
 23 |         - dataset_parameters  -> parameters of the lightgbm.Dataset class
 24 |           See: https://lightgbm.readthedocs.io/en/latest/Python-API.html#data-structure-api
 25 |         - training_parameters -> parameters of the lightgbm.train function
 26 |           See: https://lightgbm.readthedocs.io/en/latest/Python-API.html#training-api
 27 |     """
 28 |     def __init__(self,
 29 |                  booster_parameters=None,
 30 |                  dataset_parameters=None,
 31 |                  training_parameters=None):
 32 |         super().__init__()
 33 |         logger.info('initializing LightGBM transformer')
 34 |         if booster_parameters is not None:
 35 |             isinstance(booster_parameters, dict), 'LightGBM transformer: booster_parameters must be dict, ' \
 36 |                                                   'got {} instead'.format(type(booster_parameters))
 37 |         if dataset_parameters is not None:
 38 |             isinstance(dataset_parameters, dict), 'LightGBM transformer: dataset_parameters must be dict, ' \
 39 |                                                   'got {} instead'.format(type(dataset_parameters))
 40 |         if training_parameters is not None:
 41 |             isinstance(training_parameters, dict), 'LightGBM transformer: training_parameters must be dict, ' \
 42 |                                                    'got {} instead'.format(type(training_parameters))
 43 | 
 44 |         self.booster_parameters = booster_parameters or {}
 45 |         self.dataset_parameters = dataset_parameters or {}
 46 |         self.training_parameters = training_parameters or {}
 47 | 
 48 |     def fit(self, X, y, X_valid, y_valid):
 49 |         self._check_target_shape_and_type(y, 'y')
 50 |         self._check_target_shape_and_type(y_valid, 'y_valid')
 51 |         y = self._format_target(y)
 52 |         y_valid = self._format_target(y_valid)
 53 | 
 54 |         logger.info('LightGBM transformer, train data shape        {}'.format(X.shape))
 55 |         logger.info('LightGBM transformer, validation data shape   {}'.format(X_valid.shape))
 56 |         logger.info('LightGBM transformer, train labels shape      {}'.format(y.shape))
 57 |         logger.info('LightGBM transformer, validation labels shape {}'.format(y_valid.shape))
 58 | 
 59 |         data_train = lgb.Dataset(data=X,
 60 |                                  label=y,
 61 |                                  **self.dataset_parameters)
 62 |         data_valid = lgb.Dataset(data=X_valid,
 63 |                                  label=y_valid,
 64 |                                  **self.dataset_parameters)
 65 |         self.estimator = lgb.train(params=self.booster_parameters,
 66 |                                    train_set=data_train,
 67 |                                    valid_sets=[data_train, data_valid],
 68 |                                    valid_names=['data_train', 'data_valid'],
 69 |                                    **self.training_parameters)
 70 |         return self
 71 | 
 72 |     def transform(self, X, **kwargs):
 73 |         prediction = self.estimator.predict(X)
 74 |         return {'prediction': prediction}
 75 | 
 76 |     def load(self, filepath):
 77 |         self.estimator = joblib.load(filepath)
 78 |         return self
 79 | 
 80 |     def persist(self, filepath):
 81 |         joblib.dump(self.estimator, filepath)
 82 | 
 83 |     def _check_target_shape_and_type(self, target, name):
 84 |         if not any([isinstance(target, obj_type) for obj_type in [pd.Series, np.ndarray, list]]):
 85 |             msg = '"target" must be "numpy.ndarray" or "Pandas.Series" or "list", got {} instead.'.format(type(target))
 86 |             raise SteppyToolkitError(msg)
 87 |         if not isinstance(target, list):
 88 |             assert len(target.shape) == 1, '"{}" must be 1-D. It is {}-D instead.'.format(name, len(target.shape))
 89 | 
 90 |     def _format_target(self, target):
 91 |         if isinstance(target, pd.Series):
 92 |             return target.values
 93 |         elif isinstance(target, np.ndarray):
 94 |             return target
 95 |         elif isinstance(target, list):
 96 |             return np.array(target)
 97 |         else:
 98 |             raise TypeError(
 99 |                 '"target" must be "numpy.ndarray" or "Pandas.Series" or "list", got {} instead.'.format(
100 |                     type(target)))
101 | 


--------------------------------------------------------------------------------
/toolkit/sklearn_transformers/models.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | 
  3 | import sklearn.linear_model as lr
  4 | from sklearn import ensemble
  5 | from sklearn import svm
  6 | from sklearn.externals import joblib
  7 | 
  8 | from steppy.base import BaseTransformer
  9 | from steppy.utils import get_logger
 10 | 
 11 | logger = get_logger()
 12 | 
 13 | 
class SklearnBaseTransformer(BaseTransformer):
    """Base wrapper that holds an estimator and handles its fitting and persistence."""

    def __init__(self, estimator):
        super().__init__()
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped estimator on ``X`` and ``y`` and return ``self``."""
        # Extra keyword arguments are accepted but not forwarded to the estimator.
        self.estimator.fit(X, y)
        return self

    def persist(self, filepath):
        """Dump the wrapped estimator to ``filepath``."""
        joblib.dump(self.estimator, filepath)

    def load(self, filepath):
        """Replace the wrapped estimator with the one stored at ``filepath`` and return ``self``."""
        self.estimator = joblib.load(filepath)
        return self
 29 | 
 30 | 
 31 | class SklearnClassifier(SklearnBaseTransformer):
 32 |     RESULT_KEY = 'prediction'
 33 | 
 34 |     def transform(self, X, y=None, **kwargs):
 35 |         prediction = self.estimator.predict_proba(X)
 36 |         return {SklearnClassifier.RESULT_KEY: prediction}
 37 | 
 38 | 
 39 | class SklearnRegressor(SklearnBaseTransformer):
 40 |     RESULT_KEY = 'prediction'
 41 | 
 42 |     def transform(self, X, y=None, **kwargs):
 43 |         prediction = self.estimator.predict(X)
 44 |         return {SklearnRegressor.RESULT_KEY: prediction}
 45 | 
 46 | 
 47 | class SklearnTransformer(SklearnBaseTransformer):
 48 |     RESULT_KEY = 'transformed'
 49 | 
 50 |     def transform(self, X, y=None, **kwargs):
 51 |         transformed = self.estimator.transform(X)
 52 |         return {SklearnTransformer.RESULT_KEY: transformed}
 53 | 
 54 | 
 55 | class SklearnPipeline(SklearnBaseTransformer):
 56 |     RESULT_KEY = 'transformed'
 57 | 
 58 |     def transform(self, X, y=None, **kwargs):
 59 |         transformed = self.estimator.transform(X)
 60 |         return {SklearnPipeline.RESULT_KEY: transformed}
 61 | 
 62 | 
 63 | class MultilabelEstimators(BaseTransformer):
 64 |     def __init__(self, label_nr, **kwargs):
 65 |         super().__init__()
 66 |         self.label_nr = label_nr
 67 |         self.estimators = self._get_estimators(**kwargs)
 68 | 
 69 |     @property
 70 |     def estimator(self):
 71 |         return NotImplementedError
 72 | 
 73 |     def _get_estimators(self, **kwargs):
 74 |         estimators = []
 75 |         for i in range(self.label_nr):
 76 |             estimators.append((i, self.estimator(**kwargs)))
 77 |         return estimators
 78 | 
 79 |     def fit(self, X, y, **kwargs):
 80 |         for i, estimator in self.estimators:
 81 |             logger.info('fitting estimator {}'.format(i))
 82 |             estimator.fit(X, y[:, i])
 83 |         return self
 84 | 
 85 |     def transform(self, X, y=None, **kwargs):
 86 |         predictions = []
 87 |         for i, estimator in self.estimators:
 88 |             prediction = estimator.predict_proba(X)
 89 |             predictions.append(prediction)
 90 |         predictions = np.stack(predictions, axis=0)
 91 |         predictions = predictions[:, :, 1].transpose()
 92 |         return {'predicted_probability': predictions}
 93 | 
 94 |     def load(self, filepath):
 95 |         params = joblib.load(filepath)
 96 |         self.label_nr = params['label_nr']
 97 |         self.estimators = params['estimators']
 98 |         return self
 99 | 
100 |     def persist(self, filepath):
101 |         params = {'label_nr': self.label_nr,
102 |                   'estimators': self.estimators}
103 |         joblib.dump(params, filepath)
104 | 
105 | 
class LogisticRegressionMultilabel(MultilabelEstimators):
    """``MultilabelEstimators`` variant that uses ``LogisticRegression`` as its estimator class."""

    @property
    def estimator(self):
        # Returns the estimator class itself, not an instance.
        return lr.LogisticRegression
110 | 
111 | 
class SVCMultilabel(MultilabelEstimators):
    """``MultilabelEstimators`` variant that uses ``svm.SVC`` as its estimator class."""

    # NOTE(review): MultilabelEstimators.transform calls predict_proba; SVC presumably
    # needs probability=True passed through the constructor kwargs - confirm.
    @property
    def estimator(self):
        # Returns the estimator class itself, not an instance.
        return svm.SVC
116 | 
117 | 
class LinearSVCMultilabel(MultilabelEstimators):
    """``MultilabelEstimators`` variant that uses ``LinearSVC_proba`` as its estimator class."""

    @property
    def estimator(self):
        # Returns the estimator class itself, not an instance.
        return LinearSVC_proba
122 | 
123 | 
class RandomForestMultilabel(MultilabelEstimators):
    """``MultilabelEstimators`` variant that uses ``RandomForestClassifier`` as its estimator class."""

    @property
    def estimator(self):
        # Returns the estimator class itself, not an instance.
        return ensemble.RandomForestClassifier
128 | 
129 | 
class LinearSVC_proba(svm.LinearSVC):
    """``svm.LinearSVC`` extended with a sigmoid-based ``predict_proba``.

    The decision-function value of each sample is squashed with the logistic
    sigmoid and reported as the positive-class probability.
    """

    def _platt_func(self, x):
        """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of a scalar or array.

        Written as ``exp(-log(1 + exp(-x)))`` so that strongly negative inputs do
        not overflow inside ``exp``.
        """
        return np.exp(-np.logaddexp(0.0, -x))

    def predict_proba(self, X):
        """Return an ``(n, 2)`` array of ``[negative, positive]`` pseudo-probabilities.

        Args:
            X: samples, passed unchanged to ``decision_function``.

        Returns:
            numpy array with one row per decision value; column 1 is the sigmoid
            of the decision value and column 0 is its complement.
        """
        # assumes a binary problem, i.e. one decision value per sample - TODO confirm
        raw_predictions = np.asarray(self.decision_function(X), dtype=float)
        # The previous code divided this single column by its own row sum, which
        # turned every positive probability into exactly 1.0.
        prob_positive = self._platt_func(raw_predictions).reshape(-1, 1)
        prob_negative = 1.0 - prob_positive
        return np.hstack([prob_negative, prob_positive])
142 | 
143 | 
def make_transformer(estimator, mode='classifier'):
    """Wrap ``estimator`` in the wrapper class selected by ``mode``.

    Args:
        estimator: object passed unchanged to the wrapper's constructor.
        mode: one of ``'classifier'``, ``'regressor'``, ``'transformer'`` or
            ``'pipeline'``.

    Returns:
        A ``SklearnClassifier``, ``SklearnRegressor``, ``SklearnTransformer`` or
        ``SklearnPipeline`` instance, depending on ``mode``.

    Raises:
        NotImplementedError: if ``mode`` is none of the supported values.
    """
    if mode == 'classifier':
        return SklearnClassifier(estimator)
    if mode == 'regressor':
        return SklearnRegressor(estimator)
    if mode == 'transformer':
        return SklearnTransformer(estimator)
    if mode == 'pipeline':
        return SklearnPipeline(estimator)
    # The message previously omitted the supported 'pipeline' mode.
    raise NotImplementedError("""Only classifier, regressor, transformer and pipeline modes are available""")
157 | 


--------------------------------------------------------------------------------
/toolkit/preprocessing/text.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import re
  3 | import string
  4 | 
  5 | import nltk
  6 | import numpy as np
  7 | import pandas as pd
  8 | from nltk.corpus import stopwords
  9 | from nltk.stem.wordnet import WordNetLemmatizer
 10 | from nltk.tokenize import TweetTokenizer
 11 | from sklearn.externals import joblib
 12 | from steppy.base import BaseTransformer
 13 | 
# Module-level NLTK helpers shared by the text transformers in this module.
lem = WordNetLemmatizer()
tokenizer = TweetTokenizer()

# Import-time side effect: the 'wordnet' and 'stopwords' NLTK resources are
# requested every time this module is imported.
nltk.download('wordnet')
nltk.download('stopwords')

# English stop words as a set for O(1) membership tests.
eng_stopwords = set(stopwords.words("english"))
# NOTE(review): this path is resolved against the current working directory and
# points at 'steps/resources', while the repository tree lists the file under
# 'toolkit/resources' - confirm the intended location.
with open('steps/resources/apostrophes.json', 'r') as f:
    # Mapping consulted token-by-token in TextCleaner._apostrophes.
    APOSTROPHES_WORDS = json.load(f)
 23 | 
 24 | 
 25 | class WordListFilter(BaseTransformer):
 26 |     def __init__(self, word_list_filepath):
 27 |         super().__init__()
 28 |         self.word_set = self._read_data(word_list_filepath)
 29 | 
 30 |     def transform(self, X):
 31 |         X = self._transform(X)
 32 |         return {'X': X}
 33 | 
 34 |     def _transform(self, X):
 35 |         X = pd.DataFrame(X, columns=['text']).astype(str)
 36 |         X['text'] = X['text'].apply(self._filter_words)
 37 |         return X['text'].values
 38 | 
 39 |     def _filter_words(self, x):
 40 |         x = x.lower()
 41 |         x = ' '.join([w for w in x.split() if w in self.word_set])
 42 |         return x
 43 | 
 44 |     def _read_data(self, filepath):
 45 |         with open(filepath, 'r+') as f:
 46 |             data = f.read()
 47 |         return set(data.split('\n'))
 48 | 
 49 |     def load(self, filepath):
 50 |         return self
 51 | 
 52 |     def persist(self, filepath):
 53 |         joblib.dump({}, filepath)
 54 | 
 55 | 
 56 | class TextCleaner(BaseTransformer):
 57 |     def __init__(self,
 58 |                  drop_punctuation,
 59 |                  drop_newline,
 60 |                  drop_multispaces,
 61 |                  all_lower_case,
 62 |                  fill_na_with,
 63 |                  deduplication_threshold,
 64 |                  apostrophes,
 65 |                  use_stopwords):
 66 |         super().__init__()
 67 |         self.drop_punctuation = drop_punctuation
 68 |         self.drop_newline = drop_newline
 69 |         self.drop_multispaces = drop_multispaces
 70 |         self.all_lower_case = all_lower_case
 71 |         self.fill_na_with = fill_na_with
 72 |         self.deduplication_threshold = deduplication_threshold
 73 |         self.apostrophes = apostrophes
 74 |         self.use_stopwords = use_stopwords
 75 | 
 76 |     def transform(self, X):
 77 |         X = pd.DataFrame(X, columns=['text']).astype(str)
 78 |         X['text'] = X['text'].apply(self._transform)
 79 |         if self.fill_na_with:
 80 |             X['text'] = X['text'].fillna(self.fill_na_with).values
 81 |         return {'X': X['text'].values}
 82 | 
 83 |     def _transform(self, x):
 84 |         if self.all_lower_case:
 85 |             x = self._lower(x)
 86 |         if self.drop_punctuation:
 87 |             x = self._remove_punctuation(x)
 88 |         if self.drop_newline:
 89 |             x = self._remove_newline(x)
 90 |         if self.drop_multispaces:
 91 |             x = self._substitute_multiple_spaces(x)
 92 |         if self.deduplication_threshold is not None:
 93 |             x = self._deduplicate(x)
 94 |         if self.apostrophes:
 95 |             x = self._apostrophes(x)
 96 |         if self.use_stopwords:
 97 |             x = self._use_stopwords(x)
 98 |         return x
 99 | 
100 |     def _use_stopwords(self, x):
101 |         words = tokenizer.tokenize(x)
102 |         words = [w for w in words if not w in eng_stopwords]
103 |         x = " ".join(words)
104 |         return x
105 | 
106 |     def _apostrophes(self, x):
107 |         words = tokenizer.tokenize(x)
108 |         words = [APOSTROPHES_WORDS[word] if word in APOSTROPHES_WORDS else word for word in words]
109 |         words = [lem.lemmatize(word, "v") for word in words]
110 |         words = [w for w in words if not w in eng_stopwords]
111 |         x = " ".join(words)
112 |         return x
113 | 
114 |     def _lower(self, x):
115 |         return x.lower()
116 | 
117 |     def _remove_punctuation(self, x):
118 |         return re.sub(r'[^\w\s]', ' ', x)
119 | 
120 |     def _remove_newline(self, x):
121 |         x = x.replace('\n', ' ')
122 |         x = x.replace('\n\n', ' ')
123 |         return x
124 | 
125 |     def _substitute_multiple_spaces(self, x):
126 |         return ' '.join(x.split())
127 | 
128 |     def _deduplicate(self, x):
129 |         word_list = x.split()
130 |         num_words = len(word_list)
131 |         if num_words == 0:
132 |             return x
133 |         else:
134 |             num_unique_words = len(set(word_list))
135 |             unique_ratio = num_words / num_unique_words
136 |             if unique_ratio > self.deduplication_threshold:
137 |                 x = ' '.join(x.split()[:num_unique_words])
138 |             return x
139 | 
140 |     def load(self, filepath):
141 |         params = joblib.load(filepath)
142 |         self.drop_punctuation = params['drop_punctuation']
143 |         self.all_lower_case = params['all_lower_case']
144 |         self.fill_na_with = params['fill_na_with']
145 |         return self
146 | 
147 |     def persist(self, filepath):
148 |         params = {'drop_punctuation': self.drop_punctuation,
149 |                   'all_lower_case': self.all_lower_case,
150 |                   'fill_na_with': self.fill_na_with,
151 |                   }
152 |         joblib.dump(params, filepath)
153 | 
154 | 
class TextCounter(BaseTransformer):
    """Turn each text into a row of simple count-based features.

    The returned frame holds the counts produced by ``_transform`` plus the
    derived columns ``caps_vs_length``, ``num_symbols``, ``num_words``,
    ``num_unique_words``, ``words_vs_unique`` and ``mean_word_len``; the raw
    ``text`` column is dropped and missing values are replaced by ``0.0``.
    """

    def transform(self, X):
        """Compute the feature frame for ``X``; returns ``{'X': DataFrame}``."""
        X = pd.DataFrame(X, columns=['text']).astype(str)
        # _transform returns a Series per text, which expands into one column per feature.
        X = X['text'].apply(self._transform)
        X['caps_vs_length'] = self._caps_vs_length(X)
        X['num_symbols'] = X['text'].apply(lambda comment: sum(comment.count(w) for w in '*&$%'))
        X['num_words'] = X['text'].apply(lambda comment: len(comment.split()))
        X['num_unique_words'] = X['text'].apply(lambda comment: len(set(comment.split())))
        X['words_vs_unique'] = self._words_vs_unique(X)
        X['mean_word_len'] = X['text'].apply(self._mean_word_len)
        X.drop('text', axis=1, inplace=True)
        X.fillna(0.0, inplace=True)
        return {'X': X}

    def _transform(self, x):
        """Return the basic counts of the single string ``x`` as a ``pd.Series``."""
        features = {
            'text': x,
            'char_count': char_count(x),
            'word_count': word_count(x),
            'punctuation_count': punctuation_count(x),
            'upper_case_count': upper_case_count(x),
            'lower_case_count': lower_case_count(x),
            'digit_count': digit_count(x),
            'space_count': space_count(x),
            'newline_count': newline_count(x),
        }
        return pd.Series(features)

    def _caps_vs_length(self, X):
        """Per-row ratio of upper-case characters to all characters (0 for empty texts)."""
        # Float division yields NaN for 0/0 instead of raising. The former row-wise
        # version raised ZeroDivisionError on the first empty text and its handler
        # then zeroed the feature for every row.
        upper = X['upper_case_count'].astype(float)
        chars = X['char_count'].astype(float)
        return (upper / chars).fillna(0.0)

    def _words_vs_unique(self, X):
        """Per-row ratio of unique words to words; NaN (filled later) when there are no words."""
        return X['num_unique_words'].astype(float) / X['num_words'].astype(float)

    @staticmethod
    def _mean_word_len(x):
        """Mean length of the whitespace-separated words of ``x``; NaN when there are none."""
        lengths = [len(w) for w in str(x).split()]
        # Avoid np.mean([]) which only produces NaN together with a RuntimeWarning.
        return np.mean(lengths) if lengths else np.nan

    def load(self, filepath):
        """Nothing is restored from disk; returns ``self``."""
        return self

    def persist(self, filepath):
        """Write an empty placeholder dict to ``filepath``."""
        joblib.dump({}, filepath)
199 | 
200 | 
def char_count(x):
    """Return the length of ``x``."""
    n_chars = len(x)
    return n_chars
203 | 
204 | 
def word_count(x):
    """Return the number of whitespace-separated tokens in ``x``."""
    tokens = x.split()
    return len(tokens)
207 | 
208 | 
def newline_count(x):
    """Return how many newline characters ``x`` contains."""
    newline = '\n'
    n_newlines = x.count(newline)
    return n_newlines
211 | 
212 | 
def upper_case_count(x):
    """Return the number of characters of ``x`` for which ``isupper()`` is true."""
    return sum(1 for ch in x if ch.isupper())
215 | 
216 | 
def lower_case_count(x):
    """Return the number of characters of ``x`` for which ``islower()`` is true."""
    return sum(1 for ch in x if ch.islower())
219 | 
220 | 
def digit_count(x):
    """Return the number of characters of ``x`` for which ``isdigit()`` is true."""
    return sum(1 for ch in x if ch.isdigit())
223 | 
224 | 
def space_count(x):
    """Return the number of characters of ``x`` for which ``isspace()`` is true."""
    return sum(1 for ch in x if ch.isspace())
227 | 
228 | 
def punctuation_count(x):
    """Return how many characters of ``x`` are in ``string.punctuation``."""
    punctuation_chars = string.punctuation
    return occurrence(x, punctuation_chars)


def occurrence(s1, s2):
    """Return the number of elements of ``s1`` that are contained in ``s2``."""
    hits = 0
    for item in s1:
        if item in s2:
            hits += 1
    return hits
235 | 


--------------------------------------------------------------------------------
/toolkit/pytorch_transformers/models.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import shutil
  3 | from functools import partial
  4 | 
  5 | import numpy as np
  6 | import torch
  7 | import torch.nn as nn
  8 | from steppy.base import BaseTransformer
  9 | from steppy.utils import get_logger
 10 | from torch.autograd import Variable
 11 | from torch.nn import init
 12 | 
 13 | from toolkit.pytorch_transformers.utils import persist_torch_model
 14 | 
# Module-level logger obtained from steppy's get_logger helper.
logger = get_logger()
 16 | 
 17 | 
 18 | class Model(BaseTransformer):
 19 |     def __init__(self,
 20 |                  architecture_config,
 21 |                  training_config,
 22 |                  callbacks_config):
 23 |         super().__init__()
 24 |         self.architecture_config = architecture_config
 25 |         self.training_config = training_config
 26 |         self.callbacks_config = callbacks_config
 27 | 
 28 |         self.model = None
 29 |         self.optimizer = None
 30 |         self.loss_function = None
 31 |         self.callbacks = None
 32 |         self.validation_loss = {}
 33 | 
 34 |     @property
 35 |     def output_names(self):
 36 |         return [name for (name, func, weight) in self.loss_function]
 37 | 
 38 |     def _initialize_model_weights(self):
 39 |         logger.info('initializing model weights...')
 40 |         weights_init_config = self.architecture_config['weights_init']
 41 | 
 42 |         if weights_init_config['function'] == 'normal':
 43 |             weights_init_func = partial(init_weights_normal, **weights_init_config['params'])
 44 |         elif weights_init_config['function'] == 'xavier':
 45 |             weights_init_func = init_weights_xavier
 46 |         elif weights_init_config['function'] == 'he':
 47 |             weights_init_func = init_weights_he
 48 |         else:
 49 |             raise NotImplementedError
 50 | 
 51 |         self.model.apply(weights_init_func)
 52 | 
 53 |     def fit(self, datagen, validation_datagen=None):
 54 |         self._initialize_model_weights()
 55 | 
 56 |         if torch.cuda.is_available():
 57 |             self.model = self.model.cuda()
 58 | 
 59 |         self.callbacks.set_params(self, validation_datagen=validation_datagen)
 60 |         self.callbacks.on_train_begin()
 61 | 
 62 |         batch_gen, steps = datagen
 63 |         for epoch_id in range(self.training_config['epochs']):
 64 |             self.callbacks.on_epoch_begin()
 65 |             for batch_id, data in enumerate(batch_gen):
 66 |                 self.callbacks.on_batch_begin()
 67 |                 metrics = self._fit_loop(data)
 68 |                 self.callbacks.on_batch_end(metrics=metrics)
 69 |                 if batch_id == steps:
 70 |                     break
 71 |             self.callbacks.on_epoch_end()
 72 |             if self.callbacks.training_break():
 73 |                 break
 74 |         self.callbacks.on_train_end()
 75 |         return self
 76 | 
 77 |     def _fit_loop(self, data):
 78 |         X = data[0]
 79 |         targets_tensors = data[1:]
 80 | 
 81 |         if torch.cuda.is_available():
 82 |             X = Variable(X).cuda()
 83 |             targets_var = []
 84 |             for target_tensor in targets_tensors:
 85 |                 targets_var.append(Variable(target_tensor).cuda())
 86 |         else:
 87 |             X = Variable(X)
 88 |             targets_var = []
 89 |             for target_tensor in targets_tensors:
 90 |                 targets_var.append(Variable(target_tensor))
 91 | 
 92 |         self.optimizer.zero_grad()
 93 |         outputs_batch = self.model(X)
 94 |         partial_batch_losses = {}
 95 | 
 96 |         assert len(targets_tensors) == len(outputs_batch) == len(self.loss_function),\
 97 |             '''Number of targets, model outputs and elements of loss function must equal.
 98 |             You have n_targets={0}, n_model_outputs={1}, n_loss_function_elements={2}.
 99 |             The order of elements must also be preserved.'''.format(len(targets_tensors),
100 |                                                                     len(outputs_batch),
101 |                                                                     len(self.loss_function))
102 | 
103 |         if len(self.output_names) == 1:
104 |             for (name, loss_function, weight), target in zip(self.loss_function, targets_var):
105 |                 batch_loss = loss_function(outputs_batch, target) * weight
106 |         else:
107 |             for (name, loss_function, weight), output, target in zip(self.loss_function, outputs_batch, targets_var):
108 |                 partial_batch_losses[name] = loss_function(output, target) * weight
109 |             batch_loss = sum(partial_batch_losses.values())
110 |         partial_batch_losses['sum'] = batch_loss
111 |         batch_loss.backward()
112 |         self.optimizer.step()
113 | 
114 |         return partial_batch_losses
115 | 
116 |     def _transform(self, datagen, validation_datagen=None):
117 |         self.model.eval()
118 |         batch_gen, steps = datagen
119 |         outputs = {}
120 |         for batch_id, data in enumerate(batch_gen):
121 |             if isinstance(data, list):
122 |                 X = data[0]
123 |             else:
124 |                 X = data
125 | 
126 |             if torch.cuda.is_available():
127 |                 X = Variable(X, volatile=True).cuda()
128 |             else:
129 |                 X = Variable(X, volatile=True)
130 | 
131 |             outputs_batch = self.model(X)
132 |             if len(self.output_names) == 1:
133 |                 outputs.setdefault(self.output_names[0], []).append(outputs_batch.data.cpu().numpy())
134 |             else:
135 |                 for name, output in zip(self.output_names, outputs_batch):
136 |                     output_ = output.data.cpu().numpy()
137 |                     outputs.setdefault(name, []).append(output_)
138 |             if batch_id == steps:
139 |                 break
140 |         self.model.train()
141 |         outputs = {'{}_prediction'.format(name): np.vstack(outputs_) for name, outputs_ in outputs.items()}
142 |         return outputs
143 | 
144 |     def transform(self, datagen, validation_datagen=None):
145 |         predictions = self._transform(datagen, validation_datagen)
146 |         return NotImplementedError
147 | 
148 |     def load(self, filepath):
149 |         self.model.eval()
150 | 
151 |         if torch.cuda.is_available():
152 |             self.model.cpu()
153 |             self.model.load_state_dict(torch.load(filepath))
154 |             self.model.cuda()
155 |         else:
156 |             self.model.load_state_dict(torch.load(filepath, map_location=lambda storage, loc: storage))
157 |         return self
158 | 
159 |     def persist(self, filepath):
160 |         checkpoint_callback = self.callbacks_config.get('model_checkpoint')
161 |         if checkpoint_callback:
162 |             checkpoint_filepath = checkpoint_callback['filepath']
163 |             if os.path.exists(checkpoint_filepath):
164 |                 shutil.copyfile(checkpoint_filepath, filepath)
165 |             else:
166 |                 persist_torch_model(self.model, filepath)
167 |         else:
168 |             persist_torch_model(self.model, filepath)
169 | 
170 | 
class PyTorchBasic(nn.Module):
    """Base module for networks made of a ``features`` part and a ``classifier`` part.

    ``forward`` reads ``self.features``, ``self.flat_features`` and
    ``self.classifier``; none of them is defined here.
    """

    def _flatten_features(self, in_size, features):
        """Return the number of values ``features`` produces for one input of size ``in_size``.

        A batch of one all-ones input is pushed through ``features`` and the
        sizes of all non-batch dimensions of the result are multiplied.
        """
        probe = Variable(torch.ones(1, *in_size))
        probe_output = features(probe)
        non_batch_dims = probe_output.size()[1:]
        return int(np.prod(non_batch_dims))

    def forward(self, x):
        """Apply ``features``, flatten to ``(-1, flat_features)`` and apply ``classifier``."""
        flattened = self.features(x).view(-1, self.flat_features)
        return self.classifier(flattened)

    def forward_target(self, x):
        """Alias that delegates to ``forward``."""
        return self.forward(x)
184 | 
185 | 
def init_weights_normal(model, mean, std_conv2d, std_linear):
    """Fill the weights of a ``Conv2d`` or ``Linear`` module with normal noise.

    Meant to be passed to ``nn.Module.apply``; modules of other types are left
    untouched and biases are not modified.

    Args:
        model: the module to initialise.
        mean: mean of the normal distribution for both module types.
        std_conv2d: standard deviation used for ``nn.Conv2d`` weights.
        std_linear: standard deviation used for ``nn.Linear`` weights.
    """
    # isinstance (as in the xavier/he initialisers) so that subclasses of the
    # layer types are initialised too; an exact type comparison skipped them.
    if isinstance(model, nn.Conv2d):
        model.weight.data.normal_(mean=mean, std=std_conv2d)
    if isinstance(model, nn.Linear):
        model.weight.data.normal_(mean=mean, std=std_linear)
191 | 
192 | 
def init_weights_xavier(model):
    """Xavier-normal initialise the weights of a ``Conv2d`` module and zero its bias.

    Meant to be passed to ``nn.Module.apply``; modules that are not ``nn.Conv2d``
    are left untouched.
    """
    if isinstance(model, nn.Conv2d):
        # Use the in-place (underscore) initialisers when this torch version has
        # them, otherwise the legacy names this module was written against.
        xavier_normal = getattr(init, 'xavier_normal_', None) or init.xavier_normal
        constant = getattr(init, 'constant_', None) or init.constant
        xavier_normal(model.weight)
        # Convolutions created with bias=False have no bias tensor to reset.
        if model.bias is not None:
            constant(model.bias, 0)
197 | 
198 | 
def init_weights_he(model):
    """Kaiming-normal initialise the weights of a ``Conv2d`` module and zero its bias.

    Meant to be passed to ``nn.Module.apply``; modules that are not ``nn.Conv2d``
    are left untouched.
    """
    if isinstance(model, nn.Conv2d):
        # Use the in-place (underscore) initialisers when this torch version has
        # them, otherwise the legacy names this module was written against.
        kaiming_normal = getattr(init, 'kaiming_normal_', None) or init.kaiming_normal
        constant = getattr(init, 'constant_', None) or init.constant
        kaiming_normal(model.weight)
        # Convolutions created with bias=False have no bias tensor to reset.
        if model.bias is not None:
            constant(model.bias, 0)
203 | 


--------------------------------------------------------------------------------
/toolkit/postprocessing.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | from sklearn.externals import joblib
  4 | 
  5 | from steppy.base import BaseTransformer
  6 | 
  7 | 
  8 | class ClassPredictor(BaseTransformer):
  9 |     def transform(self, prediction_proba):
 10 |         predictions_class = np.argmax(prediction_proba, axis=1)
 11 |         return {'y_pred': predictions_class}
 12 | 
 13 |     def load(self, filepath):
 14 |         return ClassPredictor()
 15 | 
 16 |     def persist(self, filepath):
 17 |         joblib.dump({}, filepath)
 18 | 
 19 | 
 20 | class PredictionAverage(BaseTransformer):
 21 |     def __init__(self, weights=None):
 22 |         super().__init__()
 23 |         self.weights = weights
 24 | 
 25 |     def transform(self, prediction_proba_list):
 26 |         if self.weights is not None:
 27 |             reshaped_weights = self._reshape_weights(prediction_proba_list.shape)
 28 |             prediction_proba_list *= reshaped_weights
 29 |             avg_pred = np.sum(prediction_proba_list, axis=0)
 30 |         else:
 31 |             avg_pred = np.mean(prediction_proba_list, axis=0)
 32 |         return {'prediction_probability': avg_pred}
 33 | 
 34 |     def load(self, filepath):
 35 |         params = joblib.load(filepath)
 36 |         self.weights = params['weights']
 37 |         return self
 38 | 
 39 |     def persist(self, filepath):
 40 |         joblib.dump({'weights': self.weights}, filepath)
 41 | 
 42 |     def _reshape_weights(self, prediction_shape):
 43 |         dim = len(prediction_shape)
 44 |         reshape_dim = (-1,) + tuple([1] * (dim - 1))
 45 |         reshaped_weights = np.array(self.weights).reshape(reshape_dim)
 46 |         return reshaped_weights
 47 | 
 48 | 
 49 | class PredictionAverageUnstack(BaseTransformer):
 50 |     def transform(self, prediction_probability, id_list):
 51 |         df = pd.DataFrame(prediction_probability)
 52 |         df['id'] = id_list
 53 |         avg_pred = df.groupby('id').mean().reset_index().drop(['id'], axis=1).values
 54 |         return {'prediction_probability': avg_pred}
 55 | 
 56 |     def load(self, filepath):
 57 |         return self
 58 | 
 59 |     def persist(self, filepath):
 60 |         joblib.dump({}, filepath)
 61 | 
 62 | 
 63 | class ProbabilityCalibration(BaseTransformer):
 64 |     def __init__(self, power):
 65 |         super().__init__()
 66 |         self.power = power
 67 | 
 68 |     def transform(self, predicted_probability):
 69 |         predicted_probability = np.array(predicted_probability) ** self.power
 70 |         return {'predicted_probability': predicted_probability}
 71 | 
 72 | 
 73 | class BlendingOptimizer(BaseTransformer):
 74 |     """Class for optimizing the weights in blending of different models' predictions.
 75 | 
 76 |     Args:
 77 |         metric (Callable): Callable metric function to optimize.
 78 |         maximize (bool): default True. Boolean indicating whether the `metric` needs to be maximized or minimized.
 79 |         power (float): default 1.0. Power to apply on each models' predictions before blending.
 80 |     Example:
 81 |         >>> from sklearn.metrics import mean_absolute_error
 82 |         >>> y         = [0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9]
 83 |         >>> p_model_1 = [0.11, 0.19, 0.25, 0.37, 0.55, 0.62, 0.78, 0.81, 0.94]
 84 |         >>> p_model_2 = [0.07, 0.21, 0.29, 0.33, 0.53, 0.54, 0.74, 0.74, 0.91]
 85 |         >>> preds = [p_model_1, p_model_2]
 86 |         >>> opt = BlendingOptimizer(metric=mean_absolute_error, maximize=False)
 87 |         >>> opt.fit(X=preds, y=y)
 88 |         >>> y_pred = opt.transform(X=preds)['y_pred']
 89 |         >>> print('MAE 1: {:0.3f}'.format(mean_absolute_error(y, p_model_1)))
 90 |         >>> print('MAE 2: {:0.3f}'.format(mean_absolute_error(y, p_model_2)))
 91 |         >>> print('MAE blended: {:0.3f}'.format(mean_absolute_error(y, y_pred)))
 92 |     """
 93 | 
 94 |     def __init__(self, metric, maximize=True, power=1.0):
 95 |         super().__init__()
 96 |         self.metric = metric
 97 |         self.maximize = maximize
 98 |         self._power = power
 99 |         self._score = None
100 |         self._weights = None
101 | 
102 |     def fit(self, X, y, step_size=0.1, init_weights=None, warm_start: bool=False):
103 |         """Fit the weights on the given predictions.
104 | 
105 |         Args:
106 |             X (array-like): Predictions of different models for the labels.
107 |             y (array-like): Labels.
108 |             step_size (float): Step size for optimizing the weights.
109 |                                Smaller step sizes most likely improve resulting score but increases training time.
110 |             init_weights (array-like): Initial weights for training.
111 |                                        When `warm_start` is used `init_weights` are ignored.
112 |             warm_start (bool): Continues training. Will only work when `fit` has been called with this object earlier.
113 |                                When `warm_start` is used `init_weights` are ignored.
114 |         Returns: self
115 |         """
116 |         assert len(np.shape(X)) == 2, 'X must be 2-dimensional, got {}-D instead.'.format(len(np.shape(X)))
117 |         assert np.shape(X)[0] > 1, 'X must contain predictions from at least two models. ' \
118 |                                    'Got {} instead'.format(np.shape(X)[0])
119 | 
120 |         assert np.shape(X)[1] == len(y), (
121 |             'BlendingOptimizer: Length of predictions and labels does not match: '
122 |             'preds_len={}, y_len={}'.format(np.shape(X)[1], len(y)))
123 | 
124 |         if warm_start:
125 |             assert self._weights is not None, 'Optimizer has to be fitted before `warm_start` can be used.'
126 |             weights = self._weights
127 |         elif init_weights is None:
128 |             weights = np.array([1.0] * len(X))
129 |         else:
130 |             assert (len(init_weights) == np.shape(X)[0]), (
131 |                 'BlendingOptimizer: Number of models to blend its predictions and weights does not match: '
132 |                 'n_models={}, weights_len={}'.format(np.shape(X)[0], len(init_weights)))
133 |             weights = init_weights
134 | 
135 |         def __is_better_score(score_to_test, score):
136 |             return score_to_test > score if self.maximize else not score_to_test > score
137 | 
138 |         score = 0
139 |         best_score = self.maximize - 0.5
140 | 
141 |         while __is_better_score(best_score, score):
142 |             best_score = self.metric(y, np.average(np.power(X, self._power), weights=weights, axis=0) ** (
143 |                     1.0 / self._power))
144 |             score = best_score
145 |             best_index, best_step = -1, 0.0
146 |             for j in range(len(X)):
147 |                 delta = np.array([(0 if k != j else step_size) for k in range(len(X))])
148 |                 s = self.metric(y, np.average(np.power(X, self._power), weights=weights + delta, axis=0) ** (
149 |                         1.0 / self._power))
150 |                 if __is_better_score(s, best_score):
151 |                     best_index, best_score, best_step = j, s, step_size
152 |                     continue
153 |                 if weights[j] - step_size >= 0:
154 |                     s = self.metric(y, np.average(np.power(X, self._power), weights=weights - delta, axis=0) ** (
155 |                             1.0 / self._power))
156 |                     if s > best_score:
157 |                         best_index, best_score, best_step = j, s, -step_size
158 |             if __is_better_score(best_score, score):
159 |                 weights[best_index] += best_step
160 | 
161 |         self._weights = weights
162 |         self._score = best_score
163 | 
164 |         return self
165 | 
166 |     def transform(self, X):
167 |         """Performs predictions blending using the trained weights.
168 | 
169 |         Args:
170 |             X (array-like): Predictions of different models.
171 |         Returns: dict with blended predictions (key is 'y_pred').
172 |         """
173 |         assert np.shape(X)[0] == len(self._weights), (
174 |             'BlendingOptimizer: Number of models to blend its predictions and weights does not match: '
175 |             'n_models={}, weights_len={}'.format(np.shape(X)[0], len(self._weights)))
176 |         blended_predictions = np.average(np.power(X, self._power),
177 |                                          weights=self._weights,
178 |                                          axis=0) ** (1.0 / self._power)
179 | 
180 |         return {'y_pred': blended_predictions}
181 | 
182 |     def fit_transform(self, X, y, step_size=0.1, init_weights=None, warm_start=False):
183 |         """Fit optimizer to X, then transforms X. See `fit` and `transform` for further explanation."""
184 |         self.fit(X=X, y=y, step_size=step_size, init_weights=init_weights, warm_start=warm_start)
185 | 
186 |         return self.transform(X=X)
187 | 
188 |     def load(self, filepath):
189 |         params = joblib.load(filepath)
190 |         self.metric = params['metric']
191 |         self.maximize = params['maximize']
192 |         self._power = params['power']
193 |         self._score = params['score']
194 |         self._weights = params['weights']
195 |         return self
196 | 
197 |     def persist(self, filepath):
198 |         joblib.dump({'metric': self.metric,
199 |                      'maximize': self.maximize,
200 |                      'power': self._power,
201 |                      'score': self._score,
202 |                      'weights': self._weights},
203 |                     filepath)
204 | 


--------------------------------------------------------------------------------
/toolkit/preprocessing/misc.py:
--------------------------------------------------------------------------------
  1 | import category_encoders as ce
  2 | import numpy as np
  3 | import pandas as pd
  4 | import sklearn.decomposition as decomposition
  5 | from fancyimpute import SimpleFill
  6 | from sklearn.externals import joblib
  7 | from sklearn.feature_extraction import text
  8 | from sklearn.preprocessing import Normalizer, MinMaxScaler
  9 | from steppy.base import BaseTransformer
 10 | 
 11 | 
 12 | class XYSplit(BaseTransformer):
 13 |     def __init__(self, x_columns, y_columns):
 14 |         super().__init__()
 15 |         self.x_columns = x_columns
 16 |         self.y_columns = y_columns
 17 |         self.columns_to_get = None
 18 |         self.target_columns = None
 19 | 
 20 |     def transform(self, meta, train_mode):
 21 |         X = meta[self.x_columns].values
 22 |         if train_mode:
 23 |             y = meta[self.y_columns].values
 24 |         else:
 25 |             y = None
 26 | 
 27 |         return {'X': X,
 28 |                 'y': y}
 29 | 
 30 |     def load(self, filepath):
 31 |         params = joblib.load(filepath)
 32 |         self.columns_to_get = params['x_columns']
 33 |         self.target_columns = params['y_columns']
 34 |         return self
 35 | 
 36 |     def persist(self, filepath):
 37 |         params = {'x_columns': self.x_columns,
 38 |                   'y_columns': self.y_columns
 39 |                   }
 40 |         joblib.dump(params, filepath)
 41 | 
 42 | 
 43 | class TfIdfVectorizer(BaseTransformer):
 44 |     def __init__(self, **kwargs):
 45 |         super().__init__()
 46 |         self.vectorizer = text.TfidfVectorizer(**kwargs)
 47 | 
 48 |     def fit(self, text):
 49 |         self.vectorizer.fit(text)
 50 |         return self
 51 | 
 52 |     def transform(self, text):
 53 |         return {'features': self.vectorizer.transform(text)}
 54 | 
 55 |     def load(self, filepath):
 56 |         self.vectorizer = joblib.load(filepath)
 57 |         return self
 58 | 
 59 |     def persist(self, filepath):
 60 |         joblib.dump(self.vectorizer, filepath)
 61 | 
 62 | 
 63 | class TruncatedSVD(BaseTransformer):
 64 |     def __init__(self, **kwargs):
 65 |         super().__init__()
 66 |         self.truncated_svd = decomposition.TruncatedSVD(**kwargs)
 67 | 
 68 |     def fit(self, features):
 69 |         self.truncated_svd.fit(features)
 70 |         return self
 71 | 
 72 |     def transform(self, features):
 73 |         return {'features': self.truncated_svd.transform(features)}
 74 | 
 75 |     def load(self, filepath):
 76 |         self.truncated_svd = joblib.load(filepath)
 77 |         return self
 78 | 
 79 |     def persist(self, filepath):
 80 |         joblib.dump(self.truncated_svd, filepath)
 81 | 
 82 | 
 83 | class Steppy_Normalizer(BaseTransformer):
 84 |     def __init__(self):
 85 |         super().__init__()
 86 |         self.normalizer = Normalizer()
 87 | 
 88 |     def fit(self, X):
 89 |         self.normalizer.fit(X)
 90 |         return self
 91 | 
 92 |     def transform(self, X):
 93 |         X = self.normalizer.transform(X)
 94 |         return {'X': X}
 95 | 
 96 |     def load(self, filepath):
 97 |         self.normalizer = joblib.load(filepath)
 98 |         return self
 99 | 
100 |     def persist(self, filepath):
101 |         joblib.dump(self.normalizer, filepath)
102 | 
103 | 
class Steppy_MinMaxScaler(BaseTransformer):
    """Steppy transformer that wraps a default-constructed ``MinMaxScaler``."""

    def __init__(self):
        super().__init__()
        self.minmax_scaler = MinMaxScaler()

    def fit(self, X):
        """Fit the wrapped scaler on ``X`` and return ``self``."""
        scaler = self.minmax_scaler
        scaler.fit(X)
        return self

    def transform(self, X):
        """Scale ``X`` with the wrapped scaler.

        Returns:
            Dictionary with one key - 'X' - holding the scaler output.
        """
        return dict(X=self.minmax_scaler.transform(X))

    def load(self, filepath):
        """Replace the wrapped scaler with the object stored at ``filepath``."""
        restored = joblib.load(filepath)
        self.minmax_scaler = restored
        return self

    def persist(self, filepath):
        """Dump the wrapped scaler to ``filepath`` with joblib."""
        joblib.dump(self.minmax_scaler, filepath)
123 | 
124 | 
class MinMaxScalerMultilabel(BaseTransformer):
    """Fit one ``Steppy_MinMaxScaler`` per slice ``X[:, i, :]`` of a 3-D input.

    The indexing ``X[:, i, :]`` fixes the input rank at three; the scalers are
    created along axis 1.
    """

    def __init__(self):
        super().__init__()
        self.minmax_scalers = []

    def fit(self, X):
        """Fit a fresh scaler for every index along axis 1 of ``X``.

        Args:
            X: array indexable as ``X[:, i, :]`` with a ``shape`` attribute.

        Returns:
            self
        """
        # Build a new list instead of appending to the existing one, so that
        # calling fit() again replaces the scalers rather than accumulating them.
        scalers = []
        for i in range(X.shape[1]):
            minmax_scaler = Steppy_MinMaxScaler()
            minmax_scaler.fit(X[:, i, :])
            scalers.append(minmax_scaler)
        self.minmax_scalers = scalers
        return self

    def transform(self, X):
        """Scale every slice ``X[:, i, :]`` with its fitted scaler.

        Args:
            X: array with the same layout as the one passed to ``fit``.

        Returns:
            Dictionary with one key - 'X' - holding a scaled copy of the input.
        """
        # Work on a copy so the caller's array is left untouched.
        # NOTE(review): the copy keeps the input dtype; presumably X is floating
        # point - an integer X would truncate the scaled values.
        X = X.copy()
        for i, minmax_scaler in enumerate(self.minmax_scalers):
            # Steppy_MinMaxScaler.transform returns {'X': array}; unwrap it
            # before writing the values back into the slice.
            X[:, i, :] = minmax_scaler.transform(X[:, i, :])['X']
        return {'X': X}

    def load(self, filepath):
        """Replace the scaler list with the object stored at ``filepath``."""
        self.minmax_scalers = joblib.load(filepath)
        return self

    def persist(self, filepath):
        """Dump the list of fitted scalers to ``filepath`` with joblib."""
        joblib.dump(self.minmax_scalers, filepath)
148 | 
149 | 
class FillNan(BaseTransformer):
    def __init__(self, fill_method='zero', fill_missing=True, **kwargs):
        """Impute NaN's using a simple filling method.

        Args:
            fill_method: How NaN's will be exchanged. Possible values: 'mean', 'zero', 'median', 'min', 'random'
            fill_missing: If True, transformer will fill NaN values by filling method
            **kwargs: accepted but not used by this constructor.
        """
        super().__init__()
        self.fill_missing = fill_missing
        self.filler = SimpleFill(fill_method)

    def transform(self, X):
        """Fill the NaN's of ``X`` when filling is enabled.

        Args:
            X: DataFrame with NaN's
        Returns:
            Dictionary with one key - 'X' - holding the output of the filler, or the
            unchanged input when ``fill_missing`` is falsy.
        """
        if not self.fill_missing:
            return {'X': X}
        completed = self.filler.complete(X)
        return {'X': completed}

    def load(self, filepath):
        """Replace the filler with the object stored at ``filepath``."""
        restored = joblib.load(filepath)
        self.filler = restored
        return self

    def persist(self, filepath):
        """Dump the filler to ``filepath`` with joblib."""
        joblib.dump(self.filler, filepath)
181 | 
182 | 
class CategoricalEncoder(BaseTransformer):
    def __init__(self):
        """Encode features with ``ce.OrdinalEncoder``; the encoder is created in ``fit``."""
        super().__init__()
        self.encoder_class = ce.OrdinalEncoder
        self.categorical_encoder = None

    def fit(self, X):
        """Create an encoder covering every column of ``X`` and fit it.

        Args:
            X: DataFrame of categorical features to encode
        """
        column_names = list(X)
        encoder = self.encoder_class(cols=column_names)
        self.categorical_encoder = encoder
        encoder.fit(X)
        return self

    def transform(self, X):
        """Encode ``X`` with the fitted encoder.

        Args:
            X: DataFrame of categorical features to encode
        Returns:
            Dictionary with one key - 'categorical_features' - holding the encoded features from X
        """
        encoded = self.categorical_encoder.transform(X)
        return dict(categorical_features=encoded)

    def load(self, filepath):
        """Replace the encoder with the object stored at ``filepath``."""
        restored = joblib.load(filepath)
        self.categorical_encoder = restored
        return self

    def persist(self, filepath):
        """Dump the fitted encoder to ``filepath`` with joblib."""
        joblib.dump(self.categorical_encoder, filepath)
215 | 
216 | 
class GroupbyAggregate(BaseTransformer):
    def __init__(self, id_column, groupby_aggregations):
        """Store the configuration for group-by aggregation features.

        Args:
            id_column: name of the id column; ``fit`` seeds its feature table with
                the unique values of ``X[id_column]``.
            groupby_aggregations: list of ``(groupby_cols, specs)`` tuples, where
                ``groupby_cols`` is a list of column names to group by and ``specs``
                is a list of ``(select, agg)`` pairs - the column to aggregate and
                the aggregation handed to ``.agg``.

        Example of a ``groupby_aggregations`` value:
            groupby_aggregations = [(['f0'], [('f2', 'min'),
                                              ('f2', 'median')]),
                                    (['f0', 'f1'], [('f2', 'mean'),
                                                    ('f2', 'max'),
                                                    ('f2', 'kurt')])]
        """
        super().__init__()
        self.id_column = id_column
        self.groupby_aggregations = groupby_aggregations

    def fit(self, X):
        """Compute every configured aggregation on ``X`` and keep the merged table.

        The result is stored in ``self.features``.

        Args:
            X: DataFrame containing ``id_column`` and all referenced columns.

        Returns:
            self
        """
        unique_ids = X[self.id_column].unique()
        features = pd.DataFrame({self.id_column: unique_ids})
        # NOTE(review): ``features`` starts with only ``id_column``, so the merge
        # below appears to require every ``groupby_cols`` to consist of columns
        # already present in ``features`` - confirm against real configurations.
        for groupby_cols, specs in self.groupby_aggregations:
            group_object = X.groupby(groupby_cols)
            for select, agg in specs:
                new_name = self._create_colname_from_specs(groupby_cols, select, agg)
                aggregated = group_object[select].agg(agg).reset_index()
                aggregated = aggregated.rename(index=str, columns={select: new_name})
                aggregated = aggregated[groupby_cols + [new_name]]
                features = features.merge(aggregated, on=groupby_cols, how='left')
        self.features = features
        return self

    def transform(self, X):
        """Return the table computed in ``fit``; ``X`` itself is not used.

        Returns:
            Dictionary with one key - 'numerical_features'.
        """
        return dict(numerical_features=self.features)

    def load(self, filepath):
        """Replace ``self.features`` with the object stored at ``filepath``."""
        restored = joblib.load(filepath)
        self.features = restored
        return self

    def persist(self, filepath):
        """Dump ``self.features`` to ``filepath`` with joblib."""
        joblib.dump(self.features, filepath)

    def _create_colname_from_specs(self, groupby_cols, select, agg):
        """Build the column name '<groupby cols joined by _>_<agg>_<select>'."""
        group_part = '_'.join(groupby_cols)
        return '{}_{}_{}'.format(group_part, agg, select)
274 | 
275 | 
class FeatureJoiner(BaseTransformer):
    """Concatenate all features to one DataFrame of given id_column

    Args:
        id_column: Column with id's which will be preprocessed
    """

    def __init__(self, id_column):
        super().__init__()
        self.id_column = id_column

    def transform(self, numerical_feature_list, categorical_feature_list):
        """Index every feature frame by ``id_column`` and concatenate them column-wise.

        The inputs are not modified: each frame is re-indexed on a copy, so the
        same feature lists can be passed to ``transform`` repeatedly.

        Args:
            numerical_feature_list: list of numerical features
            categorical_feature_list: list of categorical features

        Returns:
            Dictionary with following keys:
                features: DataFrame with concatenated features (cast to float32,
                    index reset to a column)
                feature_names: list of features names
                categorical_features: list of categorical feature names
        """
        numerical = [self._index_by_id(feature) for feature in numerical_feature_list]
        categorical = [self._index_by_id(feature) for feature in categorical_feature_list]
        features = pd.concat(numerical + categorical, axis=1).astype(np.float32).reset_index()

        outputs = dict()
        outputs['features'] = features
        outputs['feature_names'] = list(features.columns)
        # Names come from the indexed copies, so ``id_column`` (now the index)
        # is not reported as a categorical feature.
        outputs['categorical_features'] = self._get_feature_names(categorical)
        return outputs

    def _index_by_id(self, feature):
        """Return ``feature`` as a new DataFrame indexed by ``id_column``."""
        # set_index without inplace=True returns a new frame and leaves the
        # caller's object untouched.
        return self._format_target(feature).set_index(self.id_column, drop=True)

    def _get_feature_names(self, dataframes):
        """Collect column names of DataFrames, or ``.name`` for objects without columns."""
        feature_names = []
        for dataframe in dataframes:
            try:
                feature_names.extend(list(dataframe.columns))
            except AttributeError:
                # No ``columns`` attribute (e.g. a Series): fall back to its name.
                feature_names.append(dataframe.name)

        return feature_names

    def _format_target(self, target):
        """Wrap a Series in a DataFrame; return anything else unchanged."""
        if isinstance(target, pd.Series):
            return pd.DataFrame(target)
        return target


--------------------------------------------------------------------------------
/toolkit/keras_transformers/models.py:
--------------------------------------------------------------------------------
  1 | import shutil
  2 | 
  3 | from keras.models import load_model
  4 | from steppy.base import BaseTransformer
  5 | 
  6 | from toolkit.keras_transformers.architectures import vdcnn, scnn, dpcnn, cudnn_gru, cudnn_lstm
  7 | from toolkit.keras_transformers.contrib import AttentionWeightedAverage
  8 | 
  9 | 
 10 | class KerasModelTransformer(BaseTransformer):
 11 |     def __init__(self, architecture_config, training_config, callbacks_config):
 12 |         super().__init__()
 13 |         self.architecture_config = architecture_config
 14 |         self.training_config = training_config
 15 |         self.callbacks_config = callbacks_config
 16 | 
 17 |     def reset(self):
 18 |         self.model = self._build_model(**self.architecture_config)
 19 | 
 20 |     def _compile_model(self, model_params, optimizer_params):
 21 |         model = self._build_model(**model_params)
 22 |         optimizer = self._build_optimizer(**optimizer_params)
 23 |         loss = self._build_loss()
 24 |         model.compile(optimizer=optimizer, loss=loss)
 25 |         return model
 26 | 
 27 |     def _create_callbacks(self, **kwargs):
 28 |         raise NotImplementedError
 29 | 
 30 |     def _build_model(self, **kwargs):
 31 |         raise NotImplementedError
 32 | 
 33 |     def _build_optimizer(self, **kwargs):
 34 |         raise NotImplementedError
 35 | 
 36 |     def _build_loss(self, **kwargs):
 37 |         raise NotImplementedError
 38 | 
 39 |     def persist(self, filepath):
 40 |         checkpoint_callback = self.callbacks_config.get('model_checkpoint')
 41 |         if checkpoint_callback:
 42 |             checkpoint_filepath = checkpoint_callback['filepath']
 43 |             shutil.copyfile(checkpoint_filepath, filepath)
 44 |         else:
 45 |             self.model.save(filepath)
 46 | 
 47 |     def load(self, filepath):
 48 |         self.model = load_model(filepath,
 49 |                                 custom_objects={'AttentionWeightedAverage': AttentionWeightedAverage})
 50 |         return self
 51 | 
 52 | 
 53 | class ClassifierXY(KerasModelTransformer):
 54 |     def fit(self, X, y, validation_data, *args, **kwargs):
 55 |         self.callbacks = self._create_callbacks(**self.callbacks_config)
 56 |         self.model = self._compile_model(**self.architecture_config)
 57 | 
 58 |         self.model.fit(X, y,
 59 |                        validation_data=validation_data,
 60 |                        callbacks=self.callbacks,
 61 |                        verbose=1,
 62 |                        **self.training_config)
 63 |         return self
 64 | 
 65 |     def transform(self, X, y=None, validation_data=None, *args, **kwargs):
 66 |         predictions = self.model.predict(X, verbose=1)
 67 |         return {'prediction_probability': predictions}
 68 | 
 69 | 
 70 | class ClassifierGenerator(KerasModelTransformer):
 71 |     def fit(self, datagen, X, y, datagen_valid=None, X_valid=None, y_valid=None, *args, **kwargs):
 72 |         self.callbacks = self._create_callbacks(**self.callbacks_config)
 73 |         self.model = self._compile_model(**self.architecture_config)
 74 | 
 75 |         fit_args = self.training_config['fit_args']
 76 |         flow_args = self.training_config['flow_args']
 77 |         batch_size = flow_args['batch_size']
 78 |         if X_valid is None:
 79 |             self.model.fit_generator(
 80 |                 datagen.flow(X, y, **flow_args),
 81 |                 steps_per_epoch=len(X) // batch_size,
 82 |                 callbacks=self.callbacks,
 83 |                 **fit_args)
 84 |             return self
 85 |         else:
 86 |             if datagen_valid is None:
 87 |                 datagen_valid = datagen
 88 |             self.model.fit_generator(
 89 |                 datagen.flow(X, y, **flow_args),
 90 |                 steps_per_epoch=len(X) // batch_size,
 91 |                 validation_data=datagen_valid.flow(X_valid, y_valid, **flow_args),
 92 |                 validation_steps=len(X_valid) // batch_size,
 93 |                 callbacks=self.callbacks,
 94 |                 **fit_args)
 95 |             return self
 96 | 
 97 |     def transform(self, datagen, X, datagen_valid=None, X_valid=None, *args, **kwargs):
 98 |         flow_args = self.training_config['flow_args']
 99 |         y_proba_train = self.model.predict_generator(
100 |             datagen.flow(X, shuffle=False, **flow_args))
101 |         result = dict(output=y_proba_train)
102 |         if X_valid is not None:
103 |             if datagen_valid is None:
104 |                 datagen_valid = datagen
105 |             y_proba_valid = self.model.predict_generator(
106 |                 datagen_valid.flow(X_valid, shuffle=False, **flow_args))
107 |             result.update(dict(output_valid=y_proba_valid))
108 |         return result
109 | 
110 | 
class PretrainedEmbeddingModel(ClassifierXY):
    """``ClassifierXY`` variant whose model is built with a supplied embedding matrix."""

    def fit(self, X, y, validation_data, embedding_matrix):
        """Inject ``embedding_matrix`` into the model parameters and train.

        ``validation_data`` is unpacked as ``(X_valid, y_valid)``. The matrix is
        written into ``architecture_config['model_params']`` before the model is
        compiled. Returns ``self``.
        """
        X_valid, y_valid = validation_data
        self.callbacks = self._create_callbacks(**self.callbacks_config)

        model_params = self.architecture_config['model_params']
        model_params['embedding_matrix'] = embedding_matrix
        self.model = self._compile_model(**self.architecture_config)

        fit_options = dict(validation_data=[X_valid, y_valid],
                           callbacks=self.callbacks,
                           verbose=1)
        self.model.fit(X, y, **fit_options, **self.training_config)
        return self

    def transform(self, X, y=None, validation_data=None, embedding_matrix=None):
        """Predict on ``X``; all other arguments are ignored.

        Returns:
            Dictionary with one key - 'prediction_probability'.
        """
        return dict(prediction_probability=self.model.predict(X, verbose=1))
127 | 
128 | 
class CharVDCNNTransformer(ClassifierXY):
    """``ClassifierXY`` whose model is produced by the ``vdcnn`` architecture builder."""

    def _build_model(self, embedding_size, maxlen, max_features,
                     filter_nr, kernel_size, repeat_block,
                     dense_size, repeat_dense, output_size, output_activation,
                     max_pooling, mean_pooling, weighted_average_attention, concat_mode,
                     dropout_embedding, conv_dropout, dense_dropout, dropout_mode,
                     conv_kernel_reg_l2, conv_bias_reg_l2,
                     dense_kernel_reg_l2, dense_bias_reg_l2,
                     use_prelu, use_batch_norm, batch_norm_first):
        """Return the model built by ``vdcnn``.

        Every argument is forwarded positionally and in the same order, so this
        signature must stay aligned with the one of ``vdcnn``.
        """
        return vdcnn(embedding_size, maxlen, max_features,
                     filter_nr, kernel_size, repeat_block,
                     dense_size, repeat_dense, output_size, output_activation,
                     max_pooling, mean_pooling, weighted_average_attention, concat_mode,
                     dropout_embedding, conv_dropout, dense_dropout, dropout_mode,
                     conv_kernel_reg_l2, conv_bias_reg_l2,
                     dense_kernel_reg_l2, dense_bias_reg_l2,
                     use_prelu, use_batch_norm, batch_norm_first)
146 | 
147 | 
class WordSCNNTransformer(PretrainedEmbeddingModel):
    """``PretrainedEmbeddingModel`` whose model is produced by the ``scnn`` architecture builder."""

    def _build_model(self, embedding_matrix, embedding_size, trainable_embedding, maxlen, max_features,
                     filter_nr, kernel_size, repeat_block,
                     dense_size, repeat_dense, output_size, output_activation,
                     max_pooling, mean_pooling, weighted_average_attention, concat_mode,
                     dropout_embedding, conv_dropout, dense_dropout, dropout_mode,
                     conv_kernel_reg_l2, conv_bias_reg_l2,
                     dense_kernel_reg_l2, dense_bias_reg_l2,
                     use_prelu, use_batch_norm, batch_norm_first):
        """Return the model built by ``scnn``.

        Every argument is forwarded positionally and in the same order, so this
        signature must stay aligned with the one of ``scnn``.
        """
        return scnn(embedding_matrix, embedding_size, trainable_embedding, maxlen, max_features,
                    filter_nr, kernel_size, repeat_block,
                    dense_size, repeat_dense, output_size, output_activation,
                    max_pooling, mean_pooling, weighted_average_attention, concat_mode,
                    dropout_embedding, conv_dropout, dense_dropout, dropout_mode,
                    conv_kernel_reg_l2, conv_bias_reg_l2,
                    dense_kernel_reg_l2, dense_bias_reg_l2,
                    use_prelu, use_batch_norm, batch_norm_first)
165 | 
166 | 
class WordDPCNNTransformer(PretrainedEmbeddingModel):
    """``PretrainedEmbeddingModel`` whose model is produced by the ``dpcnn`` architecture builder."""

    def _build_model(self, embedding_matrix, embedding_size, trainable_embedding, maxlen, max_features,
                     filter_nr, kernel_size, repeat_block,
                     dense_size, repeat_dense, output_size, output_activation,
                     max_pooling, mean_pooling, weighted_average_attention, concat_mode,
                     dropout_embedding, conv_dropout, dense_dropout, dropout_mode,
                     conv_kernel_reg_l2, conv_bias_reg_l2,
                     dense_kernel_reg_l2, dense_bias_reg_l2,
                     use_prelu, use_batch_norm, batch_norm_first):
        """Return the model built by ``dpcnn``.

        Implementation of http://ai.tencent.com/ailab/media/publications/ACL3-Brady.pdf

        Every argument is forwarded positionally and in the same order, so this
        signature must stay aligned with the one of ``dpcnn``.
        """
        return dpcnn(embedding_matrix, embedding_size, trainable_embedding, maxlen, max_features,
                     filter_nr, kernel_size, repeat_block,
                     dense_size, repeat_dense, output_size, output_activation,
                     max_pooling, mean_pooling, weighted_average_attention, concat_mode,
                     dropout_embedding, conv_dropout, dense_dropout, dropout_mode,
                     conv_kernel_reg_l2, conv_bias_reg_l2,
                     dense_kernel_reg_l2, dense_bias_reg_l2,
                     use_prelu, use_batch_norm, batch_norm_first)
187 | 
188 | 
class WordCuDNNLSTMTransformer(PretrainedEmbeddingModel):
    """``PretrainedEmbeddingModel`` whose model is produced by the ``cudnn_lstm`` architecture builder."""

    def _build_model(self, embedding_matrix, embedding_size, trainable_embedding,
                     maxlen, max_features,
                     unit_nr, repeat_block,
                     dense_size, repeat_dense, output_size, output_activation,
                     max_pooling, mean_pooling, weighted_average_attention, concat_mode,
                     dropout_embedding, rnn_dropout, dense_dropout, dropout_mode,
                     rnn_kernel_reg_l2, rnn_recurrent_reg_l2, rnn_bias_reg_l2,
                     dense_kernel_reg_l2, dense_bias_reg_l2,
                     use_prelu, use_batch_norm, batch_norm_first):
        """Return the model built by ``cudnn_lstm``.

        Every argument is forwarded positionally and in the same order, so this
        signature must stay aligned with the one of ``cudnn_lstm``.
        """
        return cudnn_lstm(embedding_matrix, embedding_size, trainable_embedding,
                          maxlen, max_features,
                          unit_nr, repeat_block,
                          dense_size, repeat_dense, output_size, output_activation,
                          max_pooling, mean_pooling, weighted_average_attention, concat_mode,
                          dropout_embedding, rnn_dropout, dense_dropout, dropout_mode,
                          rnn_kernel_reg_l2, rnn_recurrent_reg_l2, rnn_bias_reg_l2,
                          dense_kernel_reg_l2, dense_bias_reg_l2,
                          use_prelu, use_batch_norm, batch_norm_first)
208 | 
209 | 
class WordCuDNNGRUTransformer(PretrainedEmbeddingModel):
    """``PretrainedEmbeddingModel`` whose model is produced by the ``cudnn_gru`` architecture builder."""

    def _build_model(self, embedding_matrix, embedding_size, trainable_embedding,
                     maxlen, max_features,
                     unit_nr, repeat_block,
                     dense_size, repeat_dense, output_size, output_activation,
                     max_pooling, mean_pooling, weighted_average_attention, concat_mode,
                     dropout_embedding, rnn_dropout, dense_dropout, dropout_mode,
                     rnn_kernel_reg_l2, rnn_recurrent_reg_l2, rnn_bias_reg_l2,
                     dense_kernel_reg_l2, dense_bias_reg_l2,
                     use_prelu, use_batch_norm, batch_norm_first):
        """Return the model built by ``cudnn_gru``.

        Every argument is forwarded positionally and in the same order, so this
        signature must stay aligned with the one of ``cudnn_gru``.
        """
        return cudnn_gru(embedding_matrix, embedding_size, trainable_embedding,
                         maxlen, max_features,
                         unit_nr, repeat_block,
                         dense_size, repeat_dense, output_size, output_activation,
                         max_pooling, mean_pooling, weighted_average_attention, concat_mode,
                         dropout_embedding, rnn_dropout, dense_dropout, dropout_mode,
                         rnn_kernel_reg_l2, rnn_recurrent_reg_l2, rnn_bias_reg_l2,
                         dense_kernel_reg_l2, dense_bias_reg_l2,
                         use_prelu, use_batch_norm, batch_norm_first)
229 | 


--------------------------------------------------------------------------------
/toolkit/pytorch_transformers/callbacks.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from datetime import datetime, timedelta
  3 | 
  4 | from deepsense import neptune
  5 | from steppy.utils import get_logger
  6 | from torch.optim.lr_scheduler import ExponentialLR
  7 | 
  8 | from toolkit.pytorch_transformers.utils import Averager, persist_torch_model
  9 | from toolkit.pytorch_transformers.validation import score_model
 10 | 
 11 | logger = get_logger()
 12 | 
 13 | 
 14 | class Callback:
 15 |     def __init__(self):
 16 |         self.epoch_id = None
 17 |         self.batch_id = None
 18 | 
 19 |         self.model = None
 20 |         self.optimizer = None
 21 |         self.loss_function = None
 22 |         self.output_names = None
 23 |         self.validation_datagen = None
 24 |         self.lr_scheduler = None
 25 | 
 26 |     def set_params(self, transformer, validation_datagen):
 27 |         self.model = transformer.model
 28 |         self.optimizer = transformer.optimizer
 29 |         self.loss_function = transformer.loss_function
 30 |         self.output_names = transformer.output_names
 31 |         self.validation_datagen = validation_datagen
 32 |         self.validation_loss = transformer.validation_loss
 33 | 
 34 |     def on_train_begin(self, *args, **kwargs):
 35 |         self.epoch_id = 0
 36 |         self.batch_id = 0
 37 | 
 38 |     def on_train_end(self, *args, **kwargs):
 39 |         pass
 40 | 
 41 |     def on_epoch_begin(self, *args, **kwargs):
 42 |         pass
 43 | 
 44 |     def on_epoch_end(self, *args, **kwargs):
 45 |         self.epoch_id += 1
 46 | 
 47 |     def on_batch_begin(self, *args, **kwargs):
 48 |         pass
 49 | 
 50 |     def on_batch_end(self, *args, **kwargs):
 51 |         self.batch_id += 1
 52 | 
 53 |     def training_break(self, *args, **kwargs):
 54 |         return False
 55 | 
 56 |     def get_validation_loss(self):
 57 |         if self.epoch_id not in self.validation_loss.keys():
 58 |             self.validation_loss[self.epoch_id] = score_model(self.model,
 59 |                                                               self.loss_function,
 60 |                                                               self.validation_datagen)
 61 |         return self.validation_loss[self.epoch_id]
 62 | 
 63 | 
 64 | class CallbackList:
 65 |     def __init__(self, callbacks=None):
 66 |         if callbacks is None:
 67 |             self.callbacks = []
 68 |         elif isinstance(callbacks, Callback):
 69 |             self.callbacks = [callbacks]
 70 |         else:
 71 |             self.callbacks = callbacks
 72 | 
 73 |     def __len__(self):
 74 |         return len(self.callbacks)
 75 | 
 76 |     def set_params(self, *args, **kwargs):
 77 |         for callback in self.callbacks:
 78 |             callback.set_params(*args, **kwargs)
 79 | 
 80 |     def on_train_begin(self, *args, **kwargs):
 81 |         for callback in self.callbacks:
 82 |             callback.on_train_begin(*args, **kwargs)
 83 | 
 84 |     def on_train_end(self, *args, **kwargs):
 85 |         for callback in self.callbacks:
 86 |             callback.on_train_end(*args, **kwargs)
 87 | 
 88 |     def on_epoch_begin(self, *args, **kwargs):
 89 |         for callback in self.callbacks:
 90 |             callback.on_epoch_begin(*args, **kwargs)
 91 | 
 92 |     def on_epoch_end(self, *args, **kwargs):
 93 |         for callback in self.callbacks:
 94 |             callback.on_epoch_end(*args, **kwargs)
 95 | 
 96 |     def on_batch_begin(self, *args, **kwargs):
 97 |         for callback in self.callbacks:
 98 |             callback.on_batch_begin(*args, **kwargs)
 99 | 
100 |     def on_batch_end(self, *args, **kwargs):
101 |         for callback in self.callbacks:
102 |             callback.on_batch_end(*args, **kwargs)
103 | 
104 |     def training_break(self, *args, **kwargs):
105 |         callback_out = [callback.training_break(*args, **kwargs) for callback in self.callbacks]
106 |         return any(callback_out)
107 | 
108 | 
class TrainingMonitor(Callback):
    """Log training losses: per batch and as per-epoch averages.

    Args:
        epoch_every: log the epoch averages when ``epoch_id`` is a multiple of
            this value; 0 or None disables epoch logging.
        batch_every: log batch losses when ``batch_id`` is a multiple of this
            value; 0 or None disables batch logging.
    """

    def __init__(self, epoch_every=None, batch_every=None):
        super().__init__()
        self.epoch_loss_averagers = {}
        self.epoch_every = False if epoch_every == 0 else epoch_every
        self.batch_every = False if batch_every == 0 else batch_every

    def on_train_begin(self, *args, **kwargs):
        """Drop all averagers and reset the epoch and batch counters."""
        self.epoch_loss_averagers = {}
        self.epoch_id = 0
        self.batch_id = 0

    def on_epoch_end(self, *args, **kwargs):
        """Read and reset every averager, logging its value on due epochs."""
        for name in self.epoch_loss_averagers:
            averager = self.epoch_loss_averagers[name]
            epoch_avg_loss = averager.value
            averager.reset()
            if not self.epoch_every:
                continue
            if (self.epoch_id % self.epoch_every) == 0:
                logger.info('epoch {0} {1}:     {2:.5f}'.format(self.epoch_id, name, epoch_avg_loss))
        self.epoch_id += 1

    def on_batch_end(self, metrics, *args, **kwargs):
        """Feed each loss in ``metrics`` to its averager and log it on due batches.

        Args:
            metrics: mapping of loss name to a loss object whose value is read
                via ``.data.cpu().numpy()[0]``.
        """
        for name, loss in metrics.items():
            loss = loss.data.cpu().numpy()[0]
            if name not in self.epoch_loss_averagers:
                self.epoch_loss_averagers[name] = Averager()
            self.epoch_loss_averagers[name].send(loss)

            if self.batch_every and ((self.batch_id % self.batch_every) == 0):
                logger.info('epoch {0} batch {1} {2}:     {3:.5f}'.format(self.epoch_id, self.batch_id, name, loss))
        self.batch_id += 1
147 | 
148 | 
class ValidationMonitor(Callback):
    """Log the validation losses at the end of selected epochs.

    Args:
        epoch_every: evaluate and log when ``epoch_id`` is a multiple of this
            value; 0 or None disables the monitor.
        batch_every: stored (0 becomes False) but not read by this class.
    """

    def __init__(self, epoch_every=None, batch_every=None):
        super().__init__()
        self.epoch_every = False if epoch_every == 0 else epoch_every
        self.batch_every = False if batch_every == 0 else batch_every

    def on_epoch_end(self, *args, **kwargs):
        """Compute and log every validation loss on due epochs, then advance the epoch counter."""
        is_due = self.epoch_every and ((self.epoch_id % self.epoch_every) == 0)
        if is_due:
            # Score in eval mode, then switch the model back to training mode.
            self.model.eval()
            val_loss = self.get_validation_loss()
            self.model.train()
            for name in val_loss:
                loss = val_loss[name].data.cpu().numpy()[0]
                logger.info('epoch {0} validation {1}:     {2:.5f}'.format(self.epoch_id, name, loss))
        self.epoch_id += 1
170 | 
171 | 
class EarlyStopping(Callback):
    """Request a training stop when the summed validation loss stops improving.

    Args:
        patience: number of consecutive non-improving epochs tolerated; training
            is stopped once this number is exceeded.
        minimize: if True a lower score is better, otherwise a higher one.
    """

    def __init__(self, patience, minimize=True):
        super().__init__()
        self.patience = patience
        self.minimize = minimize
        self.best_score = None
        self.epoch_since_best = 0
        self._training_break = False

    def on_epoch_end(self, *args, **kwargs):
        """Score the model, update the best score and the patience counter."""
        # Score in eval mode, then switch the model back to training mode.
        self.model.eval()
        val_loss = self.get_validation_loss()
        loss_sum = val_loss['sum']
        loss_sum = loss_sum.data.cpu().numpy()[0]

        self.model.train()

        if self.best_score is None:
            # First evaluation: it only establishes the baseline and is not
            # counted as a non-improving epoch. ``is None`` (rather than a
            # truthiness test) keeps a legitimate best score of 0.0 intact.
            self.best_score = loss_sum
            self.epoch_since_best = 0
        elif (self.minimize and loss_sum < self.best_score) or (not self.minimize and loss_sum > self.best_score):
            self.best_score = loss_sum
            self.epoch_since_best = 0
        else:
            self.epoch_since_best += 1

        if self.epoch_since_best > self.patience:
            self._training_break = True

        self.epoch_id += 1

    def training_break(self, *args, **kwargs):
        """Return True once the patience has been exceeded."""
        return self._training_break
205 | 
206 | 
class ExponentialLRScheduler(Callback):
    """Step an ``ExponentialLR`` scheduler on selected epochs and/or batches.

    Args:
        gamma: decay factor passed to ``ExponentialLR``.
        epoch_every: step when ``epoch_id + 1`` is a multiple of this value;
            0 or None disables epoch stepping.
        batch_every: step when ``batch_id`` is a multiple of this value;
            0 or None disables batch stepping.
    """

    def __init__(self, gamma, epoch_every=1, batch_every=None):
        super().__init__()
        self.gamma = gamma
        self.epoch_every = False if epoch_every == 0 else epoch_every
        self.batch_every = False if batch_every == 0 else batch_every

    def set_params(self, transformer, validation_datagen):
        """Take model, optimizer and loss from ``transformer`` and create the scheduler."""
        self.validation_datagen = validation_datagen
        self.model = transformer.model
        self.optimizer = transformer.optimizer
        self.loss_function = transformer.loss_function
        self.lr_scheduler = ExponentialLR(self.optimizer, self.gamma, last_epoch=-1)

    def _first_group(self):
        """Return the first parameter group of the optimizer's state dict."""
        return self.optimizer.state_dict()['param_groups'][0]

    def on_train_begin(self, *args, **kwargs):
        """Reset the counters and log the initial learning rate."""
        self.epoch_id = 0
        self.batch_id = 0
        logger.info('initial lr: {0}'.format(self._first_group()['initial_lr']))

    def on_epoch_end(self, *args, **kwargs):
        """Step the scheduler on due epochs, log the learning rate, advance the epoch counter."""
        finished_epochs = self.epoch_id + 1
        if self.epoch_every and ((finished_epochs % self.epoch_every) == 0):
            self.lr_scheduler.step()
            logger.info('epoch {0} current lr: {1}'.format(self.epoch_id + 1,
                                                           self._first_group()['lr']))
        self.epoch_id += 1

    def on_batch_end(self, *args, **kwargs):
        """Step the scheduler on due batches, log the learning rate, advance the batch counter."""
        if self.batch_every and ((self.batch_id % self.batch_every) == 0):
            self.lr_scheduler.step()
            message = 'epoch {0} batch {1} current lr: {2}'.format(
                self.epoch_id + 1, self.batch_id + 1, self._first_group()['lr'])
            logger.info(message)
        self.batch_id += 1
245 | 
246 | 
class ModelCheckpoint(Callback):
    """Callback that persists the model whenever the validation loss improves.

    Every ``epoch_every``-th epoch the validation loss is computed and its
    ``'sum'`` entry is compared with the best value seen so far. The model is
    written to ``filepath`` when the value improved, and always at epoch 0.

    Args:
        filepath: destination path handed to ``persist_torch_model``.
        epoch_every: epoch interval between evaluations; ``0`` is stored as
            ``False`` and disables checkpointing.
        minimize: when true a lower score is better, otherwise a higher one.

    NOTE(review): ``self.model`` and ``self.get_validation_loss`` are not
    defined in this class; presumably they come from the ``Callback`` base -
    confirm against its implementation.
    """

    def __init__(self, filepath, epoch_every=1, minimize=True):
        super().__init__()
        self.filepath = filepath
        self.minimize = minimize
        self.best_score = None

        if epoch_every == 0:
            self.epoch_every = False
        else:
            self.epoch_every = epoch_every

    def on_train_begin(self, *args, **kwargs):
        """Reset the counters and make sure the checkpoint directory exists."""
        self.epoch_id = 0
        self.batch_id = 0
        # dirname() is '' for a bare file name and os.makedirs('') raises,
        # so only create the directory when the path actually has one.
        directory = os.path.dirname(self.filepath)
        if directory:
            os.makedirs(directory, exist_ok=True)

    def on_epoch_end(self, *args, **kwargs):
        """Evaluate on validation data and persist the model if the score improved."""
        if self.epoch_every and ((self.epoch_id % self.epoch_every) == 0):
            self.model.eval()
            val_loss = self.get_validation_loss()
            loss_sum = val_loss['sum']
            loss_sum = loss_sum.data.cpu().numpy()[0]

            self.model.train()

            # Compare against None explicitly: a legitimate best score of 0.0
            # is falsy and must not be mistaken for "no score recorded yet".
            if self.best_score is None:
                self.best_score = loss_sum

            if self.minimize:
                improved = loss_sum < self.best_score
            else:
                improved = loss_sum > self.best_score

            # Epoch 0 always persists so that a checkpoint exists from the start.
            if improved or self.epoch_id == 0:
                self.best_score = loss_sum
                persist_torch_model(self.model, self.filepath)
                logger.info('epoch {0} model persisted to {1}'.format(self.epoch_id, self.filepath))

        self.epoch_id += 1
283 | 
284 | 
class NeptuneMonitor(Callback):
    """Callback that streams batch, epoch and validation losses to Neptune channels.

    Args:
        model_name: prefix used in every channel name sent to Neptune.

    NOTE(review): ``self.model`` and ``self.get_validation_loss`` are not
    defined here; presumably they are supplied by the ``Callback`` base.
    """

    def __init__(self, model_name):
        super().__init__()
        self.model_name = model_name
        self.ctx = neptune.Context()
        self.epoch_loss_averager = Averager()

    def on_train_begin(self, *args, **kwargs):
        """Start with an empty set of per-metric averagers and zeroed counters."""
        self.epoch_loss_averagers = {}
        self.epoch_id = 0
        self.batch_id = 0

    def on_batch_end(self, metrics, *args, **kwargs):
        """Send every batch metric to Neptune and feed it into its epoch averager."""
        for metric_name, metric_value in metrics.items():
            batch_loss = metric_value.data.cpu().numpy()[0]

            # Lazily create one averager per metric name.
            if metric_name not in self.epoch_loss_averagers:
                self.epoch_loss_averagers[metric_name] = Averager()
            self.epoch_loss_averagers[metric_name].send(batch_loss)

            channel = '{} batch {} loss'.format(self.model_name, metric_name)
            self.ctx.channel_send(channel, x=self.batch_id, y=batch_loss)

        self.batch_id += 1

    def on_epoch_end(self, *args, **kwargs):
        """Flush the epoch-level channels, then advance the epoch counter."""
        self._send_numeric_channels()
        self.epoch_id += 1

    def _send_numeric_channels(self, *args, **kwargs):
        """Send averaged training losses and fresh validation losses for the current epoch."""
        for metric_name, averager in self.epoch_loss_averagers.items():
            # Read the average before resetting the averager for the next epoch.
            mean_loss = averager.value
            averager.reset()
            channel = '{} epoch {} loss'.format(self.model_name, metric_name)
            self.ctx.channel_send(channel, x=self.epoch_id, y=mean_loss)

        # Validation runs in eval mode; training mode is restored right after.
        self.model.eval()
        validation_losses = self.get_validation_loss()
        self.model.train()
        for metric_name, metric_value in validation_losses.items():
            validation_loss = metric_value.data.cpu().numpy()[0]
            channel = '{} epoch_val {} loss'.format(self.model_name, metric_name)
            self.ctx.channel_send(channel, x=self.epoch_id, y=validation_loss)
327 | 
328 | 
class ExperimentTiming(Callback):
    """Callback that logs epoch durations and average batch durations.

    Args:
        epoch_every: epoch interval between epoch-time log lines; ``0`` is
            stored as ``False``, ``None`` is stored unchanged (both disable it).
        batch_every: batch interval between batch log lines; ``0`` is stored
            as ``False``, ``None`` is stored unchanged (both disable it).

    NOTE(review): this class never increments ``epoch_id`` / ``batch_id``
    itself; presumably the ``Callback`` base does so - confirm.
    """

    def __init__(self, epoch_every=None, batch_every=None):
        super().__init__()
        self.epoch_every = False if epoch_every == 0 else epoch_every
        self.batch_every = False if batch_every == 0 else batch_every
        self.batch_start = None
        self.epoch_start = None
        self.current_sum = None
        self.current_mean = None

    def on_train_begin(self, *args, **kwargs):
        """Zero the counters and announce the start of training."""
        self.epoch_id = 0
        self.batch_id = 0
        logger.info('starting training...')

    def on_train_end(self, *args, **kwargs):
        """Announce the end of training."""
        logger.info('training finished')

    def on_epoch_begin(self, *args, **kwargs):
        """Log the duration of the previous epoch (if due) and restart the epoch timers."""
        if self.epoch_id > 0:
            elapsed = datetime.now() - self.epoch_start
            if self.epoch_every and (self.epoch_id % self.epoch_every) == 0:
                # [:-7] drops the trailing characters of the timedelta string
                # (the fractional-seconds part when it is present).
                logger.info('epoch {0} time {1}'.format(self.epoch_id - 1, str(elapsed)[:-7]))
        self.epoch_start = datetime.now()
        self.current_sum = timedelta()
        self.current_mean = timedelta()
        logger.info('epoch {0} ...'.format(self.epoch_id))

    def on_batch_begin(self, *args, **kwargs):
        """Update the running batch-time average, log if due, and restart the batch timer."""
        batches_seen = self.batch_id
        if batches_seen > 0:
            self.current_sum += datetime.now() - self.batch_start
            self.current_mean = self.current_sum / batches_seen
        if self.batch_every:
            if batches_seen > 0 and ((batches_seen - 1) % self.batch_every) == 0:
                mean_text = str(self.current_mean)[:-5]
                logger.info('epoch {0} average batch time: {1}'.format(self.epoch_id, mean_text))
            if batches_seen == 0 or batches_seen % self.batch_every == 0:
                logger.info('epoch {0} batch {1} ...'.format(self.epoch_id, batches_seen))
        self.batch_start = datetime.now()
376 | 
377 | 
class ReduceLROnPlateau(Callback):  # thank you keras
    """Placeholder callback; it adds no behavior beyond initializing the base ``Callback``."""

    def __init__(self):
        super().__init__()
382 | 


--------------------------------------------------------------------------------
/toolkit/pytorch_transformers/architectures/unet.py:
--------------------------------------------------------------------------------
  1 | import warnings
  2 | 
  3 | import torch
  4 | import torch.nn as nn
  5 | 
  6 | from toolkit.pytorch_transformers.architectures.utils import get_downsample_pad, get_upsample_pad
  7 | 
  8 | 
  9 | class UNet(nn.Module):
 10 |     def __init__(self, conv_kernel=3,
 11 |                  pool_kernel=3, pool_stride=2,
 12 |                  repeat_blocks=2, n_filters=8,
 13 |                  batch_norm=True, dropout=0.1,
 14 |                  in_channels=3, out_channels=2,
 15 |                  kernel_scale=3,
 16 |                  **kwargs):
 17 | 
 18 |         assert conv_kernel % 2 == 1, "Size of convolution kernel has to be an odd number. " \
 19 |                                      "Otherwise convolution layer will not keep image size"
 20 |         assert pool_stride > 1 or pool_kernel % 2 == 1, "Pooling layer stride has to be greater than one or" \
 21 |                                                         "kernel of pooling layer has to be an odd number."
 22 |         warnings.warn("Please make sure, that your input tensor's dimensions are divisible by "
 23 |                       "(pool_stride ** repeat_blocks)")
 24 | 
 25 |         super(UNet, self).__init__()
 26 | 
 27 |         self.conv_kernel = conv_kernel
 28 |         self.conv_stride = 1
 29 |         self.pool_kernel = pool_kernel
 30 |         self.pool_stride = pool_stride
 31 |         self.repeat_blocks = repeat_blocks
 32 |         self.n_filters = n_filters
 33 |         self.batch_norm = batch_norm
 34 |         self.dropout = dropout
 35 |         self.in_channels = in_channels
 36 |         self.out_channels = out_channels
 37 |         self.kernel_scale = kernel_scale
 38 | 
 39 |         self.input_block = self._input_block()
 40 |         self.down_convs = self._down_convs()
 41 |         self.down_pools = self._down_pools()
 42 |         self.floor_block = self._floor_block()
 43 |         self.up_convs = self._up_convs()
 44 |         self.up_samples = self._up_samples()
 45 |         self.classification_block = self._classification_block()
 46 |         self.output_layer = self._output_layer()
 47 | 
 48 |     def _down_convs(self):
 49 |         down_convs = []
 50 |         for i in range(self.repeat_blocks):
 51 |             in_channels = int(self.n_filters * 2 ** i)
 52 |             down_convs.append(DownConv(in_channels, self.conv_kernel, self.batch_norm, self.dropout))
 53 |         return nn.ModuleList(down_convs)
 54 | 
 55 |     def _up_convs(self):
 56 |         up_convs = []
 57 |         for i in range(self.repeat_blocks):
 58 |             in_channels = int(self.n_filters * 2 ** (i + 2))
 59 |             up_convs.append(UpConv(in_channels, self.conv_kernel, self.batch_norm, self.dropout))
 60 |         return nn.ModuleList(up_convs)
 61 | 
 62 |     def _down_pools(self):
 63 |         down_pools = []
 64 |         padding = get_downsample_pad(stride=self.pool_stride, kernel=self.pool_kernel)
 65 |         for _ in range(self.repeat_blocks):
 66 |             down_pools.append(nn.MaxPool2d(kernel_size=self.pool_kernel,
 67 |                                            stride=self.pool_stride,
 68 |                                            padding=padding))
 69 |         return nn.ModuleList(down_pools)
 70 | 
 71 |     def _up_samples(self):
 72 |         up_samples = []
 73 |         kernel_scale = self.kernel_scale
 74 |         stride = self.pool_stride
 75 |         kernel_size = kernel_scale * stride
 76 |         padding, output_padding = get_upsample_pad(stride=stride, kernel=kernel_size)
 77 |         for i in range(self.repeat_blocks):
 78 |             in_channels = int(self.n_filters * 2 ** (i + 2))
 79 |             out_channels = int(self.n_filters * 2 ** (i + 1))
 80 |             up_samples.append(nn.ConvTranspose2d(in_channels=in_channels,
 81 |                                                  out_channels=out_channels,
 82 |                                                  kernel_size=kernel_size,
 83 |                                                  stride=stride,
 84 |                                                  padding=padding,
 85 |                                                  output_padding=output_padding,
 86 |                                                  bias=False
 87 |                                                  ))
 88 |         return nn.ModuleList(up_samples)
 89 | 
 90 |     def _input_block(self):
 91 |         stride = self.conv_stride
 92 |         padding = get_downsample_pad(stride=stride, kernel=self.conv_kernel)
 93 |         if self.batch_norm:
 94 |             input_block = nn.Sequential(nn.Conv2d(in_channels=self.in_channels, out_channels=self.n_filters,
 95 |                                                   kernel_size=(self.conv_kernel, self.conv_kernel),
 96 |                                                   stride=stride, padding=padding),
 97 |                                         nn.BatchNorm2d(num_features=self.n_filters),
 98 |                                         nn.ReLU(),
 99 | 
100 |                                         nn.Conv2d(in_channels=self.n_filters, out_channels=self.n_filters,
101 |                                                   kernel_size=(self.conv_kernel, self.conv_kernel),
102 |                                                   stride=stride, padding=padding),
103 |                                         nn.BatchNorm2d(num_features=self.n_filters),
104 |                                         nn.ReLU(),
105 | 
106 |                                         nn.Dropout(self.dropout),
107 |                                         )
108 |         else:
109 |             input_block = nn.Sequential(nn.Conv2d(in_channels=self.in_channels, out_channels=self.n_filters,
110 |                                                   kernel_size=(self.conv_kernel, self.conv_kernel),
111 |                                                   stride=stride, padding=padding),
112 |                                         nn.ReLU(),
113 | 
114 |                                         nn.Conv2d(in_channels=self.n_filters, out_channels=self.n_filters,
115 |                                                   kernel_size=(self.conv_kernel, self.conv_kernel),
116 |                                                   stride=stride, padding=padding),
117 |                                         nn.ReLU(),
118 | 
119 |                                         nn.Dropout(self.dropout),
120 |                                         )
121 |         return input_block
122 | 
123 |     def _floor_block(self):
124 |         in_channels = int(self.n_filters * 2 ** self.repeat_blocks)
125 |         return nn.Sequential(DownConv(in_channels, self.conv_kernel, self.batch_norm, self.dropout),
126 |                              )
127 | 
128 |     def _classification_block(self):
129 |         in_block = int(2 * self.n_filters)
130 |         stride = self.conv_stride
131 |         padding = get_downsample_pad(stride=stride, kernel=self.conv_kernel)
132 | 
133 |         if self.batch_norm:
134 |             classification_block = nn.Sequential(nn.Conv2d(in_channels=in_block, out_channels=self.n_filters,
135 |                                                            kernel_size=(self.conv_kernel, self.conv_kernel),
136 |                                                            stride=stride, padding=padding),
137 |                                                  nn.BatchNorm2d(num_features=self.n_filters),
138 |                                                  nn.ReLU(),
139 |                                                  nn.Dropout(self.dropout),
140 | 
141 |                                                  nn.Conv2d(in_channels=self.n_filters, out_channels=self.n_filters,
142 |                                                            kernel_size=(self.conv_kernel, self.conv_kernel),
143 |                                                            stride=stride, padding=padding),
144 |                                                  nn.BatchNorm2d(num_features=self.n_filters),
145 |                                                  nn.ReLU(),
146 |                                                  )
147 |         else:
148 |             classification_block = nn.Sequential(nn.Conv2d(in_channels=in_block, out_channels=self.n_filters,
149 |                                                            kernel_size=(self.conv_kernel, self.conv_kernel),
150 |                                                            stride=stride, padding=padding),
151 |                                                  nn.ReLU(),
152 |                                                  nn.Dropout(self.dropout),
153 | 
154 |                                                  nn.Conv2d(in_channels=self.n_filters, out_channels=self.n_filters,
155 |                                                            kernel_size=(self.conv_kernel, self.conv_kernel),
156 |                                                            stride=stride, padding=padding),
157 |                                                  nn.ReLU(),
158 |                                                  )
159 |         return classification_block
160 | 
161 |     def _output_layer(self):
162 |         return nn.Conv2d(in_channels=self.n_filters, out_channels=self.out_channels,
163 |                          kernel_size=(1, 1), stride=1, padding=0)
164 | 
165 |     def forward(self, x):
166 |         x = self.input_block(x)
167 | 
168 |         down_convs_outputs = []
169 |         for block, down_pool in zip(self.down_convs, self.down_pools):
170 |             x = block(x)
171 |             down_convs_outputs.append(x)
172 |             x = down_pool(x)
173 |         x = self.floor_block(x)
174 | 
175 |         for down_conv_output, block, up_sample in zip(reversed(down_convs_outputs),
176 |                                                       reversed(self.up_convs),
177 |                                                       reversed(self.up_samples)):
178 |             x = up_sample(x)
179 |             x = torch.cat((down_conv_output, x), dim=1)
180 | 
181 |             x = block(x)
182 | 
183 |         x = self.classification_block(x)
184 |         x = self.output_layer(x)
185 |         return x
186 | 
187 | 
class UNetMultitask(UNet):
    """``UNet`` variant with ``nr_outputs`` independent output layers.

    The shared body is the parent network up to and including the
    classification block; each output leg is a separate layer created by
    ``_output_layer()``.

    Args:
        conv_kernel, pool_kernel, pool_stride, repeat_blocks, n_filters,
        batch_norm, dropout, in_channels, out_channels: forwarded unchanged
            to ``UNet.__init__``.
        nr_outputs: number of output legs to create.
    """

    def __init__(self,
                 conv_kernel,
                 pool_kernel,
                 pool_stride,
                 repeat_blocks,
                 n_filters,
                 batch_norm,
                 dropout,
                 in_channels,
                 out_channels,
                 nr_outputs):
        super().__init__(conv_kernel=conv_kernel,
                         pool_kernel=pool_kernel,
                         pool_stride=pool_stride,
                         repeat_blocks=repeat_blocks,
                         n_filters=n_filters,
                         batch_norm=batch_norm,
                         dropout=dropout,
                         in_channels=in_channels,
                         out_channels=out_channels)
        self.nr_outputs = nr_outputs
        self.output_legs = nn.ModuleList([self._output_layer() for _ in range(self.nr_outputs)])

    def forward(self, x):
        """Run the shared body on ``x`` and return a list with one tensor per output leg."""
        features = self.input_block(x)

        # Encoder: keep each stage output for the skip connections.
        skip_connections = []
        for encoder_stage, pooling in zip(self.down_convs, self.down_pools):
            features = encoder_stage(features)
            skip_connections.append(features)
            features = pooling(features)
        features = self.floor_block(features)

        # Decoder: deepest stage first.
        decoder_stages = zip(reversed(skip_connections),
                             reversed(self.up_convs),
                             reversed(self.up_samples))
        for skip, decoder_stage, upsampling in decoder_stages:
            features = upsampling(features)
            features = torch.cat((skip, features), dim=1)
            features = decoder_stage(features)

        features = self.classification_block(features)

        return [leg(features) for leg in self.output_legs]
237 | 
238 | 
class DownConv(nn.Module):
    """Two stride-1 convolutions that double the number of channels, followed by dropout.

    Args:
        in_channels: number of input channels; the block outputs
            ``int(in_channels * 2.)`` channels.
        kernel_size: side of the square convolution kernel.
        batch_norm: when true, ``BatchNorm2d`` follows each convolution.
        dropout: probability used by the trailing ``Dropout`` layer.
    """

    def __init__(self, in_channels, kernel_size, batch_norm, dropout):
        super(DownConv, self).__init__()
        self.in_channels = in_channels
        self.block_channels = int(in_channels * 2.)
        self.kernel_size = kernel_size
        self.batch_norm = batch_norm
        self.dropout = dropout
        self.conv_stride = 1

        self.down_conv = self._down_conv()

    def _down_conv(self):
        """Assemble ``conv -> [bn] -> relu`` twice, then dropout, as one ``nn.Sequential``."""
        padding = get_downsample_pad(stride=self.conv_stride, kernel=self.kernel_size)
        layers = []
        # The first convolution changes the channel count, the second keeps it.
        for conv_inputs in (self.in_channels, self.block_channels):
            layers.append(nn.Conv2d(in_channels=conv_inputs, out_channels=self.block_channels,
                                    kernel_size=(self.kernel_size, self.kernel_size),
                                    stride=self.conv_stride, padding=padding))
            if self.batch_norm:
                layers.append(nn.BatchNorm2d(num_features=self.block_channels))
            layers.append(nn.ReLU())
        layers.append(nn.Dropout(self.dropout))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Apply the convolutional block to ``x``."""
        return self.down_conv(x)
286 | 
287 | 
class UpConv(nn.Module):
    """Two stride-1 convolutions that halve the number of channels, followed by dropout.

    Args:
        in_channels: number of input channels; the block outputs
            ``int(in_channels / 2.)`` channels.
        kernel_size: side of the square convolution kernel.
        batch_norm: when true, ``BatchNorm2d`` follows each convolution.
        dropout: probability used by the trailing ``Dropout`` layer.
    """

    def __init__(self, in_channels, kernel_size, batch_norm, dropout):
        super(UpConv, self).__init__()
        self.in_channels = in_channels
        self.block_channels = int(in_channels / 2.)
        self.kernel_size = kernel_size
        self.batch_norm = batch_norm
        self.dropout = dropout
        self.conv_stride = 1

        self.up_conv = self._up_conv()

    def _up_conv(self):
        """Assemble ``conv -> [bn] -> relu`` twice, then dropout, as one ``nn.Sequential``."""
        padding = get_downsample_pad(stride=self.conv_stride, kernel=self.kernel_size)
        layers = []
        # The first convolution changes the channel count, the second keeps it.
        for conv_inputs in (self.in_channels, self.block_channels):
            layers.append(nn.Conv2d(in_channels=conv_inputs, out_channels=self.block_channels,
                                    kernel_size=(self.kernel_size, self.kernel_size),
                                    stride=self.conv_stride, padding=padding))
            if self.batch_norm:
                layers.append(nn.BatchNorm2d(num_features=self.block_channels))
            layers.append(nn.ReLU())
        layers.append(nn.Dropout(self.dropout))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Apply the convolutional block to ``x``."""
        return self.up_conv(x)
336 | 


--------------------------------------------------------------------------------
/toolkit/keras_transformers/architectures.py:
--------------------------------------------------------------------------------
  1 | from keras import regularizers
  2 | from keras.activations import relu
  3 | from keras.layers import Input, Embedding, PReLU, Bidirectional, Lambda, \
  4 |     CuDNNLSTM, CuDNNGRU, Conv1D, Dense, BatchNormalization, Dropout, SpatialDropout1D, \
  5 |     GlobalMaxPool1D, GlobalAveragePooling1D, MaxPooling1D
  6 | from keras.layers.merge import add, concatenate
  7 | from keras.models import Model
  8 | 
  9 | from toolkit.keras_transformers.contrib import AttentionWeightedAverage
 10 | 
 11 | 
 12 | def scnn(embedding_matrix, embedding_size, trainable_embedding, maxlen, max_features,
 13 |          filter_nr, kernel_size, repeat_block, dense_size, repeat_dense, output_size, output_activation,
 14 |          max_pooling, mean_pooling, weighted_average_attention, concat_mode,
 15 |          dropout_embedding, conv_dropout, dense_dropout, dropout_mode,
 16 |          conv_kernel_reg_l2, conv_bias_reg_l2,
 17 |          dense_kernel_reg_l2, dense_bias_reg_l2,
 18 |          use_prelu, use_batch_norm, batch_norm_first):
 19 |     input_text = Input(shape=(maxlen,))
 20 |     x = Embedding(max_features, embedding_size, weights=[embedding_matrix], trainable=trainable_embedding)(
 21 |         input_text)
 22 | 
 23 |     x = dropout_block(dropout_embedding, dropout_mode)(x)
 24 | 
 25 |     for _ in range(repeat_block):
 26 |         x = convolutional_block(filter_nr, kernel_size, use_batch_norm, use_prelu, conv_dropout, dropout_mode,
 27 |                                 conv_kernel_reg_l2, conv_bias_reg_l2, batch_norm_first)(x)
 28 | 
 29 |     predictions = classification_block(dense_size=dense_size, repeat_dense=repeat_dense,
 30 |                                        output_size=output_size, output_activation=output_activation,
 31 |                                        max_pooling=max_pooling,
 32 |                                        mean_pooling=mean_pooling,
 33 |                                        weighted_average_attention=weighted_average_attention,
 34 |                                        concat_mode=concat_mode,
 35 |                                        dropout=dense_dropout,
 36 |                                        kernel_reg_l2=dense_kernel_reg_l2, bias_reg_l2=dense_bias_reg_l2,
 37 |                                        use_prelu=use_prelu, use_batch_norm=use_batch_norm,
 38 |                                        batch_norm_first=batch_norm_first)(x)
 39 |     model = Model(inputs=input_text, outputs=predictions)
 40 |     return model
 41 | 
 42 | 
 43 | def dpcnn(embedding_matrix, embedding_size, trainable_embedding, maxlen, max_features,
 44 |           filter_nr, kernel_size, repeat_block, dense_size, repeat_dense, output_size, output_activation,
 45 |           max_pooling, mean_pooling, weighted_average_attention, concat_mode,
 46 |           dropout_embedding, conv_dropout, dense_dropout, dropout_mode,
 47 |           conv_kernel_reg_l2, conv_bias_reg_l2,
 48 |           dense_kernel_reg_l2, dense_bias_reg_l2,
 49 |           use_prelu, use_batch_norm, batch_norm_first):
 50 |     """
 51 |     Note:
 52 |         Implementation of http://ai.tencent.com/ailab/media/publications/ACL3-Brady.pdf
 53 |         post activation is used instead of pre-activation, could be worth exploring
 54 |     """
 55 | 
 56 |     input_text = Input(shape=(maxlen,))
 57 |     if embedding_matrix is not None:
 58 |         embedding = Embedding(max_features, embedding_size,
 59 |                               weights=[embedding_matrix], trainable=trainable_embedding)(input_text)
 60 |     else:
 61 |         embedding = Embedding(max_features, embedding_size)(input_text)
 62 | 
 63 |     embedding = dropout_block(dropout_embedding, dropout_mode)(embedding)
 64 | 
 65 |     x = convolutional_block(filter_nr, kernel_size, use_batch_norm, use_prelu, conv_dropout, dropout_mode,
 66 |                             conv_kernel_reg_l2, conv_bias_reg_l2, batch_norm_first)(embedding)
 67 |     x = convolutional_block(filter_nr, kernel_size, conv_bias_reg_l2, use_prelu, conv_dropout, dropout_mode,
 68 |                             conv_kernel_reg_l2, conv_bias_reg_l2, batch_norm_first)(x)
 69 |     if embedding_size == filter_nr:
 70 |         x = add([embedding, x])
 71 |     else:
 72 |         embedding_resized = shape_matching_layer(filter_nr, use_prelu, conv_kernel_reg_l2, conv_bias_reg_l2)(embedding)
 73 |         x = add([embedding_resized, x])
 74 |     for _ in range(repeat_block):
 75 |         x = dpcnn_block(filter_nr, kernel_size, use_batch_norm, use_prelu, conv_dropout, dropout_mode,
 76 |                         conv_kernel_reg_l2, conv_bias_reg_l2, batch_norm_first)(x)
 77 | 
 78 |     predictions = classification_block(dense_size=dense_size, repeat_dense=repeat_dense,
 79 |                                        output_size=output_size, output_activation=output_activation,
 80 |                                        max_pooling=max_pooling,
 81 |                                        mean_pooling=mean_pooling,
 82 |                                        weighted_average_attention=weighted_average_attention,
 83 |                                        concat_mode=concat_mode,
 84 |                                        dropout=dense_dropout,
 85 |                                        kernel_reg_l2=dense_kernel_reg_l2, bias_reg_l2=dense_bias_reg_l2,
 86 |                                        use_prelu=use_prelu, use_batch_norm=use_batch_norm,
 87 |                                        batch_norm_first=batch_norm_first)(x)
 88 |     model = Model(inputs=input_text, outputs=predictions)
 89 |     return model
 90 | 
 91 | 
 92 | def cudnn_lstm(embedding_matrix, embedding_size, trainable_embedding,
 93 |                maxlen, max_features,
 94 |                unit_nr, repeat_block,
 95 |                dense_size, repeat_dense, output_size, output_activation,
 96 |                max_pooling, mean_pooling, weighted_average_attention, concat_mode,
 97 |                dropout_embedding, rnn_dropout, dense_dropout, dropout_mode,
 98 |                rnn_kernel_reg_l2, rnn_recurrent_reg_l2, rnn_bias_reg_l2,
 99 |                dense_kernel_reg_l2, dense_bias_reg_l2,
100 |                use_prelu, use_batch_norm, batch_norm_first):
101 |     input_text = Input(shape=(maxlen,))
102 |     if embedding_matrix is not None:
103 |         x = Embedding(max_features,
104 |                       embedding_size,
105 |                       weights=[embedding_matrix],
106 |                       trainable=trainable_embedding)(input_text)
107 |     else:
108 |         x = Embedding(max_features,
109 |                       embedding_size)(input_text)
110 | 
111 |     x = dropout_block(dropout_embedding, dropout_mode)(x)
112 | 
113 |     for _ in range(repeat_block):
114 |         x = cudnn_lstm_block(unit_nr=unit_nr, return_sequences=True, bidirectional=True,
115 |                             kernel_reg_l2=rnn_kernel_reg_l2,
116 |                             recurrent_reg_l2=rnn_recurrent_reg_l2,
117 |                             bias_reg_l2=rnn_bias_reg_l2,
118 |                             use_batch_norm=use_batch_norm, batch_norm_first=batch_norm_first,
119 |                             dropout=rnn_dropout, dropout_mode=dropout_mode, use_prelu=use_prelu)(x)
120 | 
121 |     predictions = classification_block(dense_size=dense_size, repeat_dense=repeat_dense,
122 |                                        output_size=output_size, output_activation=output_activation,
123 |                                        max_pooling=max_pooling,
124 |                                        mean_pooling=mean_pooling,
125 |                                        weighted_average_attention=weighted_average_attention,
126 |                                        concat_mode=concat_mode,
127 |                                        dropout=dense_dropout,
128 |                                        kernel_reg_l2=dense_kernel_reg_l2, bias_reg_l2=dense_bias_reg_l2,
129 |                                        use_prelu=use_prelu, use_batch_norm=use_batch_norm,
130 |                                        batch_norm_first=batch_norm_first)(x)
131 |     model = Model(inputs=input_text, outputs=predictions)
132 |     return model
133 | 
134 | 
def cudnn_gru(embedding_matrix, embedding_size, trainable_embedding,
              maxlen, max_features,
              unit_nr, repeat_block,
              dense_size, repeat_dense, output_size, output_activation,
              max_pooling, mean_pooling, weighted_average_attention, concat_mode,
              dropout_embedding, rnn_dropout, dense_dropout, dropout_mode,
              rnn_kernel_reg_l2, rnn_recurrent_reg_l2, rnn_bias_reg_l2,
              dense_kernel_reg_l2, dense_bias_reg_l2,
              use_prelu, use_batch_norm, batch_norm_first):
    """Assemble a text classifier made of stacked bidirectional CuDNNGRU blocks.

    The graph is: integer token input of length ``maxlen`` -> embedding ->
    embedding dropout -> ``repeat_block`` bidirectional GRU blocks (each one
    returning full sequences) -> ``classification_block`` head.

    When ``embedding_matrix`` is not None it is used as the initial embedding
    weights and ``trainable_embedding`` decides whether they are updated;
    otherwise the embedding layer is created with its default initialisation.

    Returns:
        A ``Model`` mapping the token input to the classification output.
    """
    input_text = Input(shape=(maxlen,))

    # Pre-trained weights (and their trainability) are only passed when provided.
    embedding_kwargs = {}
    if embedding_matrix is not None:
        embedding_kwargs = dict(weights=[embedding_matrix],
                                trainable=trainable_embedding)
    features = Embedding(max_features, embedding_size, **embedding_kwargs)(input_text)

    features = dropout_block(dropout_embedding, dropout_mode)(features)

    recurrent_kwargs = dict(unit_nr=unit_nr, return_sequences=True, bidirectional=True,
                            kernel_reg_l2=rnn_kernel_reg_l2,
                            recurrent_reg_l2=rnn_recurrent_reg_l2,
                            bias_reg_l2=rnn_bias_reg_l2,
                            use_batch_norm=use_batch_norm, batch_norm_first=batch_norm_first,
                            dropout=rnn_dropout, dropout_mode=dropout_mode, use_prelu=use_prelu)
    for _ in range(repeat_block):
        features = cudnn_gru_block(**recurrent_kwargs)(features)

    head = classification_block(dense_size=dense_size, repeat_dense=repeat_dense,
                                output_size=output_size, output_activation=output_activation,
                                max_pooling=max_pooling,
                                mean_pooling=mean_pooling,
                                weighted_average_attention=weighted_average_attention,
                                concat_mode=concat_mode,
                                dropout=dense_dropout,
                                kernel_reg_l2=dense_kernel_reg_l2, bias_reg_l2=dense_bias_reg_l2,
                                use_prelu=use_prelu, use_batch_norm=use_batch_norm,
                                batch_norm_first=batch_norm_first)
    predictions = head(features)
    return Model(inputs=input_text, outputs=predictions)
176 | 
177 | 
def vdcnn(embedding_size, maxlen, max_features,
          filter_nr, kernel_size, repeat_block, dense_size, repeat_dense, output_size, output_activation,
          max_pooling, mean_pooling, weighted_average_attention, concat_mode,
          dropout_embedding, conv_dropout, dense_dropout, dropout_mode,
          conv_kernel_reg_l2, conv_bias_reg_l2,
          dense_kernel_reg_l2, dense_bias_reg_l2,
          use_prelu, use_batch_norm, batch_norm_first):
    """Build a VDCNN-style convolutional text classifier.

    Note:
        Implementation of http://www.aclweb.org/anthology/E17-1104
        We didn't use k-max pooling but GlobalMaxPool1D at the end and didn't explore it in the
        intermediate layers.

    The graph is: token input -> embedding -> embedding dropout -> one
    convolutional block -> ``repeat_block`` residual ``vdcnn_block``s (only
    the final one is flagged as ``last_block``) -> ``classification_block``.

    Returns:
        A ``Model`` mapping the token input to the classification output.
    """

    input_text = Input(shape=(maxlen,))
    features = Embedding(input_dim=max_features, output_dim=embedding_size)(input_text)

    features = dropout_block(dropout_embedding, dropout_mode)(features)

    features = convolutional_block(filter_nr, kernel_size, use_batch_norm, use_prelu, conv_dropout,
                                   dropout_mode, conv_kernel_reg_l2, conv_bias_reg_l2,
                                   batch_norm_first)(features)

    for block_idx in range(repeat_block):
        # Only the final residual block is marked as the last one.
        is_last = (block_idx + 1 == repeat_block)
        features = vdcnn_block(filter_nr, kernel_size, use_batch_norm, use_prelu, conv_dropout,
                               dropout_mode, conv_kernel_reg_l2, conv_bias_reg_l2, batch_norm_first,
                               last_block=is_last)(features)

    head = classification_block(dense_size=dense_size, repeat_dense=repeat_dense,
                                output_size=output_size, output_activation=output_activation,
                                max_pooling=max_pooling,
                                mean_pooling=mean_pooling,
                                weighted_average_attention=weighted_average_attention,
                                concat_mode=concat_mode,
                                dropout=dense_dropout,
                                kernel_reg_l2=dense_kernel_reg_l2, bias_reg_l2=dense_bias_reg_l2,
                                use_prelu=use_prelu, use_batch_norm=use_batch_norm,
                                batch_norm_first=batch_norm_first)
    predictions = head(features)
    return Model(inputs=input_text, outputs=predictions)
220 | 
221 | 
def classification_block(dense_size, repeat_dense, output_size, output_activation,
                         max_pooling, mean_pooling, weighted_average_attention, concat_mode,
                         dropout,
                         kernel_reg_l2, bias_reg_l2,
                         use_prelu, use_batch_norm, batch_norm_first):
    """Return a function that pools a sequence tensor and applies the dense head.

    The returned callable ``f(x)``:

    1. reduces ``x`` with every enabled pooling (global max, global average,
       attention weighted average);
    2. uses the single pooled tensor directly, or concatenates several pooled
       tensors along the last axis;
    3. applies ``repeat_dense`` dense blocks followed by a final ``Dense``
       layer with ``output_size`` units and ``output_activation``.

    Args:
        concat_mode: how several pooled tensors are merged; only ``'concat'``
            is implemented. Ignored when exactly one pooling is enabled.

    Raises:
        ValueError: if none of the pooling options is enabled.
        NotImplementedError: if more than one pooling is enabled and
            ``concat_mode`` is not ``'concat'``.
    """

    def f(x):
        # Validate the configuration before any layer is created, so a bad
        # setup fails with a clear message instead of deep inside Keras.
        enabled = sum(1 for flag in (max_pooling, mean_pooling, weighted_average_attention)
                      if flag)
        if enabled == 0:
            raise ValueError('at least one of max_pooling, mean_pooling, '
                             'weighted_average_attention must be enabled')
        if enabled > 1 and concat_mode != 'concat':
            # The original code built this exception without raising it.
            raise NotImplementedError('only mode concat for now')

        pooled = []
        if max_pooling:
            pooled.append(GlobalMaxPool1D()(x))
        if mean_pooling:
            pooled.append(GlobalAveragePooling1D()(x))
        if weighted_average_attention:
            pooled.append(AttentionWeightedAverage()(x))

        if len(pooled) == 1:
            x = pooled[0]
        else:
            x = concatenate(pooled, axis=-1)

        for _ in range(repeat_dense):
            x = dense_block(dense_size=dense_size,
                            use_batch_norm=use_batch_norm,
                            use_prelu=use_prelu,
                            dropout=dropout,
                            kernel_reg_l2=kernel_reg_l2,
                            bias_reg_l2=bias_reg_l2,
                            batch_norm_first=batch_norm_first)(x)

        x = Dense(output_size, activation=output_activation)(x)
        return x

    return f
264 | 
265 | 
def dropout_block(dropout, dropout_mode):
    """Return a function applying the requested flavour of dropout to a tensor.

    Args:
        dropout: rate handed to the dropout layer.
        dropout_mode: ``'spatial'`` selects ``SpatialDropout1D``, ``'simple'``
            selects ``Dropout``.

    Raises:
        NotImplementedError: when the returned function is called and
            ``dropout_mode`` is neither ``'spatial'`` nor ``'simple'``.
    """

    def f(x):
        if dropout_mode == 'spatial':
            return SpatialDropout1D(dropout)(x)
        if dropout_mode == 'simple':
            return Dropout(dropout)(x)
        raise NotImplementedError('spatial and simple modes are supported')

    return f
277 | 
278 | 
def prelu_block(use_prelu):
    """Return a function applying the activation: PReLU if ``use_prelu``, else ReLU.

    The ReLU variant wraps the ``relu`` function in a ``Lambda`` layer.
    """

    def f(x):
        activation = PReLU() if use_prelu else Lambda(relu)
        return activation(x)

    return f
288 | 
289 | 
def bn_relu_dropout_block(use_batch_norm, use_prelu, dropout, dropout_mode, batch_norm_first):
    """Return a function applying activation and dropout with optional batch norm.

    With ``use_batch_norm`` set, batch normalisation runs either before the
    activation (``batch_norm_first`` truthy) or after the dropout
    (``batch_norm_first`` falsy). Without it, only activation and dropout run.
    """
    normalize_before = use_batch_norm and batch_norm_first
    normalize_after = use_batch_norm and not batch_norm_first

    def f(x):
        out = BatchNormalization()(x) if normalize_before else x

        out = prelu_block(use_prelu)(out)
        out = dropout_block(dropout, dropout_mode)(out)

        if normalize_after:
            out = BatchNormalization()(out)
        return out

    return f
303 | 
304 | 
def convolutional_block(filter_nr, kernel_size, use_batch_norm, use_prelu, dropout, dropout_mode,
                        kernel_reg_l2, bias_reg_l2, batch_norm_first):
    """Return a function applying a linear 'same'-padded Conv1D plus post-processing.

    The convolution uses L2 regularisation on kernel and bias and is followed
    by ``bn_relu_dropout_block`` (batch norm / activation / dropout).
    """

    def f(x):
        conv = Conv1D(filter_nr, kernel_size=kernel_size, padding='same', activation='linear',
                      kernel_regularizer=regularizers.l2(kernel_reg_l2),
                      bias_regularizer=regularizers.l2(bias_reg_l2))
        post_process = bn_relu_dropout_block(use_batch_norm=use_batch_norm,
                                             batch_norm_first=batch_norm_first,
                                             dropout=dropout,
                                             dropout_mode=dropout_mode,
                                             use_prelu=use_prelu)
        return post_process(conv(x))

    return f
319 | 
320 | 
def shape_matching_layer(filter_nr, use_prelu, kernel_reg_l2, bias_reg_l2):
    """Return a function applying a kernel-size-1 Conv1D followed by the activation.

    The convolution is linear, 'same'-padded, produces ``filter_nr`` filters
    and carries L2 regularisation on kernel and bias.
    """

    def f(x):
        pointwise_conv = Conv1D(filter_nr, kernel_size=1, padding='same', activation='linear',
                                kernel_regularizer=regularizers.l2(kernel_reg_l2),
                                bias_regularizer=regularizers.l2(bias_reg_l2))
        return prelu_block(use_prelu)(pointwise_conv(x))

    return f
330 | 
331 | 
def cudnn_lstm_block(unit_nr, return_sequences, bidirectional,
                     kernel_reg_l2, recurrent_reg_l2, bias_reg_l2,
                     use_batch_norm, batch_norm_first,
                     dropout, dropout_mode, use_prelu):
    """Return a function applying a (bi)directional CuDNNLSTM plus post-processing.

    Args:
        unit_nr: number of LSTM units.
        return_sequences: forwarded to ``CuDNNLSTM``.
        bidirectional: wrap the LSTM in ``Bidirectional`` when truthy.
        kernel_reg_l2, recurrent_reg_l2, bias_reg_l2: L2 factors for the
            kernel, recurrent and bias regularizers.
        use_batch_norm, batch_norm_first, dropout, dropout_mode, use_prelu:
            forwarded to ``bn_relu_dropout_block``, applied to the LSTM output.
    """

    def f(x):
        # The keyword must be ``units``; the former ``uunits`` typo raised a
        # TypeError as soon as the layer was constructed.
        lstm_layer = CuDNNLSTM(units=unit_nr, return_sequences=return_sequences,
                               kernel_regularizer=regularizers.l2(kernel_reg_l2),
                               recurrent_regularizer=regularizers.l2(recurrent_reg_l2),
                               bias_regularizer=regularizers.l2(bias_reg_l2)
                               )
        if bidirectional:
            x = Bidirectional(lstm_layer)(x)
        else:
            x = lstm_layer(x)
        x = bn_relu_dropout_block(use_batch_norm=use_batch_norm, batch_norm_first=batch_norm_first,
                                  dropout=dropout, dropout_mode=dropout_mode,
                                  use_prelu=use_prelu)(x)
        return x

    return f
352 | 
353 | 
def cudnn_gru_block(unit_nr, return_sequences, bidirectional,
                    kernel_reg_l2, recurrent_reg_l2, bias_reg_l2,
                    use_batch_norm, batch_norm_first,
                    dropout, dropout_mode, use_prelu):
    """Return a function applying a (bi)directional CuDNNGRU plus post-processing.

    The GRU carries L2 regularisation on kernel, recurrent kernel and bias.
    It is wrapped in ``Bidirectional`` when ``bidirectional`` is truthy, and
    its output goes through ``bn_relu_dropout_block``.
    """

    def f(x):
        recurrent = CuDNNGRU(units=unit_nr, return_sequences=return_sequences,
                             kernel_regularizer=regularizers.l2(kernel_reg_l2),
                             recurrent_regularizer=regularizers.l2(recurrent_reg_l2),
                             bias_regularizer=regularizers.l2(bias_reg_l2))
        if bidirectional:
            recurrent = Bidirectional(recurrent)
        encoded = recurrent(x)

        post_process = bn_relu_dropout_block(use_batch_norm=use_batch_norm,
                                             batch_norm_first=batch_norm_first,
                                             dropout=dropout, dropout_mode=dropout_mode,
                                             use_prelu=use_prelu)
        return post_process(encoded)

    return f
374 | 
375 | 
def dense_block(dense_size, use_batch_norm, use_prelu, dropout, kernel_reg_l2, bias_reg_l2,
                batch_norm_first):
    """Return a function applying a linear Dense layer plus post-processing.

    The Dense layer has ``dense_size`` units and L2 regularisation on kernel
    and bias. Post-processing is ``bn_relu_dropout_block`` with the dropout
    mode fixed to ``'simple'``.
    """

    def f(x):
        projection = Dense(dense_size, activation='linear',
                           kernel_regularizer=regularizers.l2(kernel_reg_l2),
                           bias_regularizer=regularizers.l2(bias_reg_l2))
        post_process = bn_relu_dropout_block(use_batch_norm=use_batch_norm,
                                             use_prelu=use_prelu,
                                             dropout=dropout,
                                             dropout_mode='simple',
                                             batch_norm_first=batch_norm_first)
        return post_process(projection(x))

    return f
391 | 
392 | 
def dpcnn_block(filter_nr, kernel_size, use_batch_norm, use_prelu, dropout, dropout_mode,
                kernel_reg_l2, bias_reg_l2, batch_norm_first):
    """Return a function applying a max-pool followed by a two-convolution residual.

    The input is first max-pooled (pool size 3, stride 2). The pooled tensor
    then feeds two consecutive convolutional blocks, whose output is added
    back to the pooled tensor.
    """

    def f(x):
        shortcut = MaxPooling1D(pool_size=3, strides=2)(x)

        branch = shortcut
        for _ in range(2):
            branch = convolutional_block(filter_nr, kernel_size, use_batch_norm, use_prelu,
                                         dropout, dropout_mode,
                                         kernel_reg_l2, bias_reg_l2, batch_norm_first)(branch)

        return add([branch, shortcut])

    return f
405 | 
406 | 
def vdcnn_block(filter_nr, kernel_size, use_batch_norm, use_prelu, dropout, dropout_mode,
                kernel_reg_l2, bias_reg_l2, batch_norm_first, last_block):
    """Return a function applying two residual convolutional steps.

    Each step adds the output of a convolutional block to that step's input.
    Unless ``last_block`` is truthy, the result is then max-pooled
    (pool size 3, stride 2).
    """

    def f(x):
        for _ in range(2):
            residual = convolutional_block(filter_nr, kernel_size, use_batch_norm, use_prelu,
                                           dropout, dropout_mode,
                                           kernel_reg_l2, bias_reg_l2, batch_norm_first)(x)
            x = add([residual, x])

        if last_block:
            return x
        return MaxPooling1D(pool_size=3, strides=2)(x)

    return f
421 | 


--------------------------------------------------------------------------------
/toolkit/pytorch_transformers/loaders/segmentation.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import multiprocessing as mp
  3 | from functools import partial
  4 | from itertools import product
  5 | 
  6 | import numpy as np
  7 | import torch
  8 | import torchvision.transforms as transforms
  9 | from PIL import Image
 10 | from attrdict import AttrDict
 11 | from imgaug import augmenters as iaa
 12 | from scipy.stats import gmean
 13 | from sklearn.externals import joblib
 14 | from steppy.base import BaseTransformer
 15 | from torch.utils.data import Dataset, DataLoader
 16 | from tqdm import tqdm
 17 | 
 18 | from toolkit.utils import from_pil, to_pil, binary_from_rle, ImgAug
 19 | 
 20 | 
 21 | class ImageReader(BaseTransformer):
 22 |     def __init__(self, train_mode, x_columns, y_columns, target_format='png'):
 23 |         self.train_mode = train_mode
 24 |         self.x_columns = x_columns
 25 |         self.y_columns = y_columns
 26 |         self.target_format = target_format
 27 | 
 28 |     def transform(self, meta):
 29 |         X_ = meta[self.x_columns].values
 30 | 
 31 |         X = self.load_images(X_, filetype='png', grayscale=False)
 32 |         if self.train_mode:
 33 |             y_ = meta[self.y_columns].values
 34 |             y = self.load_images(y_, filetype=self.target_format, grayscale=True)
 35 |         else:
 36 |             y = None
 37 | 
 38 |         return {'X': X,
 39 |                 'y': y}
 40 | 
 41 |     def load_images(self, filepaths, filetype, grayscale=False):
 42 |         X = []
 43 |         for i in range(filepaths.shape[1]):
 44 |             column = filepaths[:, i]
 45 |             X.append([])
 46 |             for filepath in tqdm(column):
 47 |                 if filetype == 'png':
 48 |                     data = self.load_image(filepath, grayscale=grayscale)
 49 |                 elif filetype == 'json':
 50 |                     data = self.read_json(filepath)
 51 |                 else:
 52 |                     raise Exception('files must be png or json')
 53 |                 X[i].append(data)
 54 |         return X
 55 | 
 56 |     def load_image(self, img_filepath, grayscale):
 57 |         image = Image.open(img_filepath, 'r')
 58 |         if not grayscale:
 59 |             image = image.convert('RGB')
 60 |         else:
 61 |             image = image.convert('L').point(lambda x: 0 if x < 128 else 255, '1')
 62 |         return image
 63 | 
 64 |     def read_json(self, path):
 65 |         with open(path, 'r') as file:
 66 |             data = json.load(file)
 67 |         masks = [to_pil(binary_from_rle(rle)) for rle in data]
 68 |         return masks
 69 | 
 70 | 
 71 | class XYSplit(BaseTransformer):
 72 |     def __init__(self, train_mode, x_columns, y_columns):
 73 |         self.train_mode = train_mode
 74 |         super().__init__()
 75 |         self.x_columns = x_columns
 76 |         self.y_columns = y_columns
 77 |         self.columns_to_get = None
 78 |         self.target_columns = None
 79 | 
 80 |     def transform(self, meta):
 81 |         X = meta[self.x_columns[0]].values
 82 |         if self.train_mode:
 83 |             y = meta[self.y_columns[0]].values
 84 |         else:
 85 |             y = None
 86 | 
 87 |         return {'X': X,
 88 |                 'y': y}
 89 | 
 90 | 
 91 | class ImageSegmentationBaseDataset(Dataset):
 92 |     def __init__(self, X, y, train_mode,
 93 |                  image_transform, image_augment_with_target,
 94 |                  mask_transform, image_augment,
 95 |                  image_source='memory'):
 96 |         super().__init__()
 97 |         self.X = X
 98 |         if y is not None:
 99 |             self.y = y
100 |         else:
101 |             self.y = None
102 | 
103 |         self.train_mode = train_mode
104 |         self.image_transform = image_transform
105 |         self.mask_transform = mask_transform
106 |         self.image_augment = image_augment if image_augment is not None else ImgAug(iaa.Noop())
107 |         self.image_augment_with_target = image_augment_with_target if image_augment_with_target is not None else ImgAug(
108 |             iaa.Noop())
109 | 
110 |         self.image_source = image_source
111 | 
112 |     def __len__(self):
113 |         if self.image_source == 'memory':
114 |             return len(self.X[0])
115 |         elif self.image_source == 'disk':
116 |             return self.X.shape[0]
117 | 
118 |     def __getitem__(self, index):
119 |         if self.image_source == 'memory':
120 |             load_func = self.load_from_memory
121 |         elif self.image_source == 'disk':
122 |             load_func = self.load_from_disk
123 |         else:
124 |             raise NotImplementedError("Possible loading options: 'memory' and 'disk'!")
125 | 
126 |         Xi = load_func(self.X, index, filetype='png', grayscale=False)
127 | 
128 |         if self.y is not None:
129 |             Mi = self.load_target(self.y, index, load_func)
130 |             Xi, *Mi = from_pil(Xi, *Mi)
131 |             Xi, *Mi = self.image_augment_with_target(Xi, *Mi)
132 |             Xi = self.image_augment(Xi)
133 |             Xi, *Mi = to_pil(Xi, *Mi)
134 | 
135 |             if self.mask_transform is not None:
136 |                 Mi = [self.mask_transform(m) for m in Mi]
137 | 
138 |             if self.image_transform is not None:
139 |                 Xi = self.image_transform(Xi)
140 | 
141 |             Mi = torch.cat(Mi, dim=0)
142 |             return Xi, Mi
143 |         else:
144 |             Xi = from_pil(Xi)
145 |             Xi = self.image_augment(Xi)
146 |             Xi = to_pil(Xi)
147 | 
148 |             if self.image_transform is not None:
149 |                 Xi = self.image_transform(Xi)
150 |             return Xi
151 | 
152 |     def load_from_memory(self, data_source, index, **kwargs):
153 |         return data_source[0][index]
154 | 
155 |     def load_from_disk(self, data_source, index, *, filetype, grayscale=False):
156 |         if filetype == 'png':
157 |             img_filepath = data_source[index]
158 |             return self.load_image(img_filepath, grayscale=grayscale)
159 |         elif filetype == 'json':
160 |             json_filepath = data_source[index]
161 |             return self.read_json(json_filepath)
162 |         else:
163 |             raise Exception('files must be png or json')
164 | 
165 |     def load_image(self, img_filepath, grayscale):
166 |         image = Image.open(img_filepath, 'r')
167 |         if not grayscale:
168 |             image = image.convert('RGB')
169 |         else:
170 |             image = image.convert('L').point(lambda x: 0 if x < 128 else 1)
171 |         return image
172 | 
173 |     def read_json(self, path):
174 |         with open(path, 'r') as file:
175 |             data = json.load(file)
176 |         masks = [to_pil(binary_from_rle(rle)) for rle in data]
177 |         return masks
178 | 
179 |     def load_target(self, data_source, index, load_func):
180 |         raise NotImplementedError
181 | 
182 | 
class ImageSegmentationJsonDataset(ImageSegmentationBaseDataset):
    """Segmentation dataset whose targets are loaded with ``filetype='json'``."""

    def load_target(self, data_source, index, load_func):
        """Return whatever ``load_func`` yields for the json target at ``index``."""
        return load_func(data_source, index, filetype='json')
187 | 
188 | 
class ImageSegmentationPngDataset(ImageSegmentationBaseDataset):
    """Segmentation dataset whose targets are grayscale png label images."""

    def load_target(self, data_source, index, load_func):
        """Load the png target and split it into one mask per class value (0, then 1)."""
        label_image = load_func(data_source, index, filetype='png', grayscale=True)
        labels = from_pil(label_image)
        class_masks = []
        for class_nr in (0, 1):
            class_masks.append(to_pil(labels == class_nr))
        return class_masks
195 | 
196 | 
class ImageSegmentationTTADataset(ImageSegmentationBaseDataset):
    """Inference dataset that applies a per-sample test-time-augmentation transform.

    Args:
        tta_params: indexable by sample index, giving the spec passed to
            ``tta_transform``; when None no TTA transform is applied.
        tta_transform: callable ``(image_array, spec) -> image_array``.
        *args, **kwargs: forwarded to ``ImageSegmentationBaseDataset``.
    """

    def __init__(self, tta_params, tta_transform, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.tta_params, self.tta_transform = tta_params, tta_transform

    def __getitem__(self, index):
        """Return the augmented, TTA-transformed and transformed image at ``index``.

        Raises:
            NotImplementedError: for an unsupported ``image_source``.
        """
        source = self.image_source
        if source == 'memory':
            loader = self.load_from_memory
        elif source == 'disk':
            loader = self.load_from_disk
        else:
            raise NotImplementedError("Possible loading options: 'memory' and 'disk'!")

        image = from_pil(loader(self.X, index, filetype='png', grayscale=False))

        if self.image_augment is not None:
            image = self.image_augment(image)

        if self.tta_params is not None:
            image = self.tta_transform(image, self.tta_params[index])
        image = to_pil(image)

        if self.image_transform is None:
            return image
        return self.image_transform(image)
226 | 
227 | 
class ImageSegmentationLoaderBasic(BaseTransformer):
    """Base transformer that wraps datasets in ``DataLoader``s for train and inference.

    Subclasses are expected to fill in ``dataset``, the transforms and the
    augmenters, all of which start as None here.
    """

    def __init__(self, train_mode, loader_params, dataset_params, augmentation_params):
        super().__init__()
        self.train_mode = train_mode
        self.loader_params = AttrDict(loader_params)
        self.dataset_params = AttrDict(dataset_params)
        self.augmentation_params = AttrDict(augmentation_params)

        # Placeholders configured by concrete loaders.
        self.mask_transform = self.image_transform = None

        self.image_augment_train = self.image_augment_inference = None
        self.image_augment_with_target_train = None
        self.image_augment_with_target_inference = None

        self.dataset = None

    def transform(self, X, y, X_valid=None, y_valid=None, **kwargs):
        """Build the main and (optional) validation data generators.

        The main generator uses the training configuration only when
        ``train_mode`` is set and ``y`` is given; otherwise targets are
        dropped and the inference configuration is used. A validation
        generator is built only when both ``X_valid`` and ``y_valid`` are
        given.

        Returns:
            dict with ``'datagen'`` and ``'validation_datagen'``, each a
            ``(loader, steps)`` tuple (``(None, None)`` without validation data).
        """
        if self.train_mode and y is not None:
            flow, steps = self.get_datagen(X, y, True, self.loader_params.training)
        else:
            flow, steps = self.get_datagen(X, None, False, self.loader_params.inference)

        valid_flow = valid_steps = None
        if X_valid is not None and y_valid is not None:
            valid_flow, valid_steps = self.get_datagen(X_valid, y_valid, False,
                                                       self.loader_params.inference)

        return {'datagen': (flow, steps),
                'validation_datagen': (valid_flow, valid_steps)}

    def get_datagen(self, X, y, train_mode, loader_params):
        """Create the dataset for the given mode and wrap it in a ``DataLoader``.

        Returns:
            ``(loader, number_of_batches)``.
        """
        if train_mode:
            mode_flag = True
            augment = self.image_augment_train
            augment_with_target = self.image_augment_with_target_train
        else:
            mode_flag = False
            augment = self.image_augment_inference
            augment_with_target = self.image_augment_with_target_inference

        dataset = self.dataset(X, y,
                               train_mode=mode_flag,
                               image_augment=augment,
                               image_augment_with_target=augment_with_target,
                               mask_transform=self.mask_transform,
                               image_transform=self.image_transform,
                               image_source=self.dataset_params.image_source)

        datagen = DataLoader(dataset, **loader_params)
        return datagen, len(datagen)

    def load(self, filepath):
        """Restore ``loader_params`` from a joblib file and return self."""
        self.loader_params = joblib.load(filepath)['loader_params']
        return self

    def save(self, filepath):
        """Persist ``loader_params`` to a joblib file."""
        joblib.dump({'loader_params': self.loader_params}, filepath)
291 | 
292 | 
class ImageSegmentationLoaderBasicTTA(ImageSegmentationLoaderBasic):
    """Inference-only loader that feeds test-time-augmentation specs to the dataset.

    Subclasses are expected to set ``dataset``, the transforms and the
    inference augmenters.
    """

    def __init__(self, loader_params, dataset_params, augmentation_params):
        # Delegate to the parent initialiser instead of duplicating it. This
        # also runs the base transformer initialisation and defines
        # ``train_mode`` (always False here), both previously skipped.
        super().__init__(train_mode=False,
                         loader_params=loader_params,
                         dataset_params=dataset_params,
                         augmentation_params=augmentation_params)

    def transform(self, X, tta_params, **kwargs):
        """Build the inference data generator; no validation generator is produced.

        Returns:
            dict with ``'datagen'`` as ``(loader, steps)`` and
            ``'validation_datagen'`` as ``(None, None)``.
        """
        flow, steps = self.get_datagen(X, tta_params, self.loader_params.inference)
        valid_flow = None
        valid_steps = None
        return {'datagen': (flow, steps),
                'validation_datagen': (valid_flow, valid_steps)}

    def get_datagen(self, X, tta_params, loader_params):
        """Create the TTA dataset (without targets) and wrap it in a ``DataLoader``.

        The TTA transform is read from ``augmentation_params.tta_transform``.

        Returns:
            ``(loader, number_of_batches)``.
        """
        dataset = self.dataset(tta_params=tta_params,
                               tta_transform=self.augmentation_params.tta_transform,
                               X=X,
                               y=None,
                               train_mode=False,
                               image_augment=self.image_augment_inference,
                               image_augment_with_target=self.image_augment_with_target_inference,
                               mask_transform=self.mask_transform,
                               image_transform=self.image_transform,
                               image_source=self.dataset_params.image_source)

        datagen = DataLoader(dataset, **loader_params)
        steps = len(datagen)
        return datagen, steps
331 | 
332 | 
class ImageSegmentationLoaderCropPad(ImageSegmentationLoaderBasic):
    """Loader without a resize step: spatial handling is left to the configured augmenters.

    Images are converted to 3-channel grayscale tensors and normalised with
    ``dataset_params.MEAN`` / ``dataset_params.STD``; masks go through
    ``to_array`` and ``to_tensor``. The dataset class is chosen from
    ``dataset_params.target_format``.
    """

    def __init__(self, train_mode, loader_params, dataset_params, augmentation_params):
        super().__init__(train_mode, loader_params, dataset_params, augmentation_params)

        normalize = transforms.Normalize(mean=self.dataset_params.MEAN,
                                         std=self.dataset_params.STD)
        self.image_transform = transforms.Compose([
            transforms.Grayscale(num_output_channels=3),
            transforms.ToTensor(),
            normalize,
        ])
        self.mask_transform = transforms.Compose([
            transforms.Lambda(to_array),
            transforms.Lambda(to_tensor),
        ])

        # Each augmenter attribute shares its name with its configuration key.
        for augmenter_name in ('image_augment_train',
                               'image_augment_with_target_train',
                               'image_augment_inference',
                               'image_augment_with_target_inference'):
            setattr(self, augmenter_name, ImgAug(self.augmentation_params[augmenter_name]))

        target_format = self.dataset_params.target_format
        if target_format == 'png':
            self.dataset = ImageSegmentationPngDataset
        elif target_format == 'json':
            self.dataset = ImageSegmentationJsonDataset
        else:
            raise Exception('files must be png or json')
358 | 
359 | 
class ImageSegmentationLoaderCropPadTTA(ImageSegmentationLoaderBasicTTA):
    """TTA counterpart of the crop/pad loader, backed by ``ImageSegmentationTTADataset``.

    Uses the same image and mask transforms as the crop/pad loader but only
    configures the inference augmenters.
    """

    def __init__(self, loader_params, dataset_params, augmentation_params):
        super().__init__(loader_params, dataset_params, augmentation_params)

        normalize = transforms.Normalize(mean=self.dataset_params.MEAN,
                                         std=self.dataset_params.STD)
        self.image_transform = transforms.Compose([
            transforms.Grayscale(num_output_channels=3),
            transforms.ToTensor(),
            normalize,
        ])
        self.mask_transform = transforms.Compose([
            transforms.Lambda(to_array),
            transforms.Lambda(to_tensor),
        ])

        for augmenter_name in ('image_augment_inference',
                               'image_augment_with_target_inference'):
            setattr(self, augmenter_name, ImgAug(self.augmentation_params[augmenter_name]))
        self.dataset = ImageSegmentationTTADataset
377 | 
378 | 
class ImageSegmentationLoaderResize(ImageSegmentationLoaderBasic):
    """Loader that resizes images and masks to ``(dataset_params.h, dataset_params.w)``.

    Images are resized, converted to 3-channel grayscale tensors and
    normalised; masks are resized with ``interpolation=0`` and passed through
    ``to_array`` / ``to_tensor``. Only the training augmenters are configured
    here. The dataset class is chosen from ``dataset_params.target_format``.
    """

    def __init__(self, train_mode, loader_params, dataset_params, augmentation_params):
        super().__init__(train_mode, loader_params, dataset_params, augmentation_params)

        target_size = (self.dataset_params.h, self.dataset_params.w)
        normalize = transforms.Normalize(mean=self.dataset_params.MEAN,
                                         std=self.dataset_params.STD)
        self.image_transform = transforms.Compose([
            transforms.Resize(target_size),
            transforms.Grayscale(num_output_channels=3),
            transforms.ToTensor(),
            normalize,
        ])
        self.mask_transform = transforms.Compose([
            transforms.Resize(target_size, interpolation=0),
            transforms.Lambda(to_array),
            transforms.Lambda(to_tensor),
        ])

        for augmenter_name in ('image_augment_train',
                               'image_augment_with_target_train'):
            setattr(self, augmenter_name, ImgAug(self.augmentation_params[augmenter_name]))

        target_format = self.dataset_params.target_format
        if target_format == 'png':
            self.dataset = ImageSegmentationPngDataset
        elif target_format == 'json':
            self.dataset = ImageSegmentationJsonDataset
        else:
            raise Exception('files must be png or json')
404 | 
405 | 
class ImageSegmentationLoaderResizeTTA(ImageSegmentationLoaderBasicTTA):
    """Test-time-augmentation loader that resizes inputs to ``(dataset_params.h, dataset_params.w)``.

    After the parent constructor has run, the following attributes are set:

    * ``image_transform`` -- resize, 3-channel grayscale conversion, ``ToTensor``
      and normalisation with ``dataset_params.MEAN`` / ``dataset_params.STD``;
    * ``mask_transform`` -- resize with ``interpolation=0``, then ``to_array``
      and ``to_tensor``;
    * ``dataset`` -- the ``ImageSegmentationTTADataset`` class (not an instance).
    """

    def __init__(self, loader_params, dataset_params, augmentation_params):
        super().__init__(loader_params, dataset_params, augmentation_params)

        target_size = (self.dataset_params.h, self.dataset_params.w)

        image_steps = [
            transforms.Resize(target_size),
            transforms.Grayscale(num_output_channels=3),
            transforms.ToTensor(),
            transforms.Normalize(mean=self.dataset_params.MEAN, std=self.dataset_params.STD),
        ]
        self.image_transform = transforms.Compose(image_steps)

        mask_steps = [
            # NOTE(review): interpolation=0 is presumably PIL's nearest-neighbour
            # filter, which would keep mask labels from being blended -- confirm.
            transforms.Resize(target_size, interpolation=0),
            transforms.Lambda(to_array),
            transforms.Lambda(to_tensor),
        ]
        self.mask_transform = transforms.Compose(mask_steps)

        self.dataset = ImageSegmentationTTADataset
423 | 
424 | 
class MetaTestTimeAugmentationGenerator(BaseTransformer):
    """Replicate every input row once per test-time-augmentation spec.

    The constructor keyword arguments are stored as an ``AttrDict``; the
    entries ``flip_ud``, ``flip_lr``, ``rotation`` and ``color_shift_runs``
    are read from it when the specs are built.
    """

    def __init__(self, **kwargs):
        self.tta_transformations = AttrDict(kwargs)

    def transform(self, X, **kwargs):
        """Expand ``X`` into replicated rows plus their augmentation specs.

        Args:
            X: indexable collection supporting ``len(X)`` and ``X[i]``.
            **kwargs: accepted and ignored.

        Returns:
            dict with ``'X_tta'`` (``np.array`` of the replicated rows),
            ``'tta_params'`` (one spec dict per replicated row) and
            ``'img_ids'`` (index in ``X`` each replicated row came from).
        """
        replicated_rows, all_specs, source_ids = [], [], []
        for position in range(len(X)):
            rows, specs, ids = self._get_tta_data(position, X[position])
            all_specs.extend(specs)
            source_ids.extend(ids)
            replicated_rows.extend(rows)
        return {'X_tta': np.array(replicated_rows),
                'tta_params': all_specs,
                'img_ids': source_ids}

    def _get_tta_data(self, i, row):
        """Return ``(rows, specs, ids)`` for one input row.

        The first spec is always the un-augmented one; it is followed by
        every other combination of the enabled options.
        """
        settings = self.tta_transformations
        ud_choices = [True, False] if settings.flip_ud else [False]
        lr_choices = [True, False] if settings.flip_lr else [False]
        rot_choices = [0, 90, 180, 270] if settings.rotation else [0]
        color_choices = (list(range(1, settings.color_shift_runs + 1))
                         if settings.color_shift_runs else [False])

        identity_spec = {'ud_flip': False, 'lr_flip': False, 'rotation': 0, 'color_shift': False}
        # The identity combination is already listed first, so it is filtered
        # out of the cartesian product to avoid a duplicate.
        specs = [identity_spec] + [
            {'ud_flip': ud, 'lr_flip': lr, 'rotation': rot, 'color_shift': color}
            for ud, lr, rot, color in product(ud_choices, lr_choices, rot_choices, color_choices)
            if not (ud is False and lr is False and rot == 0 and color is False)
        ]

        copies = len(specs)
        return [row] * copies, specs, [i] * copies
460 | 
461 | 
class TestTimeAugmentationGenerator(BaseTransformer):
    """Replicate every image once per test-time-augmentation spec.

    The constructor keyword arguments are stored as an ``AttrDict``; the
    entries ``flip_ud``, ``flip_lr``, ``rotation`` and ``color_shift_runs``
    are read from it when the specs are built.
    """

    def __init__(self, **kwargs):
        self.tta_transformations = AttrDict(kwargs)

    def transform(self, X, **kwargs):
        """Expand the images in ``X[0]`` into replicated images plus their specs.

        Args:
            X: container whose first element supports ``len(...)`` and
                integer indexing; only ``X[0]`` is used.
            **kwargs: accepted and ignored.

        Returns:
            dict with ``'X_tta'`` (a one-element list wrapping the list of
            replicated images), ``'tta_params'`` (one spec dict per replicated
            image) and ``'img_ids'`` (index in ``X[0]`` each copy came from).
        """
        source_images = X[0]
        replicated_images, all_specs, source_ids = [], [], []
        for position in range(len(source_images)):
            images, specs, ids = self._get_tta_data(position, source_images[position])
            all_specs.extend(specs)
            source_ids.extend(ids)
            replicated_images.extend(images)
        return {'X_tta': [replicated_images],
                'tta_params': all_specs,
                'img_ids': source_ids}

    def _get_tta_data(self, i, row):
        """Return ``(rows, specs, ids)`` for one input image.

        The first spec is always the un-augmented one; it is followed by
        every other combination of the enabled options.
        """
        settings = self.tta_transformations
        ud_choices = [True, False] if settings.flip_ud else [False]
        lr_choices = [True, False] if settings.flip_lr else [False]
        rot_choices = [0, 90, 180, 270] if settings.rotation else [0]
        color_choices = (list(range(1, settings.color_shift_runs + 1))
                         if settings.color_shift_runs else [False])

        identity_spec = {'ud_flip': False, 'lr_flip': False, 'rotation': 0, 'color_shift': False}
        # The identity combination is already listed first, so it is filtered
        # out of the cartesian product to avoid a duplicate.
        specs = [identity_spec] + [
            {'ud_flip': ud, 'lr_flip': lr, 'rotation': rot, 'color_shift': color}
            for ud, lr, rot, color in product(ud_choices, lr_choices, rot_choices, color_choices)
            if not (ud is False and lr is False and rot == 0 and color is False)
        ]

        copies = len(specs)
        return [row] * copies, specs, [i] * copies
497 | 
498 | 
class TestTimeAugmentationAggregator(BaseTransformer):
    """Collapse the predictions of all augmented copies of an image into one.

    Args:
        tta_inverse_transform: callable ``(image, tta_param) -> image`` applied
            to each prediction before aggregation.
        method: name of the reduction, one of ``'mean'``, ``'max'``, ``'min'``
            or ``'gmean'``.
        nthreads: upper bound on the number of worker threads.
    """

    def __init__(self, tta_inverse_transform, method, nthreads):
        self.tta_inverse_transform = tta_inverse_transform
        self.method = method
        self.nthreads = nthreads

    @property
    def agg_method(self):
        """Reduction selected by ``self.method``, bound to ``axis=-1``.

        Raises:
            KeyError: if ``self.method`` is not a supported name.
        """
        methods = {'mean': np.mean,
                   'max': np.max,
                   'min': np.min,
                   'gmean': gmean
                   }
        return partial(methods[self.method], axis=-1)

    def transform(self, images, tta_params, img_ids, **kwargs):
        """Aggregate predictions per image id.

        Args:
            images: predictions, one per augmented copy.
            tta_params: augmentation spec for each entry of ``images``.
            img_ids: hashable id of the source image for each entry of ``images``.
            **kwargs: accepted and ignored.

        Returns:
            dict with ``'aggregated_prediction'``: a list holding one
            aggregated prediction per distinct id, ordered by the first
            appearance of that id in ``img_ids``. The list is empty when
            ``img_ids`` is empty.
        """
        # De-duplicate while keeping first-appearance order. Iterating a plain
        # set() has no defined order, so the results could come back in an
        # order unrelated to the input images.
        seen = set()
        unique_img_ids = []
        for img_id in img_ids:
            if img_id not in seen:
                seen.add(img_id)
                unique_img_ids.append(img_id)

        # ThreadPool refuses a pool size of 0, so an empty batch is answered directly.
        if not unique_img_ids:
            return {'aggregated_prediction': []}

        _aggregate_augmentations = partial(aggregate_augmentations,
                                           images=images,
                                           tta_params=tta_params,
                                           tta_inverse_transform=self.tta_inverse_transform,
                                           img_ids=img_ids,
                                           agg_method=self.agg_method)
        threads = min(self.nthreads, len(unique_img_ids))
        with mp.pool.ThreadPool(threads) as executor:
            # map() blocks until every id is processed and keeps input order.
            averages_images = executor.map(_aggregate_augmentations, unique_img_ids)
        return {'aggregated_prediction': averages_images}
526 | 
527 | 
def aggregate_augmentations(img_id, images, tta_params, tta_inverse_transform, img_ids, agg_method):
    """Aggregate all augmented predictions that belong to ``img_id``.

    Args:
        img_id: id of the image to aggregate.
        images: predictions, one per augmented copy.
        tta_params: augmentation spec for each entry of ``images``.
        tta_inverse_transform: callable ``(image, tta_param) -> image`` applied
            to every selected prediction.
        img_ids: source-image id for each entry of ``images``.
        agg_method: callable applied to the selected predictions stacked
            along a new last axis.

    Returns:
        Whatever ``agg_method`` returns for the stacked predictions.
    """
    restored = [tta_inverse_transform(image, tta_param)
                for image, tta_param, owner in zip(images, tta_params, img_ids)
                if owner == img_id]
    stacked = np.stack(restored, axis=-1)
    return agg_method(stacked)
538 | 
539 | 
def to_array(x):
    """Return ``x`` converted with ``x.convert('L')`` as a ``float32`` NumPy array.

    ``x`` only needs a ``convert`` method; for a PIL image, mode ``'L'`` is
    the single-channel (monochrome) representation.
    """
    monochrome = x.convert('L')
    return np.array(monochrome).astype(np.float32)
545 | 
546 | 
def to_tensor(x):
    """Prepend a length-1 axis to ``x`` and wrap the result with ``torch.from_numpy``."""
    return torch.from_numpy(np.expand_dims(x, axis=0))
551 | 


--------------------------------------------------------------------------------