├── .gitignore ├── Dockerfile ├── README.md └── src ├── models ├── __init__.py ├── gru.py └── regressor.py ├── train_deep.py └── util ├── args.py ├── const.py ├── cross_validation.py ├── extensions.py ├── load.py ├── ndcg.py ├── nlp_utils.py ├── preprocessed_dataset.py ├── resource.py ├── seed.py └── transforms.py /.gitignore: -------------------------------------------------------------------------------- 1 | # gitignore for gunosy-creative-evaluation 2 | *.csv 3 | *.png 4 | *.jpg 5 | *.pkl 6 | *.zip 7 | entity_vector.model.bin 8 | 9 | # for TeXs 10 | *.cut 11 | *.aux 12 | *.out 13 | *.synctex.gz 14 | *.xcp 15 | 16 | # Byte-compiled / optimized / DLL files 17 | __pycache__/ 18 | *.py[cod] 19 | *$py.class 20 | 21 | # C extensions 22 | *.so 23 | 24 | # Distribution / packaging 25 | .Python 26 | build/ 27 | develop-eggs/ 28 | dist/ 29 | downloads/ 30 | eggs/ 31 | .eggs/ 32 | lib/ 33 | lib64/ 34 | parts/ 35 | sdist/ 36 | var/ 37 | wheels/ 38 | *.egg-info/ 39 | .installed.cfg 40 | *.egg 41 | MANIFEST 42 | 43 | # PyInstaller 44 | # Usually these files are written by a python script from a template 45 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 46 | *.manifest 47 | *.spec 48 | 49 | # Installer logs 50 | pip-log.txt 51 | pip-delete-this-directory.txt 52 | 53 | # Unit test / coverage reports 54 | htmlcov/ 55 | .tox/ 56 | .coverage 57 | .coverage.* 58 | .cache 59 | nosetests.xml 60 | coverage.xml 61 | *.cover 62 | .hypothesis/ 63 | .pytest_cache/ 64 | 65 | # Translations 66 | *.mo 67 | *.pot 68 | 69 | # Django stuff: 70 | *.log 71 | local_settings.py 72 | db.sqlite3 73 | 74 | # Flask stuff: 75 | instance/ 76 | .webassets-cache 77 | 78 | # Scrapy stuff: 79 | .scrapy 80 | 81 | # Sphinx documentation 82 | docs/_build/ 83 | 84 | # PyBuilder 85 | target/ 86 | 87 | # Jupyter Notebook 88 | .ipynb_checkpoints 89 | 90 | # IPython 91 | profile_default/ 92 | ipython_config.py 93 | 94 | # pyenv 95 | .python-version 96 | 97 | # celery beat schedule file 98 | celerybeat-schedule 99 | 100 | # SageMath parsed files 101 | *.sage.py 102 | 103 | # Environments 104 | .env 105 | .venv 106 | env/ 107 | venv/ 108 | ENV/ 109 | env.bak/ 110 | venv.bak/ 111 | 112 | # Spyder project settings 113 | .spyderproject 114 | .spyproject 115 | 116 | # Rope project settings 117 | .ropeproject 118 | 119 | # mkdocs documentation 120 | /site 121 | 122 | # mypy 123 | .mypy_cache/ 124 | .dmypy.json 125 | dmypy.json -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | From nvidia/cuda:9.2-cudnn7-devel-ubuntu16.04 2 | 3 | ENV MYSQL_PWD root 4 | RUN echo "mysql-server mysql-server/root_password password $MYSQL_PWD" | debconf-set-selections 5 | RUN echo "mysql-server mysql-server/root_password_again password $MYSQL_PWD" | debconf-set-selections 6 | 7 | RUN apt-get update && apt-get install -y \ 8 | software-properties-common && \ 9 | add-apt-repository ppa:jonathonf/python-3.6 -y && \ 10 | apt-get -y update 11 | 12 | RUN apt-get install -y \ 13 | build-essential \ 14 | tmux \ 15 | python3.6 \ 16 | python3.6-dev \ 17 | python3-pip \ 18 | python3-wheel \ 19 | python3-setuptools \ 20 | python3-tk \ 21 | mysql-client \ 22 | mysql-server \ 23 | libmysqlclient-dev \ 24 | libssl-dev \ 25 | sudo \ 26 | mecab \ 27 | libmecab-dev \ 28 | mecab-ipadic-utf8 \ 29 | git \ 30 | make \ 31 | curl \ 32 | xz-utils \ 33 | file \ 34 | swig \ 35 | language-pack-ja-base \ 36 | language-pack-ja \ 37 | 
locales \ 38 | && locale-gen ja_JP.UTF-8 \ 39 | && localedef -f UTF-8 -i ja_JP ja_JP 40 | 41 | ENV TZ Asia/Tokyo 42 | ENV LANG ja_JP.UTF-8 43 | ENV LANGUAGE ja_JP:jp 44 | ENV LC_ALL ja_JP.UTF-8 45 | RUN ln -fns /usr/bin/python3.6 /usr/bin/python && \ 46 | ln -fns /usr/bin/python3.6 /usr/bin/python3 && \ 47 | ln -fns /usr/bin/pip3 /usr/bin/pip 48 | 49 | # install chainer and cupy 50 | RUN pip install --no-cache-dir cupy-cuda92 chainer 51 | 52 | # install mecab-ipadic-neologd 53 | RUN git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git && \ 54 | cd mecab-ipadic-neologd && \ 55 | bin/install-mecab-ipadic-neologd -n -y -p /var/lib/mecab/dic/mecab-ipadic-neologd 56 | 57 | # install mecab-python3 58 | RUN pip install --no-cache-dir mecab-python3 59 | 60 | # settings for Japanese 61 | # RUN update-locale LANG=ja_JP.UTF-8 LANGUAGE=ja_JP:ja 62 | 63 | RUN pip install --no-cache-dir jupyterlab 64 | EXPOSE 8888 65 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Multi-task Conditional Attention Networks 2 | 3 | A prototype version of our submitted paper: `Conversion Prediction Using Multi-task Conditional Attention Networks to Support the Creation of Effective Ad Creatives`. 4 | 5 | ## Setup using Docker 6 | 7 | ```shell 8 | $ docker build -t multi-task-cond-net-env . 9 | $ docker create -it -v /data:/data --name datavolume busybox 10 | $ docker run -it -p 8888:8888 --runtime=nvidia --volumes-from datavolume --rm --name multi-task-cond-net multi-task-cond-net-env 11 | ``` 12 | -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- 1 | from models import gru 2 | from models.mlp import MLPEncoder 3 | from models.regressor import MultiTaskRegressor, Regressor 4 | 5 | ARCHS = { 6 | 'mlp': MLPEncoder, 7 | 'gru': gru.GRUEncoder, 8 | 'gru_attn': gru.AttentionGRUEncoder, 9 | 'gru_attn_cond': gru.ConditionalAttentionGRUEncoder, 10 | 'gru_attn_word_cond': gru.ConditionalWordAttentionGRUEncoder, 11 | } 12 | 13 | MODEL_WRAPPERS = { 14 | 'regression': Regressor, 15 | 'multi_regression': MultiTaskRegressor, 16 | } 17 | -------------------------------------------------------------------------------- /src/models/gru.py: -------------------------------------------------------------------------------- 1 | import chainer 2 | import chainer.functions as F 3 | import chainer.links as L 4 | 5 | from models.base_embedding_layer import BaseEmbeddingLayer 6 | from util import const 7 | 8 | 9 | class GRUEncoder(chainer.Chain): 10 | 11 | def __init__(self, 12 | n_layers, 13 | n_vocab, 14 | n_genre, 15 | pretrained_w2v, 16 | is_update_w2v, 17 | dropout, 18 | genre_units=5): 19 | 20 | super(GRUEncoder, self).__init__() 21 | with self.init_scope(): 22 | self.base_embedding_layer = BaseEmbeddingLayer( 23 | n_vocab=n_vocab, 24 | n_genre=n_genre, genre_units=genre_units, 25 | pretrained_w2v=pretrained_w2v, 26 | is_update_w2v=is_update_w2v, 27 | dropout=dropout) 28 | 29 | self.title_encoder = L.NStepGRU( 30 | n_layers, 31 | self.base_embedding_layer.n_units, 32 | self.base_embedding_layer.n_units, dropout) 33 | self.content_encoder = L.NStepGRU( 34 | n_layers, 35 | self.base_embedding_layer.n_units, 36 | self.base_embedding_layer.n_units, dropout) 37 | 38 | self.out_units = self.base_embedding_layer.n_units * 2 \ 39 | + genre_units \ 40 | + 
const.PREPROCESS_GENDER_TARGET_NUM 41 | 42 | self.n_layers = n_layers 43 | self.dropout = dropout 44 | 45 | def forward(self, 46 | genre_xs, 47 | gender_xs, 48 | title_xs, 49 | content_xs, 50 | **kwargs): 51 | 52 | embeddings = self.base_embedding_layer( 53 | title_xs=title_xs, content_xs=content_xs, 54 | genre_xs=genre_xs) 55 | title_exs, content_exs, genre_exs = embeddings 56 | gender_exs = F.stack(gender_xs) 57 | 58 | last_title_h, title_ys = self.title_encoder(None, title_exs) 59 | last_content_h, content_ys = self.content_encoder(None, content_exs) 60 | 61 | concat_outputs = F.concat(( 62 | genre_exs, 63 | gender_exs, 64 | last_title_h[-1], 65 | last_content_h[-1], 66 | )) 67 | 68 | return concat_outputs 69 | 70 | 71 | class AttentionGRUEncoder(GRUEncoder): 72 | 73 | def __init__(self, 74 | n_layers, 75 | n_vocab, 76 | n_genre, 77 | pretrained_w2v, 78 | is_update_w2v, 79 | dropout, 80 | genre_units=5): 81 | 82 | super(AttentionGRUEncoder, self).__init__( 83 | n_layers=n_layers, 84 | n_vocab=n_vocab, 85 | n_genre=n_genre, 86 | pretrained_w2v=pretrained_w2v, 87 | is_update_w2v=is_update_w2v, 88 | dropout=dropout, 89 | genre_units=genre_units) 90 | 91 | with self.init_scope(): 92 | self.attn_title = L.Linear(self.base_embedding_layer.n_units, 1) 93 | self.attn_content = L.Linear(self.base_embedding_layer.n_units, 1) 94 | 95 | def calc_attention(self, xs, ys, attn_linear): 96 | 97 | concat_ys = F.concat(ys, axis=0) 98 | attn_ys = attn_linear(F.tanh(concat_ys)) 99 | 100 | cumsum_ys = self.xp.cumsum(self.xp.array([len(x) for x in xs], dtype=self.xp.int32)) 101 | 102 | split_attn_ys = F.split_axis(attn_ys, cumsum_ys[:-1].tolist(), axis=0) 103 | split_attn_ys_pad = F.pad_sequence(split_attn_ys, padding=-1024) 104 | attn_softmax = F.softmax(split_attn_ys_pad, axis=1) 105 | 106 | return attn_softmax 107 | 108 | def apply_attention(self, ys, attn_softmax): 109 | batchsize = len(ys) 110 | 111 | ys_pad = F.pad_sequence(ys, padding=0.0) 112 | ys_pad_reshape = F.reshape(ys_pad, (-1, ys_pad.shape[-1])) 113 | 114 | attn_softmax_reshape = F.broadcast_to( 115 | F.reshape(attn_softmax, (-1, attn_softmax.shape[-1])), ys_pad_reshape.shape) 116 | 117 | attn_hidden = ys_pad_reshape * attn_softmax_reshape 118 | attn_hidden_reshape = F.reshape(attn_hidden, (batchsize, -1, attn_hidden.shape[-1])) 119 | 120 | return F.sum(attn_hidden_reshape, axis=1) 121 | 122 | def forward(self, 123 | genre_xs, 124 | gender_xs, 125 | title_xs, 126 | content_xs, 127 | **kwargs): 128 | 129 | embedding = self.base_embedding_layer( 130 | title_xs=title_xs, content_xs=content_xs, 131 | genre_xs=genre_xs) 132 | title_exs, content_exs, genre_exs = embedding 133 | gender_exs = F.stack(gender_xs) 134 | 135 | last_title_h, title_ys = self.title_encoder(None, title_exs) 136 | last_content_h, content_ys = self.content_encoder(None, content_exs) 137 | 138 | attn_title = self.calc_attention(title_xs, title_ys, self.attn_title) 139 | attn_title_h = self.apply_attention(title_ys, attn_title) 140 | 141 | attn_content = self.calc_attention(content_xs, content_ys, self.attn_content) 142 | attn_content_h = self.apply_attention(content_ys, attn_content) 143 | 144 | concat_outputs = F.concat(( 145 | genre_exs, 146 | gender_exs, 147 | attn_title_h, 148 | attn_content_h, 149 | )) 150 | 151 | return concat_outputs 152 | 153 | 154 | class ConditionalAttentionGRUEncoder(AttentionGRUEncoder): 155 | 156 | def __init__(self, 157 | n_layers, 158 | n_vocab, 159 | n_genre, 160 | pretrained_w2v, 161 | is_update_w2v, 162 | dropout, 163 | genre_units=5): 164 | 
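        # Reuses the attention GRU encoder and adds proj_cond (defined a few lines below): a linear
        # projection that maps the concatenated genre and gender features to a per-example scalar,
        # which rescales the attention logits before the softmax in calc_attention().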
165 | super(ConditionalAttentionGRUEncoder, self).__init__( 166 | n_layers=n_layers, 167 | n_vocab=n_vocab, 168 | n_genre=n_genre, 169 | pretrained_w2v=pretrained_w2v, 170 | is_update_w2v=is_update_w2v, 171 | dropout=dropout, 172 | genre_units=genre_units) 173 | 174 | with self.init_scope(): 175 | self.proj_cond = L.Linear(None, 1, nobias=True) 176 | 177 | def calc_attention(self, xs, ys, genre_exs, gender_exs, attn_linear): 178 | 179 | concat_ys = F.concat(ys, axis=0) # -> (total len of batched sentence, word embedding dim) 180 | attn_ys = attn_linear(F.tanh(concat_ys)) 181 | cond_feature = self.proj_cond(F.concat((genre_exs, gender_exs))) # -> (batchsize, proj_cond dim) 182 | 183 | cumsum_ys = self.xp.cumsum(self.xp.array([len(x) for x in xs], dtype=self.xp.int32)) 184 | split_attn_ys = F.split_axis(attn_ys, cumsum_ys[:-1].tolist(), axis=0) 185 | split_attn_ys_pad = F.pad_sequence(split_attn_ys, padding=-1024) 186 | 187 | bool_cond = split_attn_ys_pad.array == -1024 188 | split_attn_ys_pad = split_attn_ys_pad * F.expand_dims( 189 | F.broadcast_to(cond_feature, (split_attn_ys_pad.shape[:-1])), axis=-1) 190 | 191 | padding_array = self.xp.full(split_attn_ys_pad.shape, -1024, dtype=self.xp.float32) 192 | 193 | split_attn_ys_pad = F.where(bool_cond, padding_array, split_attn_ys_pad) 194 | 195 | attn_softmax = F.softmax(split_attn_ys_pad, axis=1) 196 | 197 | return attn_softmax 198 | 199 | def apply_attention(self, ys, attn_softmax): 200 | batchsize = len(ys) 201 | 202 | ys_pad = F.pad_sequence(ys, padding=0.0) 203 | ys_pad_reshape = F.reshape(ys_pad, (-1, ys_pad.shape[-1])) 204 | 205 | attn_softmax_reshape = F.broadcast_to( 206 | F.reshape(attn_softmax, (-1, attn_softmax.shape[-1])), ys_pad_reshape.shape) 207 | 208 | attn_hidden = ys_pad_reshape * attn_softmax_reshape 209 | attn_hidden_reshape = F.reshape(attn_hidden, (batchsize, -1, attn_hidden.shape[-1])) 210 | 211 | return F.sum(attn_hidden_reshape, axis=1) 212 | 213 | def forward(self, 214 | genre_xs, 215 | gender_xs, 216 | title_xs, 217 | content_xs, 218 | **kwargs): 219 | 220 | embedding = self.base_embedding_layer( 221 | title_xs=title_xs, content_xs=content_xs, 222 | genre_xs=genre_xs) 223 | title_exs, content_exs, genre_exs = embedding 224 | gender_exs = F.stack(gender_xs) 225 | 226 | last_title_h, title_ys = self.title_encoder(None, title_exs) 227 | last_content_h, content_ys = self.content_encoder(None, content_exs) 228 | 229 | attn_title = self.calc_attention(title_xs, title_ys, genre_exs, 230 | gender_exs, self.attn_title) 231 | attn_title_h = self.apply_attention(title_ys, attn_title) 232 | 233 | attn_content = self.calc_attention(content_xs, content_ys, genre_exs, 234 | gender_exs, self.attn_content) 235 | attn_content_h = self.apply_attention(content_ys, attn_content) 236 | 237 | concat_outputs = F.concat(( 238 | genre_exs, 239 | gender_exs, 240 | attn_title_h, 241 | attn_content_h, 242 | )) 243 | 244 | return concat_outputs 245 | -------------------------------------------------------------------------------- /src/models/regressor.py: -------------------------------------------------------------------------------- 1 | import chainer 2 | import chainer.functions as F 3 | import chainer.links as L 4 | from chainer import reporter 5 | 6 | 7 | class Regressor(chainer.Chain): 8 | 9 | def __init__(self, encoder, dropout=0.): 10 | super(Regressor, self).__init__() 11 | with self.init_scope(): 12 | self.encoder = encoder 13 | self.output = L.Linear(encoder.out_units, 1) 14 | 15 | self.dropout = dropout 16 | 17 | def forward(self, 
ys, **kwargs): 18 | 19 | concat_outputs = F.concat(self.predict(**kwargs), axis=0) 20 | concat_truths = F.concat(ys, axis=0) 21 | 22 | loss = F.mean_squared_error(concat_outputs, concat_truths) 23 | reporter.report({'loss': loss.data}, self) 24 | 25 | return loss 26 | 27 | def predict(self, **kwargs): 28 | 29 | concat_encodings = F.dropout(self.encoder(**kwargs), ratio=self.dropout) 30 | concat_outputs = F.sigmoid(self.output(concat_encodings)) 31 | 32 | return concat_outputs 33 | 34 | 35 | class MultiTaskRegressor(chainer.Chain): 36 | 37 | def __init__(self, encoder, dropout=0.): 38 | super(MultiTaskRegressor, self).__init__() 39 | with self.init_scope(): 40 | self.encoder = encoder 41 | self.output = L.Linear(encoder.out_units, 2) 42 | 43 | self.dropout = dropout 44 | 45 | def forward(self, ys, **kwargs): 46 | 47 | pred_click, pred_cv = self.predict(**kwargs) 48 | ys = F.stack(ys) 49 | true_click, true_cv = ys[:, 0], ys[:, 1] 50 | 51 | loss_click = F.mean_squared_error(pred_click, true_click) 52 | loss_cv = F.mean_squared_error(pred_cv, true_cv) 53 | loss = loss_click + loss_cv 54 | 55 | reporter.report({'loss': loss.data}, self) 56 | reporter.report({'loss_click': loss_click.data}, self) 57 | reporter.report({'loss_cv': loss_cv.data}, self) 58 | 59 | return loss 60 | 61 | def predict(self, **kwargs): 62 | 63 | concat_encodings = F.dropout(self.encoder(**kwargs), ratio=self.dropout) 64 | output = F.sigmoid(self.output(concat_encodings)) 65 | output_click, output_cv = output[:, 0], output[:, 1] 66 | 67 | return output_click, output_cv 68 | -------------------------------------------------------------------------------- /src/train_deep.py: -------------------------------------------------------------------------------- 1 | import matplotlib # NOQA # isort:skip 2 | matplotlib.use("Agg") # NOQA # isort:skip 3 | 4 | from collections import OrderedDict 5 | 6 | import chainer 7 | import logzero 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | from chainer import training 11 | from chainer.training import extensions 12 | from gensim.models import KeyedVectors 13 | 14 | from models import ARCHS, MODEL_WRAPPERS 15 | from util import const 16 | from util.args import parse_train_args as parse_args 17 | from util.cross_validation import kfold_iter 18 | from util.evaluate import EVALUATE_PHASES 19 | from util.extensions import ( 20 | setup_optim_trigger, 21 | setup_plot_report_loss_entries, 22 | setup_print_report_entries, 23 | setup_record_trigger 24 | ) 25 | from util.load import load_data 26 | from util.notify import notify_exception, notify_result 27 | from util.preprocessed_dataset import ( 28 | PreprocessedDataset, 29 | convert_seq, 30 | prepare_vectorizer 31 | ) 32 | from util.resource import Resource 33 | from util.seed import reset_seed 34 | 35 | plt.style.use('ggplot') 36 | chainer.cuda.set_max_workspace_size(1024 * 1024 * 1024) 37 | chainer.global_config.autotune = True 38 | 39 | 40 | def main(args): 41 | 42 | reset_seed(args.seed) 43 | 44 | res = Resource(args, train=True) 45 | 46 | pretrained_word2vec = KeyedVectors.load_word2vec_format( 47 | str(const.PRETRAINED_WORD2VEC_FPATH), binary=True) 48 | 49 | vectorizer = prepare_vectorizer(pretrained_word2vec, 50 | args.training_type, 51 | norm_imp=const.NORMALIZED_IMPRESSION, 52 | is_impnorm=args.imp_norm, 53 | is_logarithm=True) 54 | 55 | df = load_data(const.TRAIN_DATA_FPATH) 56 | 57 | kf = kfold_iter(X=df, y=df[args.objective], 58 | n_splits=args.fold, 59 | random_state=args.seed, 60 | is_campaign_group=args.group, 61 | 
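                    # When --group is set, kfold_iter switches to GroupKFold keyed on campaign_id,
                    # so creatives from the same campaign never appear in both the training and
                    # validation folds (see util/cross_validation.py).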
training_type=args.training_type, 62 | campaign_ids=df['campaign_id'].values) 63 | 64 | scores = OrderedDict([(metric, []) for metric 65 | in const.EVALUATION_METRIC[args.training_type]]) 66 | 67 | res.loginfo('Start training') 68 | for i, (train_idx, val_idx) in enumerate(kf): 69 | res.loginfo('Fold: {}'.format(i + 1)) 70 | 71 | df_train = df.iloc[train_idx] 72 | df_val = df.iloc[val_idx] 73 | 74 | df_train = vectorizer.fit_transform(df_train) 75 | df_val = vectorizer.transform(df_val) 76 | 77 | n_genre = len(vectorizer.named_steps.genre.le.classes_) 78 | n_gender = len(vectorizer.named_steps.gender.lb.classes_) 79 | 80 | res.logdebug('# of genres: {}'.format(n_genre)) 81 | res.logdebug('# of gender: {}'.format(n_gender)) 82 | 83 | train_pairs = PreprocessedDataset(df_train, args.training_type, 84 | output_cols=args.objective) 85 | val_pairs = PreprocessedDataset(df_val, args.training_type, 86 | output_cols=args.objective) 87 | 88 | train_iter = chainer.iterators.SerialIterator( 89 | train_pairs, args.batchsize) 90 | val_iter = chainer.iterators.SerialIterator( 91 | val_pairs, args.batchsize, repeat=False, shuffle=False) 92 | 93 | is_update_w2v = args.word_embedding == const.WORD2VEC_UPDATE 94 | encoder = ARCHS[args.arch](n_layers=args.layer, 95 | n_genre=n_genre, 96 | n_vocab=len(pretrained_word2vec.index2word), 97 | pretrained_w2v=pretrained_word2vec, 98 | is_update_w2v=is_update_w2v, 99 | dropout=args.dropout) 100 | model = MODEL_WRAPPERS[args.training_type]( 101 | encoder=encoder, dropout=args.dropout) 102 | 103 | if args.gpu >= 0: 104 | chainer.cuda.get_device_from_id(args.gpu).use() 105 | model.to_gpu() 106 | 107 | # Setup an optimizer 108 | optimizer = chainer.optimizers.Adam() 109 | optimizer.setup(model) 110 | optimizer.add_hook(chainer.optimizer.WeightDecay(args.weight_decay)) 111 | 112 | # Set up a trainer 113 | updater = training.updaters.StandardUpdater( 114 | train_iter, optimizer, device=args.gpu, 115 | converter=convert_seq) 116 | trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=str(res.output_dir)) 117 | 118 | # Evaluate the model with the test dataset for each epoch 119 | trainer.extend(extensions.Evaluator( 120 | val_iter, model, device=args.gpu, 121 | converter=convert_seq)) 122 | 123 | # Take a best snapshot 124 | record_trigger = setup_record_trigger(args.training_type) 125 | optim_trigger = setup_optim_trigger(args.training_type) 126 | 127 | model_fname = '{}_{}_{}-fold_{}_best_model.npz'.format( 128 | res.sdtime, args.training_type, i + 1, args.arch) 129 | trainer.extend(extensions.snapshot_object( 130 | model, model_fname), 131 | trigger=record_trigger) 132 | 133 | trainer.extend(extensions.ExponentialShift('alpha', 0.9), trigger=optim_trigger) 134 | 135 | # Write a log of evaluation statistics for each epoch 136 | trainer_log_name = '{}_{}_{}-fold_{}_reporter.json'.format( 137 | res.sdtime, args.training_type, i + 1, args.arch) 138 | trainer.extend(extensions.LogReport(log_name=trainer_log_name)) 139 | trainer.extend(extensions.observe_lr()) 140 | 141 | fig_loss_fpath = res.fig_loss_dir / '{}_{}_{}-fold_loss.png'.format( 142 | res.sdtime, args.training_type, i + 1) 143 | fig_loss_path = fig_loss_fpath.relative_to(res.output_dir) 144 | plot_loss_entries = setup_plot_report_loss_entries(args.training_type) 145 | trainer.extend(extensions.PlotReport(plot_loss_entries, 'epoch', 146 | file_name=str(fig_loss_path), grid=False)) 147 | 148 | entries = setup_print_report_entries(args.training_type) 149 | trainer.extend(extensions.PrintReport(entries)) 150 | 
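        # The printed entries come from setup_print_report_entries: single-task runs print only the
        # overall loss, while multi_regression additionally prints the per-task click and conversion losses.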
151 | trainer.extend(extensions.ProgressBar(update_interval=10)) 152 | 153 | # Run the training 154 | trainer.run() 155 | 156 | if args.test: 157 | res.logger.debug('Start test phase') 158 | snapshot_best_model = res.output_dir / model_fname 159 | chainer.serializers.load_npz(str(snapshot_best_model), model) 160 | res.logdebug('Load: {}'.format(str(snapshot_best_model))) 161 | 162 | calculated_scores = EVALUATE_PHASES[args.training_type]( 163 | res=res, model=model, pairs=val_pairs, fold=i + 1, 164 | converter=convert_seq, vectorizer=vectorizer).test() 165 | 166 | for metric in scores.keys(): 167 | scores[metric].append(calculated_scores[metric]) 168 | 169 | if not args.cv: 170 | break 171 | 172 | res.dump_command_info() 173 | logger = logzero.setup_logger( 174 | name='test', 175 | logfile=str(res.log_dir / f'{res.sdtime}_test.log'), 176 | ) 177 | for metric in scores.keys(): 178 | logger.info(f'Average {metric}: {np.mean(scores[metric]):.6f}, var: {np.var(scores[metric]):.6f}') 179 | 180 | res.dump_duration() 181 | 182 | notify_result(res) 183 | 184 | 185 | if __name__ == '__main__': 186 | 187 | args = parse_args() 188 | try: 189 | main(args) 190 | except Exception as err: 191 | if not args.debug: 192 | notify_exception(err) 193 | raise err 194 | -------------------------------------------------------------------------------- /src/util/args.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from models import ARCHS 4 | from util import const 5 | 6 | 7 | def common_args(): 8 | 9 | parser = argparse.ArgumentParser(description='training for analysing creatives') 10 | parser.add_argument('--seed', 11 | type=int, 12 | default=19950815) 13 | parser.add_argument('--test', 14 | action='store_true', 15 | default=False) 16 | parser.add_argument('--debug', 17 | action='store_true', 18 | default=False) 19 | parser.add_argument('--out', 20 | type=str, 21 | default='result') 22 | parser.add_argument('--arch', 23 | type=str, 24 | default='gru', 25 | choices=ARCHS.keys()) 26 | parser.add_argument('--gpu', 27 | type=int, 28 | default=-1) 29 | parser.add_argument('--fold', 30 | type=int, 31 | default=5) 32 | parser.add_argument('--group', 33 | action='store_true', 34 | default=False) 35 | parser.add_argument('--training_type', 36 | type=str, 37 | choices=const.TRAINING_TYPES, 38 | default=const.TRAINING_TYPES_REGRESSION) 39 | parser.add_argument('--workers', default=8, type=int) 40 | parser.add_argument('--resume', '-r', default='') 41 | 42 | return parser 43 | 44 | 45 | def parse_train_args(): 46 | 47 | parser = common_args() 48 | parser.add_argument('--epoch', 49 | type=int, 50 | default=50) 51 | parser.add_argument('--batchsize', 52 | type=int, 53 | default=32) 54 | parser.add_argument('--weight_decay', 55 | type=float, 56 | default=0.0001) 57 | parser.add_argument('--cv', 58 | action='store_true', 59 | default=False) 60 | parser.add_argument('--layer', 61 | type=int, 62 | default=1) 63 | parser.add_argument('--genre_unit', 64 | type=int, 65 | default=5) 66 | parser.add_argument('--dropout', 67 | type=float, 68 | default=0.2) 69 | parser.add_argument('--num_class', 70 | type=int, 71 | default=3) 72 | parser.add_argument('--word_embedding', 73 | choices=const.WORD2VEC_TYPES, 74 | default=const.WORD2VEC_UPDATE) 75 | 76 | parser.add_argument('--objective', 77 | choices=['conversion', 'cvr', 'click'], 78 | nargs='+', 79 | default=['conversion']) 80 | 81 | return parser.parse_args() 82 | 
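As a reading aid (not part of the repository), the options above map onto the model construction in `src/train_deep.py` roughly as follows. This is a minimal sketch that assumes the pretrained word2vec file from `util/const.py` is present and that the modules referenced but not included in this snapshot (`models/base_embedding_layer.py`, `models/mlp.py`, `util/preprocess.py`) are importable; `n_genre` is a placeholder for the value derived from the fitted `GenreTransformer`.

```python
# Sketch only: mirrors how src/train_deep.py turns parsed options into a model.
from gensim.models import KeyedVectors

from models import ARCHS, MODEL_WRAPPERS
from util import const
from util.args import parse_train_args

args = parse_train_args()  # e.g. --arch gru_attn_cond --training_type multi_regression

w2v = KeyedVectors.load_word2vec_format(str(const.PRETRAINED_WORD2VEC_FPATH), binary=True)

encoder = ARCHS[args.arch](
    n_layers=args.layer,
    n_vocab=len(w2v.index2word),
    n_genre=18,  # placeholder: len(vectorizer.named_steps.genre.le.classes_) in the real pipeline
    pretrained_w2v=w2v,
    is_update_w2v=(args.word_embedding == const.WORD2VEC_UPDATE),
    dropout=args.dropout)

model = MODEL_WRAPPERS[args.training_type](encoder=encoder, dropout=args.dropout)
```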
-------------------------------------------------------------------------------- /src/util/const.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | ROOT_DIR = pathlib.Path(__file__).parents[2] 4 | DATA_DIR = ROOT_DIR / 'data' 5 | 6 | TRAIN_DATA_FNAME = 'train.csv' 7 | TRAIN_DATA_FPATH = DATA_DIR / TRAIN_DATA_FNAME 8 | 9 | PRETRAINED_WORD2VEC_FNAME = 'entity_vector.model.bin' 10 | PRETRAINED_WORD2VEC_FPATH = DATA_DIR / PRETRAINED_WORD2VEC_FNAME 11 | 12 | NEOLOGD_DIR = '/var/lib/mecab/dic/mecab-ipadic-neologd' 13 | 14 | DATASET_FROM_DATE = '2017-08-01' 15 | DATASET_TO_DATE = '2018-08-01' 16 | 17 | PREPROCESS_GENDER_TARGET_NUM = 3 18 | 19 | NORMALIZED_IMPRESSION = 30000 20 | 21 | TRAINING_TYPES_REGRESSION = 'regression' 22 | TRAINING_TYPES_MULTI_REGRESSION = 'multi_regression' 23 | TRAINING_TYPES = [ 24 | TRAINING_TYPES_REGRESSION, 25 | TRAINING_TYPES_MULTI_REGRESSION, 26 | ] 27 | 28 | WORD2VEC_UPDATE = 'word2vec_update' 29 | WORD2VEC_FREEZE = 'word2vec_freeze' 30 | FROM_SCRATCH = 'scratch' 31 | WORD2VEC_TYPES = [ 32 | WORD2VEC_UPDATE, 33 | WORD2VEC_FREEZE, 34 | FROM_SCRATCH, 35 | ] 36 | 37 | EVALUATION_METRIC = { 38 | TRAINING_TYPES_REGRESSION: [ 39 | 'MSE_CV', 'MSE_gt_1', 'MAP_0', 'MAP_10', 'NDCG_CV', 40 | 'MSE_CV_top_50', 'MSE_CV_top_25', 'MSE_CV_top_10', 'MSE_CV_top_5', 'MSE_CV_top_1', 41 | 'NDCG_CV_top_50', 'NDCG_CV_top_25', 'NDCG_CV_top_10', 'NDCG_CV_top_5', 'NDCG_CV_top_1', 42 | ], 43 | TRAINING_TYPES_MULTI_REGRESSION: [ 44 | 'MSE_click', 'MSE_CV', 'MSE_multi', 'MSE_CVR', 'MAP_0', 'MAP_10', 'MAP_CVR', 'NDCG_CV', 'NDCG_CVR', 45 | 'MSE_CV_top_50', 'MSE_CV_top_25', 'MSE_CV_top_10', 'MSE_CV_top_5', 'MSE_CV_top_1', 46 | 'NDCG_CV_top_50', 'NDCG_CV_top_25', 'NDCG_CV_top_10', 'NDCG_CV_top_5', 'NDCG_CV_top_1', 47 | ], 48 | } 49 | -------------------------------------------------------------------------------- /src/util/cross_validation.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold 2 | 3 | 4 | def kfold_iter( 5 | X, y, n_splits, 6 | random_state, 7 | is_campaign_group, 8 | campaign_ids=None, 9 | training_type='classification'): 10 | 11 | if is_campaign_group and campaign_ids is not None: 12 | return GroupKFold(n_splits=n_splits).split(X, y, campaign_ids) 13 | 14 | if training_type == 'classification': 15 | return StratifiedKFold(n_splits=n_splits, 16 | random_state=random_state, 17 | shuffle=True).split(X, y) 18 | elif training_type in ('regression', 'multi_regression'): 19 | return KFold(n_splits=n_splits, 20 | random_state=random_state, 21 | shuffle=True).split(X, y) 22 | else: 23 | raise ValueError('Invalid training type: {}'.format(training_type)) 24 | -------------------------------------------------------------------------------- /src/util/extensions.py: -------------------------------------------------------------------------------- 1 | from chainer.training import triggers 2 | 3 | 4 | def setup_record_trigger(training_type): 5 | 6 | if training_type == 'regression' or training_type == 'multi_regression': 7 | return triggers.MinValueTrigger('validation/main/loss') 8 | 9 | else: 10 | raise ValueError('Invalid training type: {}'.format(training_type)) 11 | 12 | 13 | def setup_optim_trigger(training_type): 14 | 15 | if training_type == 'regression' or training_type == 'multi_regression': 16 | return triggers.MinValueTrigger('validation/main/loss') 17 | 18 | else: 19 | raise ValueError('Invalid training type: 
{}'.format(training_type)) 20 | 21 | 22 | def setup_print_report_entries(training_type): 23 | 24 | if training_type == 'regression': 25 | entries = [ 26 | 'epoch', 27 | 'main/loss', 'validation/main/loss', 28 | 'elapsed_time', 'lr', 29 | ] 30 | elif training_type == 'multi_regression': 31 | entries = [ 32 | 'epoch', 33 | 'main/loss', 'validation/main/loss', 34 | 'main/loss_click', 'validation/main/loss_click', 35 | 'main/loss_cv', 'validation/main/loss_cv', 36 | 'elapsed_time', 'lr', 37 | ] 38 | else: 39 | raise ValueError('Invalid training type: {}'.format(training_type)) 40 | 41 | return entries 42 | 43 | 44 | def setup_plot_report_loss_entries(training_type): 45 | 46 | if training_type == 'classification' or training_type == 'regression': 47 | entries = ['main/loss', 'val/main/loss'] 48 | 49 | elif training_type == 'multi_regression': 50 | entries = [ 51 | 'main/loss', 'validation/main/loss', 52 | 'main/loss_click', 'validation/main/loss_click', 53 | 'main/loss_cv', 'validation/main/loss_cv', 54 | ] 55 | else: 56 | raise ValueError('Invalid training type: {}'.format(training_type)) 57 | 58 | return entries 59 | -------------------------------------------------------------------------------- /src/util/load.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def load_data(dataset_fpath): 5 | 6 | df = pd.read_csv(str(dataset_fpath)) 7 | print(f'Raw data size: {df.shape}') 8 | 9 | df['cvr'] = df[['conversion', 'click']].apply( 10 | lambda x: (x[0] / x[1]) * 100 if x[1] > 0 else 0, axis=1) 11 | 12 | return df 13 | -------------------------------------------------------------------------------- /src/util/ndcg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def ndcg_score(y_true, y_pred, k=None, powered=False): 5 | def dcg(scores, k=None, powered=False): 6 | if k is None: 7 | k = scores.shape[0] 8 | if not powered: 9 | ret = scores[0] 10 | for i in range(1, k): 11 | ret += scores[i] / np.log2(i + 1) 12 | return ret 13 | else: 14 | ret = 0 15 | for i in range(k): 16 | ret += (2 ** scores[i] - 1) / np.log2(i + 2) 17 | return ret 18 | 19 | ideal_sorted_scores = np.sort(y_true)[::-1] 20 | ideal_dcg_score = dcg(ideal_sorted_scores, k=k, powered=powered) 21 | 22 | pred_sorted_ind = np.argsort(y_pred)[::-1] 23 | pred_sorted_scores = y_true[pred_sorted_ind] 24 | dcg_score = dcg(pred_sorted_scores, k=k, powered=powered) 25 | 26 | return dcg_score / ideal_dcg_score 27 | 28 | 29 | def ndcg1_score(y_true, y_pred, k=None): 30 | return ndcg_score(y_true, y_pred, k=k, powered=False) 31 | -------------------------------------------------------------------------------- /src/util/nlp_utils.py: -------------------------------------------------------------------------------- 1 | import chainer.functions as F 2 | import numpy as np 3 | 4 | 5 | def sequence_embed(embed, xs, dropout=0.): 6 | 7 | x_len = [len(x) for x in xs] 8 | x_section = np.cumsum(x_len[:-1]) 9 | 10 | ex = embed(F.concat(xs, axis=0)) 11 | ex = F.dropout(ex, ratio=dropout) 12 | exs = F.split_axis(ex, x_section, 0) 13 | return exs 14 | 15 | 16 | def block_embed(embed, x, dropout=0.): 17 | 18 | e = embed(x) 19 | e = F.dropout(e, ratio=dropout) 20 | e = F.transpose(e, (0, 2, 1)) 21 | e = e[:, :, :, None] 22 | return e 23 | -------------------------------------------------------------------------------- /src/util/preprocessed_dataset.py: 
-------------------------------------------------------------------------------- 1 | import chainer 2 | import numpy as np 3 | from chainer import cuda 4 | from sklearn.pipeline import Pipeline 5 | 6 | from util import const 7 | from util.transforms import ( 8 | GenderTransformer, 9 | GenreTransformer, 10 | LimitDataTransformer, 11 | MinMaxScaleTransformer, 12 | ToLogarithmTransformer, 13 | TypeConvertTransformer, 14 | Word2VecTransformer 15 | ) 16 | 17 | 18 | def prepare_vectorizer(pretrain_w2v, 19 | training_type, 20 | norm_imp=None, 21 | is_impnorm=False, 22 | is_logarithm=False, 23 | ): 24 | steps = [ 25 | ('lmit', LimitDataTransformer()), 26 | ('genre', GenreTransformer()), 27 | ('gender', GenderTransformer()), 28 | ('w2v', Word2VecTransformer( 29 | columns=['title_text', 'content_text'], 30 | pretrain_w2v=pretrain_w2v)), 31 | ] 32 | 33 | if is_logarithm: 34 | steps.extend([ 35 | ('log_click', ToLogarithmTransformer(column='click')), 36 | ('loc_cv', ToLogarithmTransformer(column='conversion')), 37 | ]) 38 | 39 | if training_type in [const.TRAINING_TYPES_REGRESSION, 40 | const.TRAINING_TYPES_MULTI_REGRESSION]: 41 | if is_impnorm: 42 | click_col = 'imp_norm_click' 43 | cv_col = 'imp_norm_conversion' 44 | else: 45 | click_col = 'click' 46 | cv_col = 'conversion' 47 | 48 | scaler = [ 49 | ('mm_click', MinMaxScaleTransformer(column=click_col)), 50 | ('mm_conversion', MinMaxScaleTransformer(column=cv_col)), 51 | ('mm_cvr', MinMaxScaleTransformer(column='cvr')), 52 | ] 53 | steps.extend(scaler) 54 | 55 | steps.extend([ 56 | ('type_convert', TypeConvertTransformer( 57 | columns=['product_id', 'genre'], 58 | dtype=np.int32)), 59 | ]) 60 | 61 | return Pipeline(steps) 62 | 63 | 64 | class PreprocessedDataset(chainer.dataset.DatasetMixin): 65 | 66 | def __init__(self, df, 67 | training_type, 68 | output_cols): 69 | 70 | self.X = df.drop(output_cols, axis=1) 71 | self.y = df[output_cols] 72 | 73 | self.training_type = training_type 74 | 75 | def __len__(self): 76 | return len(self.X) 77 | 78 | def get_example(self, i): 79 | 80 | X = self.X.iloc[i] 81 | y = self.y.iloc[i] 82 | 83 | X = X.to_dict() 84 | y = y.values.astype(np.float32) 85 | 86 | return X, y 87 | 88 | 89 | def convert_seq(batch, device=None, with_label=True): 90 | 91 | def to_device_batch(batch): 92 | if device is None: 93 | return batch 94 | elif device < 0: 95 | return [chainer.dataset.to_device(device, x) for x in batch] 96 | else: 97 | xp = cuda.cupy.get_array_module(*batch) 98 | concat = xp.concatenate(batch, axis=0) 99 | sections = np.cumsum([len(x) 100 | for x in batch[:-1]], dtype=np.int32) 101 | concat_dev = chainer.dataset.to_device(device, concat) 102 | batch_dev = cuda.cupy.split(concat_dev, sections) 103 | return batch_dev 104 | 105 | if with_label: 106 | return { 107 | 'product_xs': to_device_batch([[x['product_id']] for x, _ in batch]), 108 | 'genre_xs': to_device_batch([np.asarray([x['genre']]) for x, _ in batch]), 109 | 'gender_xs': to_device_batch([x['gender_target'] for x, _ in batch]), 110 | 'title_xs': to_device_batch([x['title_text'] for x, _ in batch]), 111 | 'content_xs': to_device_batch([x['content_text'] for x, _ in batch]), 112 | 'ys': to_device_batch([y for _, y in batch]), 113 | } 114 | 115 | else: 116 | return to_device_batch([x for x in batch]) 117 | -------------------------------------------------------------------------------- /src/util/resource.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import pathlib 4 | import 
socket 5 | import sys 6 | 7 | import logzero 8 | 9 | from util import const 10 | 11 | 12 | class Resource(object): 13 | 14 | def __init__(self, args, train=True): 15 | self.args = args 16 | self.start_time = datetime.datetime.now() 17 | self.logger = logzero.setup_default_logger() 18 | 19 | # test=False and train=False 20 | if not args.test and train: 21 | self.logger.warn('Test option is {}'.format(args.test)) 22 | 23 | # setup experiment directory 24 | self.output_dir = self._setup_output_dir() 25 | self.log_dir = self._setup_log_dir() 26 | 27 | if train: 28 | self.fig_dir = self._setup_fig_dir() 29 | log_filename = '{}_train.log'.format(self.sdtime) 30 | else: 31 | log_filename = '{}_inference.log'.format(self.sdtime) 32 | 33 | log_name = self.log_dir / log_filename 34 | 35 | logzero.logfile(str(log_name), loglevel=logging.INFO) 36 | 37 | self.log_name = log_name 38 | self.logger.info('Log filename: {}'.format(str(log_name))) 39 | self.logger.info('Server name: {}'.format(socket.gethostname())) 40 | self.dump_common_info() 41 | 42 | @property 43 | def stime(self): 44 | return self.start_time.strftime('%Y-%m-%d-%H-%M-%S') 45 | 46 | @property 47 | def sdtime(self): 48 | return self.start_time.strftime('%H-%M-%S') 49 | 50 | @property 51 | def sytime(self): 52 | return self.start_time.strftime('%Y-%m-%d') 53 | 54 | @property 55 | def duration(self): 56 | end_time = datetime.datetime.now() 57 | duration = end_time - self.start_time 58 | return duration 59 | 60 | def logdebug(self, msg): 61 | self.logger.debug(msg) 62 | 63 | def loginfo(self, msg): 64 | self.logger.info(msg) 65 | 66 | def _setup_output_dir(self): 67 | output_dir = pathlib.Path(self.args.out) / self.sytime 68 | 69 | if not output_dir.exists(): 70 | output_dir.mkdir(parents=True) 71 | self.logger.debug('Output dir is created at [{}]'.format(str(output_dir))) 72 | else: 73 | self.logger.debug('Output dir is already exists.'.format(str(output_dir))) 74 | 75 | return output_dir 76 | 77 | def _setup_log_dir(self): 78 | log_dir = self.output_dir / 'log' 79 | 80 | if not log_dir.exists(): 81 | log_dir.mkdir() 82 | self.logger.debug('Log dir is created at [{}]'.format(str(log_dir))) 83 | else: 84 | self.logger.debug('Log dir is already exists.'.format(str(log_dir))) 85 | 86 | return log_dir 87 | 88 | def _setup_fig_dir(self): 89 | fig_dir = self.output_dir / 'fig' 90 | 91 | self.fig_acc_dir = fig_dir / 'accuracy' 92 | self.fig_loss_dir = fig_dir / 'loss' 93 | self.fig_heatmap_dir = fig_dir / 'heatmap' 94 | self.fig_coef_dir = fig_dir / 'coef' 95 | 96 | if not fig_dir.exists(): 97 | self.logger.debug('Fig dir is created at [{}]'.format(str(fig_dir))) 98 | fig_dir.mkdir() 99 | # also make directories 100 | self.fig_acc_dir.mkdir() 101 | self.fig_loss_dir.mkdir() 102 | self.fig_heatmap_dir.mkdir() 103 | self.fig_coef_dir.mkdir() 104 | else: 105 | self.logger.debug('Fig dir is already exists.'.format(str(fig_dir))) 106 | 107 | return fig_dir 108 | 109 | def dump_common_info(self): 110 | logger = logzero.setup_logger( 111 | name='preprocess', 112 | logfile=str(self.log_dir / f'{self.sdtime}_preprocess.log')) 113 | 114 | logger.info('=== Common informations ===') 115 | logger.info('Model: {}'.format(self.args.arch)) 116 | logger.info('Word embedding: {}'.format(self.args.word_embedding)) 117 | logger.info('Training type: {}'.format(self.args.training_type)) 118 | 119 | logger.info('# of epoch: {}'.format(self.args.epoch)) 120 | logger.info('# of batchsize: {}'.format(self.args.batchsize)) 121 | logger.info('# of encoder layers: 
{}'.format(self.args.layer)) 122 | logger.info('# of genre embedding dim: {}'.format(self.args.genre_unit)) 123 | logger.info('Dropout ratio: {}'.format(self.args.dropout)) 124 | logger.info('Weight decay: {}'.format(self.args.weight_decay)) 125 | logger.info('GPU ID: {}'.format(self.args.gpu)) 126 | logger.info('Cross validation: {}, # of folds: {}'.format(self.args.cv, self.args.fold)) 127 | logger.info('Seed: {}'.format(self.args.seed)) 128 | logger.info('GroupKFold: {}'.format(self.args.group)) 129 | logger.info('Apply impression normalization: {}'.format(self.args.imp_norm)) 130 | logger.info('Target objective: {}'.format(self.args.objective)) 131 | 132 | def dump_duration(self): 133 | logger = logzero.setup_logger( 134 | name='test', 135 | logfile=str(self.log_dir / f'{self.sdtime}_test.log') 136 | ) 137 | end_time = datetime.datetime.now() 138 | logger.info('Exit time: {}'.format(end_time.strftime('%Y/%m/%d - %H:%M:%S'))) 139 | logger.info('Duration: {}'.format(self.duration)) 140 | logger.info('Remember: log is saved to {}'.format(str(self.log_name))) 141 | 142 | def dump_command_info(self): 143 | logger = logzero.setup_logger( 144 | name='test', 145 | logfile=str(self.log_dir / f'{self.sdtime}_test.log') 146 | ) 147 | logger.info('Command name: {}'.format(' '.join(sys.argv))) 148 | -------------------------------------------------------------------------------- /src/util/seed.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import chainer 4 | import numpy as np 5 | 6 | 7 | def reset_seed(seed): 8 | 9 | random.seed(seed) 10 | np.random.seed(seed) 11 | 12 | if chainer.cuda.available: 13 | chainer.cuda.cupy.random.seed(seed) 14 | -------------------------------------------------------------------------------- /src/util/transforms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.base import BaseEstimator, TransformerMixin 4 | from sklearn.preprocessing import LabelBinarizer, LabelEncoder, MinMaxScaler 5 | 6 | from util.preprocess import wakati 7 | 8 | 9 | class LimitDataTransformer(BaseEstimator, TransformerMixin): 10 | 11 | def __init__(self, num_imp): 12 | self.num_imp = num_imp 13 | 14 | def fit(self, X, *args): 15 | return self 16 | 17 | def transform(self, df): 18 | df = df[df['impression'] > self.num_imp] 19 | return df 20 | 21 | 22 | class WakatiTransformer(BaseEstimator, TransformerMixin): 23 | 24 | def __init__(self, column, wakati_func=wakati): 25 | self.column = column 26 | self.wakati_func = wakati 27 | 28 | def fit(self, X, *args): 29 | return self 30 | 31 | def transform(self, X): 32 | X[self.column] = X[self.column].apply(self.wakati_func) 33 | return X 34 | 35 | 36 | class Word2VecTransformer(BaseEstimator, TransformerMixin): 37 | 38 | def __init__(self, columns, pretrain_w2v): 39 | self.columns = columns 40 | self.w2v = pretrain_w2v 41 | self.w2i = {w: i for i, w in enumerate(pretrain_w2v.index2word)} 42 | 43 | def fit(self, X, *args): 44 | return self 45 | 46 | def word2id(self, words): 47 | return np.asarray([ 48 | self.w2i[w] for w in words 49 | if w in self.w2v.vocab], dtype=np.int32) 50 | 51 | def transform(self, X): 52 | for col in self.columns: 53 | X[col] = X[col].apply(self.word2id) 54 | return X 55 | 56 | 57 | class GenreTransformer(BaseEstimator, TransformerMixin): 58 | 59 | def __init__(self): 60 | self.le = None 61 | 62 | def fit(self, X, *args): 63 | self.le = LabelEncoder().fit(X['genre']) 64 | 
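        # The LabelEncoder fitted here defines the genre vocabulary; transform() below maps each
        # genre to the integer id consumed by the genre embedding in the encoders.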
return self 65 | 66 | def transform(self, X): 67 | X['genre'] = self.le.transform(X['genre']) 68 | return X 69 | 70 | 71 | class GenderTransformer(BaseEstimator, TransformerMixin): 72 | 73 | def __init__(self): 74 | self.lb = None 75 | 76 | def fit(self, X, *args): 77 | self.lb = LabelBinarizer().fit(X['gender_target']) 78 | return self 79 | 80 | def transform(self, X): 81 | X['gender_target'] = self.lb.transform(X['gender_target']).tolist() 82 | X['gender_target'] = X['gender_target'].apply( 83 | lambda x: np.asarray(x, dtype=np.float32)) 84 | return X 85 | 86 | 87 | class MinMaxScaleTransformer(BaseEstimator, TransformerMixin): 88 | 89 | def __init__(self, column): 90 | self.column = column 91 | self.mm = None 92 | 93 | def fit(self, X, *args): 94 | self.mm = MinMaxScaler().fit(X[[self.column]]) 95 | return self 96 | 97 | def transform(self, X): 98 | X[self.column] = self.mm.transform(X[[self.column]]) 99 | return X 100 | 101 | 102 | class ToLogarithmTransformer(BaseEstimator, TransformerMixin): 103 | 104 | def __init__(self, column): 105 | self.column = column 106 | 107 | def fit(self, X, *args): 108 | return self 109 | 110 | def transform(self, X): 111 | X[self.column] = np.log1p(X[self.column]) 112 | return X 113 | 114 | def inverse_transform(self, X): 115 | X[self.column] = np.expm1(X[self.column]) 116 | return X 117 | 118 | 119 | class TypeConvertTransformer(BaseEstimator, TransformerMixin): 120 | 121 | def __init__(self, columns, dtype): 122 | self.columns = columns 123 | self.dtype = dtype 124 | 125 | def fit(self, X, *args): 126 | return self 127 | 128 | def transform(self, X): 129 | for col in self.columns: 130 | X[col] = X[col].astype(self.dtype) 131 | return X 132 | --------------------------------------------------------------------------------
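To make the target preprocessing above concrete, here is a self-contained toy example (plain pandas/scikit-learn rather than the repository's transformers, with made-up numbers) of what `ToLogarithmTransformer` followed by `MinMaxScaleTransformer` do to the click and conversion targets:

```python
# Toy illustration of the target preprocessing: log1p followed by min-max scaling.
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

df = pd.DataFrame({'click': [0, 10, 250, 4000], 'conversion': [0, 1, 12, 90]})

for col in ('click', 'conversion'):
    df[col] = np.log1p(df[col])                                # ToLogarithmTransformer
    df[col] = MinMaxScaler().fit_transform(df[[col]]).ravel()  # MinMaxScaleTransformer

print(df)  # both targets now lie in [0, 1], matching the sigmoid outputs of Regressor / MultiTaskRegressor
```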