├── .gitignore ├── Dockerfile ├── README.md └── src ├── models ├── __init__.py ├── gru.py └── regressor.py ├── train_deep.py └── util ├── args.py ├── const.py ├── cross_validation.py ├── extensions.py ├── load.py ├── ndcg.py ├── nlp_utils.py ├── preprocessed_dataset.py ├── resource.py ├── seed.py └── transforms.py /.gitignore: -------------------------------------------------------------------------------- 1 | # gitignore for gunosy-creative-evaluation 2 | *.csv 3 | *.png 4 | *.jpg 5 | *.pkl 6 | *.zip 7 | entity_vector.model.bin 8 | 9 | # for TeXs 10 | *.cut 11 | *.aux 12 | *.out 13 | *.synctex.gz 14 | *.xcp 15 | 16 | # Byte-compiled / optimized / DLL files 17 | __pycache__/ 18 | *.py[cod] 19 | *$py.class 20 | 21 | # C extensions 22 | *.so 23 | 24 | # Distribution / packaging 25 | .Python 26 | build/ 27 | develop-eggs/ 28 | dist/ 29 | downloads/ 30 | eggs/ 31 | .eggs/ 32 | lib/ 33 | lib64/ 34 | parts/ 35 | sdist/ 36 | var/ 37 | wheels/ 38 | *.egg-info/ 39 | .installed.cfg 40 | *.egg 41 | MANIFEST 42 | 43 | # PyInstaller 44 | # Usually these files are written by a python script from a template 45 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 46 | *.manifest 47 | *.spec 48 | 49 | # Installer logs 50 | pip-log.txt 51 | pip-delete-this-directory.txt 52 | 53 | # Unit test / coverage reports 54 | htmlcov/ 55 | .tox/ 56 | .coverage 57 | .coverage.* 58 | .cache 59 | nosetests.xml 60 | coverage.xml 61 | *.cover 62 | .hypothesis/ 63 | .pytest_cache/ 64 | 65 | # Translations 66 | *.mo 67 | *.pot 68 | 69 | # Django stuff: 70 | *.log 71 | local_settings.py 72 | db.sqlite3 73 | 74 | # Flask stuff: 75 | instance/ 76 | .webassets-cache 77 | 78 | # Scrapy stuff: 79 | .scrapy 80 | 81 | # Sphinx documentation 82 | docs/_build/ 83 | 84 | # PyBuilder 85 | target/ 86 | 87 | # Jupyter Notebook 88 | .ipynb_checkpoints 89 | 90 | # IPython 91 | profile_default/ 92 | ipython_config.py 93 | 94 | # pyenv 95 | .python-version 96 | 97 | # celery beat schedule file 98 | celerybeat-schedule 99 | 100 | # SageMath parsed files 101 | *.sage.py 102 | 103 | # Environments 104 | .env 105 | .venv 106 | env/ 107 | venv/ 108 | ENV/ 109 | env.bak/ 110 | venv.bak/ 111 | 112 | # Spyder project settings 113 | .spyderproject 114 | .spyproject 115 | 116 | # Rope project settings 117 | .ropeproject 118 | 119 | # mkdocs documentation 120 | /site 121 | 122 | # mypy 123 | .mypy_cache/ 124 | .dmypy.json 125 | dmypy.json -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | From nvidia/cuda:9.2-cudnn7-devel-ubuntu16.04 2 | 3 | ENV MYSQL_PWD root 4 | RUN echo "mysql-server mysql-server/root_password password $MYSQL_PWD" | debconf-set-selections 5 | RUN echo "mysql-server mysql-server/root_password_again password $MYSQL_PWD" | debconf-set-selections 6 | 7 | RUN apt-get update && apt-get install -y \ 8 | software-properties-common && \ 9 | add-apt-repository ppa:jonathonf/python-3.6 -y && \ 10 | apt-get -y update 11 | 12 | RUN apt-get install -y \ 13 | build-essential \ 14 | tmux \ 15 | python3.6 \ 16 | python3.6-dev \ 17 | python3-pip \ 18 | python3-wheel \ 19 | python3-setuptools \ 20 | python3-tk \ 21 | mysql-client \ 22 | mysql-server \ 23 | libmysqlclient-dev \ 24 | libssl-dev \ 25 | sudo \ 26 | mecab \ 27 | libmecab-dev \ 28 | mecab-ipadic-utf8 \ 29 | git \ 30 | make \ 31 | curl \ 32 | xz-utils \ 33 | file \ 34 | swig \ 35 | language-pack-ja-base \ 36 | language-pack-ja \ 37 | 
locales \ 38 | && locale-gen ja_JP.UTF-8 \ 39 | && localedef -f UTF-8 -i ja_JP ja_JP 40 | 41 | ENV TZ Asia/Tokyo 42 | ENV LANG ja_JP.UTF-8 43 | ENV LANGUAGE ja_JP:jp 44 | ENV LC_ALL ja_JP.UTF-8 45 | RUN ln -fns /usr/bin/python3.6 /usr/bin/python && \ 46 | ln -fns /usr/bin/python3.6 /usr/bin/python3 && \ 47 | ln -fns /usr/bin/pip3 /usr/bin/pip 48 | 49 | # install chainer and cupy 50 | RUN pip install --no-cache-dir cupy-cuda92 chainer 51 | 52 | # install mecab-ipadic-neologd 53 | RUN git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git && \ 54 | cd mecab-ipadic-neologd && \ 55 | bin/install-mecab-ipadic-neologd -n -y -p /var/lib/mecab/dic/mecab-ipadic-neologd 56 | 57 | # install mecab-python3 58 | RUN pip install --no-cache-dir mecab-python3 59 | 60 | # settings for Japanese 61 | # RUN update-locale LANG=ja_JP.UTF-8 LANGUAGE=ja_JP:ja 62 | 63 | RUN pip install --no-cache-dir jupyterlab 64 | EXPOSE 8888 65 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Multi-task Conditional Attention Networks 2 | 3 | A prototype version of our submitted paper: `Conversion Prediction Using Multi-task Conditional Attention Networks to Support the Creation of Effective Ad Creatives`. 4 | 5 | ## Setup using Docker 6 | 7 | ```shell 8 | $ docker build -t multi-task-cond-net-env . 9 | $ docker create -it -v /data:/data --name datavolume busybox 10 | $ docker run -it -p 8888:8888 --runtime=nvidia --volumes-from datavolume --rm --name multi-task-cond-net multi-task-cond-net-env 11 | ``` 12 | -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- 1 | from models import gru 2 | from models.mlp import MLPEncoder 3 | from models.regressor import MultiTaskRegressor, Regressor 4 | 5 | ARCHS = { 6 | 'mlp': MLPEncoder, 7 | 'gru': gru.GRUEncoder, 8 | 'gru_attn': gru.AttentionGRUEncoder, 9 | 'gru_attn_cond': gru.ConditionalAttentionGRUEncoder, 10 | 'gru_attn_word_cond': gru.ConditionalWordAttentionGRUEncoder, 11 | } 12 | 13 | MODEL_WRAPPERS = { 14 | 'regression': Regressor, 15 | 'multi_regression': MultiTaskRegressor, 16 | } 17 | -------------------------------------------------------------------------------- /src/models/gru.py: -------------------------------------------------------------------------------- 1 | import chainer 2 | import chainer.functions as F 3 | import chainer.links as L 4 | 5 | from models.base_embedding_layer import BaseEmbeddingLayer 6 | from util import const 7 | 8 | 9 | class GRUEncoder(chainer.Chain): 10 | 11 | def __init__(self, 12 | n_layers, 13 | n_vocab, 14 | n_genre, 15 | pretrained_w2v, 16 | is_update_w2v, 17 | dropout, 18 | genre_units=5): 19 | 20 | super(GRUEncoder, self).__init__() 21 | with self.init_scope(): 22 | self.base_embedding_layer = BaseEmbeddingLayer( 23 | n_vocab=n_vocab, 24 | n_genre=n_genre, genre_units=genre_units, 25 | pretrained_w2v=pretrained_w2v, 26 | is_update_w2v=is_update_w2v, 27 | dropout=dropout) 28 | 29 | self.title_encoder = L.NStepGRU( 30 | n_layers, 31 | self.base_embedding_layer.n_units, 32 | self.base_embedding_layer.n_units, dropout) 33 | self.content_encoder = L.NStepGRU( 34 | n_layers, 35 | self.base_embedding_layer.n_units, 36 | self.base_embedding_layer.n_units, dropout) 37 | 38 | self.out_units = self.base_embedding_layer.n_units * 2 \ 39 | + genre_units \ 40 | + 
const.PREPROCESS_GENDER_TARGET_NUM 41 | 42 | self.n_layers = n_layers 43 | self.dropout = dropout 44 | 45 | def forward(self, 46 | genre_xs, 47 | gender_xs, 48 | title_xs, 49 | content_xs, 50 | **kwargs): 51 | 52 | embeddings = self.base_embedding_layer( 53 | title_xs=title_xs, content_xs=content_xs, 54 | genre_xs=genre_xs) 55 | title_exs, content_exs, genre_exs = embeddings 56 | gender_exs = F.stack(gender_xs) 57 | 58 | last_title_h, title_ys = self.title_encoder(None, title_exs) 59 | last_content_h, content_ys = self.content_encoder(None, content_exs) 60 | 61 | concat_outputs = F.concat(( 62 | genre_exs, 63 | gender_exs, 64 | last_title_h[-1], 65 | last_content_h[-1], 66 | )) 67 | 68 | return concat_outputs 69 | 70 | 71 | class AttentionGRUEncoder(GRUEncoder): 72 | 73 | def __init__(self, 74 | n_layers, 75 | n_vocab, 76 | n_genre, 77 | pretrained_w2v, 78 | is_update_w2v, 79 | dropout, 80 | genre_units=5): 81 | 82 | super(AttentionGRUEncoder, self).__init__( 83 | n_layers=n_layers, 84 | n_vocab=n_vocab, 85 | n_genre=n_genre, 86 | pretrained_w2v=pretrained_w2v, 87 | is_update_w2v=is_update_w2v, 88 | dropout=dropout, 89 | genre_units=genre_units) 90 | 91 | with self.init_scope(): 92 | self.attn_title = L.Linear(self.base_embedding_layer.n_units, 1) 93 | self.attn_content = L.Linear(self.base_embedding_layer.n_units, 1) 94 | 95 | def calc_attention(self, xs, ys, attn_linear): 96 | 97 | concat_ys = F.concat(ys, axis=0) 98 | attn_ys = attn_linear(F.tanh(concat_ys)) 99 | 100 | cumsum_ys = self.xp.cumsum(self.xp.array([len(x) for x in xs], dtype=self.xp.int32)) 101 | 102 | split_attn_ys = F.split_axis(attn_ys, cumsum_ys[:-1].tolist(), axis=0) 103 | split_attn_ys_pad = F.pad_sequence(split_attn_ys, padding=-1024) 104 | attn_softmax = F.softmax(split_attn_ys_pad, axis=1) 105 | 106 | return attn_softmax 107 | 108 | def apply_attention(self, ys, attn_softmax): 109 | batchsize = len(ys) 110 | 111 | ys_pad = F.pad_sequence(ys, padding=0.0) 112 | ys_pad_reshape = F.reshape(ys_pad, (-1, ys_pad.shape[-1])) 113 | 114 | attn_softmax_reshape = F.broadcast_to( 115 | F.reshape(attn_softmax, (-1, attn_softmax.shape[-1])), ys_pad_reshape.shape) 116 | 117 | attn_hidden = ys_pad_reshape * attn_softmax_reshape 118 | attn_hidden_reshape = F.reshape(attn_hidden, (batchsize, -1, attn_hidden.shape[-1])) 119 | 120 | return F.sum(attn_hidden_reshape, axis=1) 121 | 122 | def forward(self, 123 | genre_xs, 124 | gender_xs, 125 | title_xs, 126 | content_xs, 127 | **kwargs): 128 | 129 | embedding = self.base_embedding_layer( 130 | title_xs=title_xs, content_xs=content_xs, 131 | genre_xs=genre_xs) 132 | title_exs, content_exs, genre_exs = embedding 133 | gender_exs = F.stack(gender_xs) 134 | 135 | last_title_h, title_ys = self.title_encoder(None, title_exs) 136 | last_content_h, content_ys = self.content_encoder(None, content_exs) 137 | 138 | attn_title = self.calc_attention(title_xs, title_ys, self.attn_title) 139 | attn_title_h = self.apply_attention(title_ys, attn_title) 140 | 141 | attn_content = self.calc_attention(content_xs, content_ys, self.attn_content) 142 | attn_content_h = self.apply_attention(content_ys, attn_content) 143 | 144 | concat_outputs = F.concat(( 145 | genre_exs, 146 | gender_exs, 147 | attn_title_h, 148 | attn_content_h, 149 | )) 150 | 151 | return concat_outputs 152 | 153 | 154 | class ConditionalAttentionGRUEncoder(AttentionGRUEncoder): 155 | 156 | def __init__(self, 157 | n_layers, 158 | n_vocab, 159 | n_genre, 160 | pretrained_w2v, 161 | is_update_w2v, 162 | dropout, 163 | genre_units=5): 164 | 
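        # Reuses the attention GRU encoder and adds proj_cond (defined a few lines below): a linear
        # projection that maps the concatenated genre and gender features to a per-example scalar,
        # which rescales the attention logits before the softmax in calc_attention().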
165 | super(ConditionalAttentionGRUEncoder, self).__init__( 166 | n_layers=n_layers, 167 | n_vocab=n_vocab, 168 | n_genre=n_genre, 169 | pretrained_w2v=pretrained_w2v, 170 | is_update_w2v=is_update_w2v, 171 | dropout=dropout, 172 | genre_units=genre_units) 173 | 174 | with self.init_scope(): 175 | self.proj_cond = L.Linear(None, 1, nobias=True) 176 | 177 | def calc_attention(self, xs, ys, genre_exs, gender_exs, attn_linear): 178 | 179 | concat_ys = F.concat(ys, axis=0) # -> (total len of batched sentence, word embedding dim) 180 | attn_ys = attn_linear(F.tanh(concat_ys)) 181 | cond_feature = self.proj_cond(F.concat((genre_exs, gender_exs))) # -> (batchsize, proj_cond dim) 182 | 183 | cumsum_ys = self.xp.cumsum(self.xp.array([len(x) for x in xs], dtype=self.xp.int32)) 184 | split_attn_ys = F.split_axis(attn_ys, cumsum_ys[:-1].tolist(), axis=0) 185 | split_attn_ys_pad = F.pad_sequence(split_attn_ys, padding=-1024) 186 | 187 | bool_cond = split_attn_ys_pad.array == -1024 188 | split_attn_ys_pad = split_attn_ys_pad * F.expand_dims( 189 | F.broadcast_to(cond_feature, (split_attn_ys_pad.shape[:-1])), axis=-1) 190 | 191 | padding_array = self.xp.full(split_attn_ys_pad.shape, -1024, dtype=self.xp.float32) 192 | 193 | split_attn_ys_pad = F.where(bool_cond, padding_array, split_attn_ys_pad) 194 | 195 | attn_softmax = F.softmax(split_attn_ys_pad, axis=1) 196 | 197 | return attn_softmax 198 | 199 | def apply_attention(self, ys, attn_softmax): 200 | batchsize = len(ys) 201 | 202 | ys_pad = F.pad_sequence(ys, padding=0.0) 203 | ys_pad_reshape = F.reshape(ys_pad, (-1, ys_pad.shape[-1])) 204 | 205 | attn_softmax_reshape = F.broadcast_to( 206 | F.reshape(attn_softmax, (-1, attn_softmax.shape[-1])), ys_pad_reshape.shape) 207 | 208 | attn_hidden = ys_pad_reshape * attn_softmax_reshape 209 | attn_hidden_reshape = F.reshape(attn_hidden, (batchsize, -1, attn_hidden.shape[-1])) 210 | 211 | return F.sum(attn_hidden_reshape, axis=1) 212 | 213 | def forward(self, 214 | genre_xs, 215 | gender_xs, 216 | title_xs, 217 | content_xs, 218 | **kwargs): 219 | 220 | embedding = self.base_embedding_layer( 221 | title_xs=title_xs, content_xs=content_xs, 222 | genre_xs=genre_xs) 223 | title_exs, content_exs, genre_exs = embedding 224 | gender_exs = F.stack(gender_xs) 225 | 226 | last_title_h, title_ys = self.title_encoder(None, title_exs) 227 | last_content_h, content_ys = self.content_encoder(None, content_exs) 228 | 229 | attn_title = self.calc_attention(title_xs, title_ys, genre_exs, 230 | gender_exs, self.attn_title) 231 | attn_title_h = self.apply_attention(title_ys, attn_title) 232 | 233 | attn_content = self.calc_attention(content_xs, content_ys, genre_exs, 234 | gender_exs, self.attn_content) 235 | attn_content_h = self.apply_attention(content_ys, attn_content) 236 | 237 | concat_outputs = F.concat(( 238 | genre_exs, 239 | gender_exs, 240 | attn_title_h, 241 | attn_content_h, 242 | )) 243 | 244 | return concat_outputs 245 | -------------------------------------------------------------------------------- /src/models/regressor.py: -------------------------------------------------------------------------------- 1 | import chainer 2 | import chainer.functions as F 3 | import chainer.links as L 4 | from chainer import reporter 5 | 6 | 7 | class Regressor(chainer.Chain): 8 | 9 | def __init__(self, encoder, dropout=0.): 10 | super(Regressor, self).__init__() 11 | with self.init_scope(): 12 | self.encoder = encoder 13 | self.output = L.Linear(encoder.out_units, 1) 14 | 15 | self.dropout = dropout 16 | 17 | def forward(self, 
ys, **kwargs): 18 | 19 | concat_outputs = F.concat(self.predict(**kwargs), axis=0) 20 | concat_truths = F.concat(ys, axis=0) 21 | 22 | loss = F.mean_squared_error(concat_outputs, concat_truths) 23 | reporter.report({'loss': loss.data}, self) 24 | 25 | return loss 26 | 27 | def predict(self, **kwargs): 28 | 29 | concat_encodings = F.dropout(self.encoder(**kwargs), ratio=self.dropout) 30 | concat_outputs = F.sigmoid(self.output(concat_encodings)) 31 | 32 | return concat_outputs 33 | 34 | 35 | class MultiTaskRegressor(chainer.Chain): 36 | 37 | def __init__(self, encoder, dropout=0.): 38 | super(MultiTaskRegressor, self).__init__() 39 | with self.init_scope(): 40 | self.encoder = encoder 41 | self.output = L.Linear(encoder.out_units, 2) 42 | 43 | self.dropout = dropout 44 | 45 | def forward(self, ys, **kwargs): 46 | 47 | pred_click, pred_cv = self.predict(**kwargs) 48 | ys = F.stack(ys) 49 | true_click, true_cv = ys[:, 0], ys[:, 1] 50 | 51 | loss_click = F.mean_squared_error(pred_click, true_click) 52 | loss_cv = F.mean_squared_error(pred_cv, true_cv) 53 | loss = loss_click + loss_cv 54 | 55 | reporter.report({'loss': loss.data}, self) 56 | reporter.report({'loss_click': loss_click.data}, self) 57 | reporter.report({'loss_cv': loss_cv.data}, self) 58 | 59 | return loss 60 | 61 | def predict(self, **kwargs): 62 | 63 | concat_encodings = F.dropout(self.encoder(**kwargs), ratio=self.dropout) 64 | output = F.sigmoid(self.output(concat_encodings)) 65 | output_click, output_cv = output[:, 0], output[:, 1] 66 | 67 | return output_click, output_cv 68 | -------------------------------------------------------------------------------- /src/train_deep.py: -------------------------------------------------------------------------------- 1 | import matplotlib # NOQA # isort:skip 2 | matplotlib.use("Agg") # NOQA # isort:skip 3 | 4 | from collections import OrderedDict 5 | 6 | import chainer 7 | import logzero 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | from chainer import training 11 | from chainer.training import extensions 12 | from gensim.models import KeyedVectors 13 | 14 | from models import ARCHS, MODEL_WRAPPERS 15 | from util import const 16 | from util.args import parse_train_args as parse_args 17 | from util.cross_validation import kfold_iter 18 | from util.evaluate import EVALUATE_PHASES 19 | from util.extensions import ( 20 | setup_optim_trigger, 21 | setup_plot_report_loss_entries, 22 | setup_print_report_entries, 23 | setup_record_trigger 24 | ) 25 | from util.load import load_data 26 | from util.notify import notify_exception, notify_result 27 | from util.preprocessed_dataset import ( 28 | PreprocessedDataset, 29 | convert_seq, 30 | prepare_vectorizer 31 | ) 32 | from util.resource import Resource 33 | from util.seed import reset_seed 34 | 35 | plt.style.use('ggplot') 36 | chainer.cuda.set_max_workspace_size(1024 * 1024 * 1024) 37 | chainer.global_config.autotune = True 38 | 39 | 40 | def main(args): 41 | 42 | reset_seed(args.seed) 43 | 44 | res = Resource(args, train=True) 45 | 46 | pretrained_word2vec = KeyedVectors.load_word2vec_format( 47 | str(const.PRETRAINED_WORD2VEC_FPATH), binary=True) 48 | 49 | vectorizer = prepare_vectorizer(pretrained_word2vec, 50 | args.training_type, 51 | norm_imp=const.NORMALIZED_IMPRESSION, 52 | is_impnorm=args.imp_norm, 53 | is_logarithm=True) 54 | 55 | df = load_data(const.TRAIN_DATA_FPATH) 56 | 57 | kf = kfold_iter(X=df, y=df[args.objective], 58 | n_splits=args.fold, 59 | random_state=args.seed, 60 | is_campaign_group=args.group, 61 | 
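                    # When --group is set, kfold_iter switches to GroupKFold keyed on campaign_id,
                    # so creatives from the same campaign never appear in both the training and
                    # validation folds (see util/cross_validation.py).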
training_type=args.training_type, 62 | campaign_ids=df['campaign_id'].values) 63 | 64 | scores = OrderedDict([(metric, []) for metric 65 | in const.EVALUATION_METRIC[args.training_type]]) 66 | 67 | res.loginfo('Start training') 68 | for i, (train_idx, val_idx) in enumerate(kf): 69 | res.loginfo('Fold: {}'.format(i + 1)) 70 | 71 | df_train = df.iloc[train_idx] 72 | df_val = df.iloc[val_idx] 73 | 74 | df_train = vectorizer.fit_transform(df_train) 75 | df_val = vectorizer.transform(df_val) 76 | 77 | n_genre = len(vectorizer.named_steps.genre.le.classes_) 78 | n_gender = len(vectorizer.named_steps.gender.lb.classes_) 79 | 80 | res.logdebug('# of genres: {}'.format(n_genre)) 81 | res.logdebug('# of gender: {}'.format(n_gender)) 82 | 83 | train_pairs = PreprocessedDataset(df_train, args.training_type, 84 | output_cols=args.objective) 85 | val_pairs = PreprocessedDataset(df_val, args.training_type, 86 | output_cols=args.objective) 87 | 88 | train_iter = chainer.iterators.SerialIterator( 89 | train_pairs, args.batchsize) 90 | val_iter = chainer.iterators.SerialIterator( 91 | val_pairs, args.batchsize, repeat=False, shuffle=False) 92 | 93 | is_update_w2v = args.word_embedding == const.WORD2VEC_UPDATE 94 | encoder = ARCHS[args.arch](n_layers=args.layer, 95 | n_genre=n_genre, 96 | n_vocab=len(pretrained_word2vec.index2word), 97 | pretrained_w2v=pretrained_word2vec, 98 | is_update_w2v=is_update_w2v, 99 | dropout=args.dropout) 100 | model = MODEL_WRAPPERS[args.training_type]( 101 | encoder=encoder, dropout=args.dropout) 102 | 103 | if args.gpu >= 0: 104 | chainer.cuda.get_device_from_id(args.gpu).use() 105 | model.to_gpu() 106 | 107 | # Setup an optimizer 108 | optimizer = chainer.optimizers.Adam() 109 | optimizer.setup(model) 110 | optimizer.add_hook(chainer.optimizer.WeightDecay(args.weight_decay)) 111 | 112 | # Set up a trainer 113 | updater = training.updaters.StandardUpdater( 114 | train_iter, optimizer, device=args.gpu, 115 | converter=convert_seq) 116 | trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=str(res.output_dir)) 117 | 118 | # Evaluate the model with the test dataset for each epoch 119 | trainer.extend(extensions.Evaluator( 120 | val_iter, model, device=args.gpu, 121 | converter=convert_seq)) 122 | 123 | # Take a best snapshot 124 | record_trigger = setup_record_trigger(args.training_type) 125 | optim_trigger = setup_optim_trigger(args.training_type) 126 | 127 | model_fname = '{}_{}_{}-fold_{}_best_model.npz'.format( 128 | res.sdtime, args.training_type, i + 1, args.arch) 129 | trainer.extend(extensions.snapshot_object( 130 | model, model_fname), 131 | trigger=record_trigger) 132 | 133 | trainer.extend(extensions.ExponentialShift('alpha', 0.9), trigger=optim_trigger) 134 | 135 | # Write a log of evaluation statistics for each epoch 136 | trainer_log_name = '{}_{}_{}-fold_{}_reporter.json'.format( 137 | res.sdtime, args.training_type, i + 1, args.arch) 138 | trainer.extend(extensions.LogReport(log_name=trainer_log_name)) 139 | trainer.extend(extensions.observe_lr()) 140 | 141 | fig_loss_fpath = res.fig_loss_dir / '{}_{}_{}-fold_loss.png'.format( 142 | res.sdtime, args.training_type, i + 1) 143 | fig_loss_path = fig_loss_fpath.relative_to(res.output_dir) 144 | plot_loss_entries = setup_plot_report_loss_entries(args.training_type) 145 | trainer.extend(extensions.PlotReport(plot_loss_entries, 'epoch', 146 | file_name=str(fig_loss_path), grid=False)) 147 | 148 | entries = setup_print_report_entries(args.training_type) 149 | trainer.extend(extensions.PrintReport(entries)) 150 | 
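        # The printed entries come from setup_print_report_entries: single-task runs print only the
        # overall loss, while multi_regression additionally prints the per-task click and conversion losses.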
151 | trainer.extend(extensions.ProgressBar(update_interval=10)) 152 | 153 | # Run the training 154 | trainer.run() 155 | 156 | if args.test: 157 | res.logger.debug('Start test phase') 158 | snapshot_best_model = res.output_dir / model_fname 159 | chainer.serializers.load_npz(str(snapshot_best_model), model) 160 | res.logdebug('Load: {}'.format(str(snapshot_best_model))) 161 | 162 | calculated_scores = EVALUATE_PHASES[args.training_type]( 163 | res=res, model=model, pairs=val_pairs, fold=i + 1, 164 | converter=convert_seq, vectorizer=vectorizer).test() 165 | 166 | for metric in scores.keys(): 167 | scores[metric].append(calculated_scores[metric]) 168 | 169 | if not args.cv: 170 | break 171 | 172 | res.dump_command_info() 173 | logger = logzero.setup_logger( 174 | name='test', 175 | logfile=str(res.log_dir / f'{res.sdtime}_test.log'), 176 | ) 177 | for metric in scores.keys(): 178 | logger.info(f'Average {metric}: {np.mean(scores[metric]):.6f}, var: {np.var(scores[metric]):.6f}') 179 | 180 | res.dump_duration() 181 | 182 | notify_result(res) 183 | 184 | 185 | if __name__ == '__main__': 186 | 187 | args = parse_args() 188 | try: 189 | main(args) 190 | except Exception as err: 191 | if not args.debug: 192 | notify_exception(err) 193 | raise err 194 | -------------------------------------------------------------------------------- /src/util/args.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from models import ARCHS 4 | from util import const 5 | 6 | 7 | def common_args(): 8 | 9 | parser = argparse.ArgumentParser(description='training for analysing creatives') 10 | parser.add_argument('--seed', 11 | type=int, 12 | default=19950815) 13 | parser.add_argument('--test', 14 | action='store_true', 15 | default=False) 16 | parser.add_argument('--debug', 17 | action='store_true', 18 | default=False) 19 | parser.add_argument('--out', 20 | type=str, 21 | default='result') 22 | parser.add_argument('--arch', 23 | type=str, 24 | default='gru', 25 | choices=ARCHS.keys()) 26 | parser.add_argument('--gpu', 27 | type=int, 28 | default=-1) 29 | parser.add_argument('--fold', 30 | type=int, 31 | default=5) 32 | parser.add_argument('--group', 33 | action='store_true', 34 | default=False) 35 | parser.add_argument('--training_type', 36 | type=str, 37 | choices=const.TRAINING_TYPES, 38 | default=const.TRAINING_TYPES_REGRESSION) 39 | parser.add_argument('--workers', default=8, type=int) 40 | parser.add_argument('--resume', '-r', default='') 41 | 42 | return parser 43 | 44 | 45 | def parse_train_args(): 46 | 47 | parser = common_args() 48 | parser.add_argument('--epoch', 49 | type=int, 50 | default=50) 51 | parser.add_argument('--batchsize', 52 | type=int, 53 | default=32) 54 | parser.add_argument('--weight_decay', 55 | type=float, 56 | default=0.0001) 57 | parser.add_argument('--cv', 58 | action='store_true', 59 | default=False) 60 | parser.add_argument('--layer', 61 | type=int, 62 | default=1) 63 | parser.add_argument('--genre_unit', 64 | type=int, 65 | default=5) 66 | parser.add_argument('--dropout', 67 | type=float, 68 | default=0.2) 69 | parser.add_argument('--num_class', 70 | type=int, 71 | default=3) 72 | parser.add_argument('--word_embedding', 73 | choices=const.WORD2VEC_TYPES, 74 | default=const.WORD2VEC_UPDATE) 75 | 76 | parser.add_argument('--objective', 77 | choices=['conversion', 'cvr', 'click'], 78 | nargs='+', 79 | default=['conversion']) 80 | 81 | return parser.parse_args() 82 | 
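As a reading aid (not part of the repository), the options above map onto the model construction in `src/train_deep.py` roughly as follows. This is a minimal sketch that assumes the pretrained word2vec file from `util/const.py` is present and that the modules referenced but not included in this snapshot (`models/base_embedding_layer.py`, `models/mlp.py`, `util/preprocess.py`) are importable; `n_genre` is a placeholder for the value derived from the fitted `GenreTransformer`.

```python
# Sketch only: mirrors how src/train_deep.py turns parsed options into a model.
from gensim.models import KeyedVectors

from models import ARCHS, MODEL_WRAPPERS
from util import const
from util.args import parse_train_args

args = parse_train_args()  # e.g. --arch gru_attn_cond --training_type multi_regression

w2v = KeyedVectors.load_word2vec_format(str(const.PRETRAINED_WORD2VEC_FPATH), binary=True)

encoder = ARCHS[args.arch](
    n_layers=args.layer,
    n_vocab=len(w2v.index2word),
    n_genre=18,  # placeholder: len(vectorizer.named_steps.genre.le.classes_) in the real pipeline
    pretrained_w2v=w2v,
    is_update_w2v=(args.word_embedding == const.WORD2VEC_UPDATE),
    dropout=args.dropout)

model = MODEL_WRAPPERS[args.training_type](encoder=encoder, dropout=args.dropout)
```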
-------------------------------------------------------------------------------- /src/util/const.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | ROOT_DIR = pathlib.Path(__file__).parents[2] 4 | DATA_DIR = ROOT_DIR / 'data' 5 | 6 | TRAIN_DATA_FNAME = 'train.csv' 7 | TRAIN_DATA_FPATH = DATA_DIR / TRAIN_DATA_FNAME 8 | 9 | PRETRAINED_WORD2VEC_FNAME = 'entity_vector.model.bin' 10 | PRETRAINED_WORD2VEC_FPATH = DATA_DIR / PRETRAINED_WORD2VEC_FNAME 11 | 12 | NEOLOGD_DIR = '/var/lib/mecab/dic/mecab-ipadic-neologd' 13 | 14 | DATASET_FROM_DATE = '2017-08-01' 15 | DATASET_TO_DATE = '2018-08-01' 16 | 17 | PREPROCESS_GENDER_TARGET_NUM = 3 18 | 19 | NORMALIZED_IMPRESSION = 30000 20 | 21 | TRAINING_TYPES_REGRESSION = 'regression' 22 | TRAINING_TYPES_MULTI_REGRESSION = 'multi_regression' 23 | TRAINING_TYPES = [ 24 | TRAINING_TYPES_REGRESSION, 25 | TRAINING_TYPES_MULTI_REGRESSION, 26 | ] 27 | 28 | WORD2VEC_UPDATE = 'word2vec_update' 29 | WORD2VEC_FREEZE = 'word2vec_freeze' 30 | FROM_SCRATCH = 'scratch' 31 | WORD2VEC_TYPES = [ 32 | WORD2VEC_UPDATE, 33 | WORD2VEC_FREEZE, 34 | FROM_SCRATCH, 35 | ] 36 | 37 | EVALUATION_METRIC = { 38 | TRAINING_TYPES_REGRESSION: [ 39 | 'MSE_CV', 'MSE_gt_1', 'MAP_0', 'MAP_10', 'NDCG_CV', 40 | 'MSE_CV_top_50', 'MSE_CV_top_25', 'MSE_CV_top_10', 'MSE_CV_top_5', 'MSE_CV_top_1', 41 | 'NDCG_CV_top_50', 'NDCG_CV_top_25', 'NDCG_CV_top_10', 'NDCG_CV_top_5', 'NDCG_CV_top_1', 42 | ], 43 | TRAINING_TYPES_MULTI_REGRESSION: [ 44 | 'MSE_click', 'MSE_CV', 'MSE_multi', 'MSE_CVR', 'MAP_0', 'MAP_10', 'MAP_CVR', 'NDCG_CV', 'NDCG_CVR', 45 | 'MSE_CV_top_50', 'MSE_CV_top_25', 'MSE_CV_top_10', 'MSE_CV_top_5', 'MSE_CV_top_1', 46 | 'NDCG_CV_top_50', 'NDCG_CV_top_25', 'NDCG_CV_top_10', 'NDCG_CV_top_5', 'NDCG_CV_top_1', 47 | ], 48 | } 49 | -------------------------------------------------------------------------------- /src/util/cross_validation.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold 2 | 3 | 4 | def kfold_iter( 5 | X, y, n_splits, 6 | random_state, 7 | is_campaign_group, 8 | campaign_ids=None, 9 | training_type='classification'): 10 | 11 | if is_campaign_group and campaign_ids is not None: 12 | return GroupKFold(n_splits=n_splits).split(X, y, campaign_ids) 13 | 14 | if training_type == 'classification': 15 | return StratifiedKFold(n_splits=n_splits, 16 | random_state=random_state, 17 | shuffle=True).split(X, y) 18 | elif training_type in ('regression', 'multi_regression'): 19 | return KFold(n_splits=n_splits, 20 | random_state=random_state, 21 | shuffle=True).split(X, y) 22 | else: 23 | raise ValueError('Invalid training type: {}'.format(training_type)) 24 | -------------------------------------------------------------------------------- /src/util/extensions.py: -------------------------------------------------------------------------------- 1 | from chainer.training import triggers 2 | 3 | 4 | def setup_record_trigger(training_type): 5 | 6 | if training_type == 'regression' or training_type == 'multi_regression': 7 | return triggers.MinValueTrigger('validation/main/loss') 8 | 9 | else: 10 | raise ValueError('Invalid training type: {}'.format(training_type)) 11 | 12 | 13 | def setup_optim_trigger(training_type): 14 | 15 | if training_type == 'regression' or training_type == 'multi_regression': 16 | return triggers.MinValueTrigger('validation/main/loss') 17 | 18 | else: 19 | raise ValueError('Invalid training type: 
{}'.format(training_type)) 20 | 21 | 22 | def setup_print_report_entries(training_type): 23 | 24 | if training_type == 'regression': 25 | entries = [ 26 | 'epoch', 27 | 'main/loss', 'validation/main/loss', 28 | 'elapsed_time', 'lr', 29 | ] 30 | elif training_type == 'multi_regression': 31 | entries = [ 32 | 'epoch', 33 | 'main/loss', 'validation/main/loss', 34 | 'main/loss_click', 'validation/main/loss_click', 35 | 'main/loss_cv', 'validation/main/loss_cv', 36 | 'elapsed_time', 'lr', 37 | ] 38 | else: 39 | raise ValueError('Invalid training type: {}'.format(training_type)) 40 | 41 | return entries 42 | 43 | 44 | def setup_plot_report_loss_entries(training_type): 45 | 46 | if training_type == 'classification' or training_type == 'regression': 47 | entries = ['main/loss', 'val/main/loss'] 48 | 49 | elif training_type == 'multi_regression': 50 | entries = [ 51 | 'main/loss', 'validation/main/loss', 52 | 'main/loss_click', 'validation/main/loss_click', 53 | 'main/loss_cv', 'validation/main/loss_cv', 54 | ] 55 | else: 56 | raise ValueError('Invalid training type: {}'.format(training_type)) 57 | 58 | return entries 59 | -------------------------------------------------------------------------------- /src/util/load.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def load_data(dataset_fpath): 5 | 6 | df = pd.read_csv(str(dataset_fpath)) 7 | print(f'Raw data size: {df.shape}') 8 | 9 | df['cvr'] = df[['conversion', 'click']].apply( 10 | lambda x: (x[0] / x[1]) * 100 if x[1] > 0 else 0, axis=1) 11 | 12 | return df 13 | -------------------------------------------------------------------------------- /src/util/ndcg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def ndcg_score(y_true, y_pred, k=None, powered=False): 5 | def dcg(scores, k=None, powered=False): 6 | if k is None: 7 | k = scores.shape[0] 8 | if not powered: 9 | ret = scores[0] 10 | for i in range(1, k): 11 | ret += scores[i] / np.log2(i + 1) 12 | return ret 13 | else: 14 | ret = 0 15 | for i in range(k): 16 | ret += (2 ** scores[i] - 1) / np.log2(i + 2) 17 | return ret 18 | 19 | ideal_sorted_scores = np.sort(y_true)[::-1] 20 | ideal_dcg_score = dcg(ideal_sorted_scores, k=k, powered=powered) 21 | 22 | pred_sorted_ind = np.argsort(y_pred)[::-1] 23 | pred_sorted_scores = y_true[pred_sorted_ind] 24 | dcg_score = dcg(pred_sorted_scores, k=k, powered=powered) 25 | 26 | return dcg_score / ideal_dcg_score 27 | 28 | 29 | def ndcg1_score(y_true, y_pred, k=None): 30 | return ndcg_score(y_true, y_pred, k=k, powered=False) 31 | -------------------------------------------------------------------------------- /src/util/nlp_utils.py: -------------------------------------------------------------------------------- 1 | import chainer.functions as F 2 | import numpy as np 3 | 4 | 5 | def sequence_embed(embed, xs, dropout=0.): 6 | 7 | x_len = [len(x) for x in xs] 8 | x_section = np.cumsum(x_len[:-1]) 9 | 10 | ex = embed(F.concat(xs, axis=0)) 11 | ex = F.dropout(ex, ratio=dropout) 12 | exs = F.split_axis(ex, x_section, 0) 13 | return exs 14 | 15 | 16 | def block_embed(embed, x, dropout=0.): 17 | 18 | e = embed(x) 19 | e = F.dropout(e, ratio=dropout) 20 | e = F.transpose(e, (0, 2, 1)) 21 | e = e[:, :, :, None] 22 | return e 23 | -------------------------------------------------------------------------------- /src/util/preprocessed_dataset.py: 
-------------------------------------------------------------------------------- 1 | import chainer 2 | import numpy as np 3 | from chainer import cuda 4 | from sklearn.pipeline import Pipeline 5 | 6 | from util import const 7 | from util.transforms import ( 8 | GenderTransformer, 9 | GenreTransformer, 10 | LimitDataTransformer, 11 | MinMaxScaleTransformer, 12 | ToLogarithmTransformer, 13 | TypeConvertTransformer, 14 | Word2VecTransformer 15 | ) 16 | 17 | 18 | def prepare_vectorizer(pretrain_w2v, 19 | training_type, 20 | norm_imp=None, 21 | is_impnorm=False, 22 | is_logarithm=False, 23 | ): 24 | steps = [ 25 | ('lmit', LimitDataTransformer()), 26 | ('genre', GenreTransformer()), 27 | ('gender', GenderTransformer()), 28 | ('w2v', Word2VecTransformer( 29 | columns=['title_text', 'content_text'], 30 | pretrain_w2v=pretrain_w2v)), 31 | ] 32 | 33 | if is_logarithm: 34 | steps.extend([ 35 | ('log_click', ToLogarithmTransformer(column='click')), 36 | ('loc_cv', ToLogarithmTransformer(column='conversion')), 37 | ]) 38 | 39 | if training_type in [const.TRAINING_TYPES_REGRESSION, 40 | const.TRAINING_TYPES_MULTI_REGRESSION]: 41 | if is_impnorm: 42 | click_col = 'imp_norm_click' 43 | cv_col = 'imp_norm_conversion' 44 | else: 45 | click_col = 'click' 46 | cv_col = 'conversion' 47 | 48 | scaler = [ 49 | ('mm_click', MinMaxScaleTransformer(column=click_col)), 50 | ('mm_conversion', MinMaxScaleTransformer(column=cv_col)), 51 | ('mm_cvr', MinMaxScaleTransformer(column='cvr')), 52 | ] 53 | steps.extend(scaler) 54 | 55 | steps.extend([ 56 | ('type_convert', TypeConvertTransformer( 57 | columns=['product_id', 'genre'], 58 | dtype=np.int32)), 59 | ]) 60 | 61 | return Pipeline(steps) 62 | 63 | 64 | class PreprocessedDataset(chainer.dataset.DatasetMixin): 65 | 66 | def __init__(self, df, 67 | training_type, 68 | output_cols): 69 | 70 | self.X = df.drop(output_cols, axis=1) 71 | self.y = df[output_cols] 72 | 73 | self.training_type = training_type 74 | 75 | def __len__(self): 76 | return len(self.X) 77 | 78 | def get_example(self, i): 79 | 80 | X = self.X.iloc[i] 81 | y = self.y.iloc[i] 82 | 83 | X = X.to_dict() 84 | y = y.values.astype(np.float32) 85 | 86 | return X, y 87 | 88 | 89 | def convert_seq(batch, device=None, with_label=True): 90 | 91 | def to_device_batch(batch): 92 | if device is None: 93 | return batch 94 | elif device < 0: 95 | return [chainer.dataset.to_device(device, x) for x in batch] 96 | else: 97 | xp = cuda.cupy.get_array_module(*batch) 98 | concat = xp.concatenate(batch, axis=0) 99 | sections = np.cumsum([len(x) 100 | for x in batch[:-1]], dtype=np.int32) 101 | concat_dev = chainer.dataset.to_device(device, concat) 102 | batch_dev = cuda.cupy.split(concat_dev, sections) 103 | return batch_dev 104 | 105 | if with_label: 106 | return { 107 | 'product_xs': to_device_batch([[x['product_id']] for x, _ in batch]), 108 | 'genre_xs': to_device_batch([np.asarray([x['genre']]) for x, _ in batch]), 109 | 'gender_xs': to_device_batch([x['gender_target'] for x, _ in batch]), 110 | 'title_xs': to_device_batch([x['title_text'] for x, _ in batch]), 111 | 'content_xs': to_device_batch([x['content_text'] for x, _ in batch]), 112 | 'ys': to_device_batch([y for _, y in batch]), 113 | } 114 | 115 | else: 116 | return to_device_batch([x for x in batch]) 117 | -------------------------------------------------------------------------------- /src/util/resource.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import pathlib 4 | import 
socket 5 | import sys 6 | 7 | import logzero 8 | 9 | from util import const 10 | 11 | 12 | class Resource(object): 13 | 14 | def __init__(self, args, train=True): 15 | self.args = args 16 | self.start_time = datetime.datetime.now() 17 | self.logger = logzero.setup_default_logger() 18 | 19 | # test=False and train=False 20 | if not args.test and train: 21 | self.logger.warn('Test option is {}'.format(args.test)) 22 | 23 | # setup experiment directory 24 | self.output_dir = self._setup_output_dir() 25 | self.log_dir = self._setup_log_dir() 26 | 27 | if train: 28 | self.fig_dir = self._setup_fig_dir() 29 | log_filename = '{}_train.log'.format(self.sdtime) 30 | else: 31 | log_filename = '{}_inference.log'.format(self.sdtime) 32 | 33 | log_name = self.log_dir / log_filename 34 | 35 | logzero.logfile(str(log_name), loglevel=logging.INFO) 36 | 37 | self.log_name = log_name 38 | self.logger.info('Log filename: {}'.format(str(log_name))) 39 | self.logger.info('Server name: {}'.format(socket.gethostname())) 40 | self.dump_common_info() 41 | 42 | @property 43 | def stime(self): 44 | return self.start_time.strftime('%Y-%m-%d-%H-%M-%S') 45 | 46 | @property 47 | def sdtime(self): 48 | return self.start_time.strftime('%H-%M-%S') 49 | 50 | @property 51 | def sytime(self): 52 | return self.start_time.strftime('%Y-%m-%d') 53 | 54 | @property 55 | def duration(self): 56 | end_time = datetime.datetime.now() 57 | duration = end_time - self.start_time 58 | return duration 59 | 60 | def logdebug(self, msg): 61 | self.logger.debug(msg) 62 | 63 | def loginfo(self, msg): 64 | self.logger.info(msg) 65 | 66 | def _setup_output_dir(self): 67 | output_dir = pathlib.Path(self.args.out) / self.sytime 68 | 69 | if not output_dir.exists(): 70 | output_dir.mkdir(parents=True) 71 | self.logger.debug('Output dir is created at [{}]'.format(str(output_dir))) 72 | else: 73 | self.logger.debug('Output dir is already exists.'.format(str(output_dir))) 74 | 75 | return output_dir 76 | 77 | def _setup_log_dir(self): 78 | log_dir = self.output_dir / 'log' 79 | 80 | if not log_dir.exists(): 81 | log_dir.mkdir() 82 | self.logger.debug('Log dir is created at [{}]'.format(str(log_dir))) 83 | else: 84 | self.logger.debug('Log dir is already exists.'.format(str(log_dir))) 85 | 86 | return log_dir 87 | 88 | def _setup_fig_dir(self): 89 | fig_dir = self.output_dir / 'fig' 90 | 91 | self.fig_acc_dir = fig_dir / 'accuracy' 92 | self.fig_loss_dir = fig_dir / 'loss' 93 | self.fig_heatmap_dir = fig_dir / 'heatmap' 94 | self.fig_coef_dir = fig_dir / 'coef' 95 | 96 | if not fig_dir.exists(): 97 | self.logger.debug('Fig dir is created at [{}]'.format(str(fig_dir))) 98 | fig_dir.mkdir() 99 | # also make directories 100 | self.fig_acc_dir.mkdir() 101 | self.fig_loss_dir.mkdir() 102 | self.fig_heatmap_dir.mkdir() 103 | self.fig_coef_dir.mkdir() 104 | else: 105 | self.logger.debug('Fig dir is already exists.'.format(str(fig_dir))) 106 | 107 | return fig_dir 108 | 109 | def dump_common_info(self): 110 | logger = logzero.setup_logger( 111 | name='preprocess', 112 | logfile=str(self.log_dir / f'{self.sdtime}_preprocess.log')) 113 | 114 | logger.info('=== Common informations ===') 115 | logger.info('Model: {}'.format(self.args.arch)) 116 | logger.info('Word embedding: {}'.format(self.args.word_embedding)) 117 | logger.info('Training type: {}'.format(self.args.training_type)) 118 | 119 | logger.info('# of epoch: {}'.format(self.args.epoch)) 120 | logger.info('# of batchsize: {}'.format(self.args.batchsize)) 121 | logger.info('# of encoder layers: 
{}'.format(self.args.layer)) 122 | logger.info('# of genre embedding dim: {}'.format(self.args.genre_unit)) 123 | logger.info('Dropout ratio: {}'.format(self.args.dropout)) 124 | logger.info('Weight decay: {}'.format(self.args.weight_decay)) 125 | logger.info('GPU ID: {}'.format(self.args.gpu)) 126 | logger.info('Cross validation: {}, # of folds: {}'.format(self.args.cv, self.args.fold)) 127 | logger.info('Seed: {}'.format(self.args.seed)) 128 | logger.info('GroupKFold: {}'.format(self.args.group)) 129 | logger.info('Apply impression normalization: {}'.format(self.args.imp_norm)) 130 | logger.info('Target objective: {}'.format(self.args.objective)) 131 | 132 | def dump_duration(self): 133 | logger = logzero.setup_logger( 134 | name='test', 135 | logfile=str(self.log_dir / f'{self.sdtime}_test.log') 136 | ) 137 | end_time = datetime.datetime.now() 138 | logger.info('Exit time: {}'.format(end_time.strftime('%Y/%m/%d - %H:%M:%S'))) 139 | logger.info('Duration: {}'.format(self.duration)) 140 | logger.info('Remember: log is saved to {}'.format(str(self.log_name))) 141 | 142 | def dump_command_info(self): 143 | logger = logzero.setup_logger( 144 | name='test', 145 | logfile=str(self.log_dir / f'{self.sdtime}_test.log') 146 | ) 147 | logger.info('Command name: {}'.format(' '.join(sys.argv))) 148 | -------------------------------------------------------------------------------- /src/util/seed.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import chainer 4 | import numpy as np 5 | 6 | 7 | def reset_seed(seed): 8 | 9 | random.seed(seed) 10 | np.random.seed(seed) 11 | 12 | if chainer.cuda.available: 13 | chainer.cuda.cupy.random.seed(seed) 14 | -------------------------------------------------------------------------------- /src/util/transforms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.base import BaseEstimator, TransformerMixin 4 | from sklearn.preprocessing import LabelBinarizer, LabelEncoder, MinMaxScaler 5 | 6 | from util.preprocess import wakati 7 | 8 | 9 | class LimitDataTransformer(BaseEstimator, TransformerMixin): 10 | 11 | def __init__(self, num_imp): 12 | self.num_imp = num_imp 13 | 14 | def fit(self, X, *args): 15 | return self 16 | 17 | def transform(self, df): 18 | df = df[df['impression'] > self.num_imp] 19 | return df 20 | 21 | 22 | class WakatiTransformer(BaseEstimator, TransformerMixin): 23 | 24 | def __init__(self, column, wakati_func=wakati): 25 | self.column = column 26 | self.wakati_func = wakati 27 | 28 | def fit(self, X, *args): 29 | return self 30 | 31 | def transform(self, X): 32 | X[self.column] = X[self.column].apply(self.wakati_func) 33 | return X 34 | 35 | 36 | class Word2VecTransformer(BaseEstimator, TransformerMixin): 37 | 38 | def __init__(self, columns, pretrain_w2v): 39 | self.columns = columns 40 | self.w2v = pretrain_w2v 41 | self.w2i = {w: i for i, w in enumerate(pretrain_w2v.index2word)} 42 | 43 | def fit(self, X, *args): 44 | return self 45 | 46 | def word2id(self, words): 47 | return np.asarray([ 48 | self.w2i[w] for w in words 49 | if w in self.w2v.vocab], dtype=np.int32) 50 | 51 | def transform(self, X): 52 | for col in self.columns: 53 | X[col] = X[col].apply(self.word2id) 54 | return X 55 | 56 | 57 | class GenreTransformer(BaseEstimator, TransformerMixin): 58 | 59 | def __init__(self): 60 | self.le = None 61 | 62 | def fit(self, X, *args): 63 | self.le = LabelEncoder().fit(X['genre']) 64 | 
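        # The LabelEncoder fitted here defines the genre vocabulary; transform() below maps each
        # genre to the integer id consumed by the genre embedding in the encoders.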
return self 65 | 66 | def transform(self, X): 67 | X['genre'] = self.le.transform(X['genre']) 68 | return X 69 | 70 | 71 | class GenderTransformer(BaseEstimator, TransformerMixin): 72 | 73 | def __init__(self): 74 | self.lb = None 75 | 76 | def fit(self, X, *args): 77 | self.lb = LabelBinarizer().fit(X['gender_target']) 78 | return self 79 | 80 | def transform(self, X): 81 | X['gender_target'] = self.lb.transform(X['gender_target']).tolist() 82 | X['gender_target'] = X['gender_target'].apply( 83 | lambda x: np.asarray(x, dtype=np.float32)) 84 | return X 85 | 86 | 87 | class MinMaxScaleTransformer(BaseEstimator, TransformerMixin): 88 | 89 | def __init__(self, column): 90 | self.column = column 91 | self.mm = None 92 | 93 | def fit(self, X, *args): 94 | self.mm = MinMaxScaler().fit(X[[self.column]]) 95 | return self 96 | 97 | def transform(self, X): 98 | X[self.column] = self.mm.transform(X[[self.column]]) 99 | return X 100 | 101 | 102 | class ToLogarithmTransformer(BaseEstimator, TransformerMixin): 103 | 104 | def __init__(self, column): 105 | self.column = column 106 | 107 | def fit(self, X, *args): 108 | return self 109 | 110 | def transform(self, X): 111 | X[self.column] = np.log1p(X[self.column]) 112 | return X 113 | 114 | def inverse_transform(self, X): 115 | X[self.column] = np.expm1(X[self.column]) 116 | return X 117 | 118 | 119 | class TypeConvertTransformer(BaseEstimator, TransformerMixin): 120 | 121 | def __init__(self, columns, dtype): 122 | self.columns = columns 123 | self.dtype = dtype 124 | 125 | def fit(self, X, *args): 126 | return self 127 | 128 | def transform(self, X): 129 | for col in self.columns: 130 | X[col] = X[col].astype(self.dtype) 131 | return X 132 | --------------------------------------------------------------------------------
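To make the target preprocessing above concrete, here is a self-contained toy example (plain pandas/scikit-learn rather than the repository's transformers, with made-up numbers) of what `ToLogarithmTransformer` followed by `MinMaxScaleTransformer` do to the click and conversion targets:

```python
# Toy illustration of the target preprocessing: log1p followed by min-max scaling.
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

df = pd.DataFrame({'click': [0, 10, 250, 4000], 'conversion': [0, 1, 12, 90]})

for col in ('click', 'conversion'):
    df[col] = np.log1p(df[col])                                # ToLogarithmTransformer
    df[col] = MinMaxScaler().fit_transform(df[[col]]).ravel()  # MinMaxScaleTransformer

print(df)  # both targets now lie in [0, 1], matching the sigmoid outputs of Regressor / MultiTaskRegressor
```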