├── cache
│   └── .keepme
├── data
│   └── .keepme
├── quest
│   ├── __init__.py
│   ├── metrics.py
│   ├── post_processing.py
│   ├── train_folds.py
│   ├── dataset.py
│   ├── eval_tpu.py
│   ├── eval.py
│   ├── inference.py
│   ├── models.py
│   ├── prepare_reference_data.py
│   ├── train.py
│   └── prepare_tfrecords.py
├── imgs
│   └── submission.png
├── requirements.txt
├── tf-helper-bot
│   ├── README.md
│   ├── tf_helper_bot
│   │   ├── __init__.py
│   │   ├── utils.py
│   │   ├── logger.py
│   │   ├── mixup.py
│   │   ├── metrics.py
│   │   ├── lr_schedulers.py
│   │   ├── callbacks.py
│   │   ├── optimizers.py
│   │   └── bot.py
│   ├── .gitignore
│   ├── setup.py
│   └── LICENSE
├── .gitignore
├── setup.py
├── LICENSE
└── README.md
/cache/.keepme:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/data/.keepme:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/quest/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/imgs/submission.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/kaggle-quest/master/imgs/submission.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | fire
2 | joblib
3 | transformers==2.3.0
4 | python-telegram-bot
5 | tensorflow>=2.1.0
--------------------------------------------------------------------------------
/tf-helper-bot/README.md:
--------------------------------------------------------------------------------
1 | # TF Helper Bot (WIP)
2 | 
3 | Write powerful custom training loops for TensorFlow 2.x with less code.
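A minimal usage sketch, pieced together from how `quest/train.py` in this repository drives the bot — the model, loss function, optimizer, and `tf.data` datasets below are placeholders you supply yourself, and the keyword defaults are assumptions:

```python
from tf_helper_bot import (
    BaseBot, MovingAverageStatsTrackerCallback, CheckpointCallback, AUC
)

checkpoints = CheckpointCallback(
    keep_n_checkpoints=1,
    checkpoint_dir="cache/model_cache/",
    monitor_metric="auc"  # matches the `name` attribute of the AUC metric
)
bot = BaseBot(
    model=model,                # your tf.keras.Model
    criterion=loss_fn,          # your loss callable
    optimizer=optimizer,        # your tf.keras optimizer
    train_dataset=train_ds,     # your tf.data datasets
    valid_dataset=valid_ds,
    steps_per_epoch=train_steps,
    gradient_accumulation_steps=1,
    callbacks=[
        MovingAverageStatsTrackerCallback(avg_window=250, log_interval=200),
        checkpoints,
    ],
    metrics=(AUC(),),
    valid_steps=valid_steps,
)
bot.train(n_steps=1000, checkpoint_interval=500)
```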
4 | 5 | (This is basically a TF port of [pytorch-helper-bot](https://github.com/ceshine/pytorch-helper-bot)) 6 | -------------------------------------------------------------------------------- /tf-helper-bot/tf_helper_bot/__init__.py: -------------------------------------------------------------------------------- 1 | from .bot import BaseBot, BaseDistributedBot 2 | from .callbacks import * 3 | from .logger import Logger 4 | from .metrics import (Metric, FBeta, AUC) 5 | from .lr_schedulers import CosineDecayWithWarmup 6 | -------------------------------------------------------------------------------- /tf-helper-bot/.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | *# 3 | *~ 4 | cache 5 | __pycache__ 6 | .dir-locals.el 7 | .idea/ 8 | .vscode/ 9 | .ipynb_checkpoints/ 10 | *.7z 11 | *.html 12 | *.gz 13 | *.out 14 | runs/ 15 | data/ 16 | plots 17 | *.zip 18 | .mypy_cache 19 | pylintrc 20 | *.egg-info/ 21 | .cache/ 22 | core 23 | .nv/ 24 | .bash_history 25 | data 26 | wandb/ 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | 7 | # submission file 8 | *.csv 9 | 10 | # pyCharm files 11 | .idea/ 12 | 13 | .mypy_cache/ 14 | .vscode/ 15 | 16 | *.7z 17 | *.zip 18 | 19 | bot 20 | input 21 | 22 | data/ 23 | notebooks/.ipynb_checkpoints/ 24 | docs/ 25 | build/ 26 | dist/ 27 | *.egg-info/ 28 | 29 | cache/ 30 | logs/ 31 | 32 | env.fish -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="quest", 5 | version='0.0.1', 6 | author="Ceshine Lee", 7 | author_email="ceshine@ceshine.net", 8 | description="", 9 | license="MIT", 10 | url="", 11 | packages=["quest"], 12 | install_requires=[], 13 | classifiers=[ 14 | "Development Status :: 4 - Beta", 15 | "Intended Audience :: Science/Research", 16 | "Programming Language :: Python :: 3.6", 17 | "Topic :: Scientific/Engineering :: Artificial Intelligence" 18 | ], 19 | keywords="" 20 | ) 21 | -------------------------------------------------------------------------------- /tf-helper-bot/tf_helper_bot/utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def prepare_tpu(): 5 | try: 6 | tpu = tf.distribute.cluster_resolver.TPUClusterResolver() # TPU detection 7 | print('Running on TPU ', tpu.cluster_spec().as_dict()['worker']) 8 | except ValueError: 9 | tpu = None 10 | strategy = tf.distribute.get_strategy() 11 | if tpu: 12 | tf.config.experimental_connect_to_cluster(tpu) 13 | tf.tpu.experimental.initialize_tpu_system(tpu) 14 | strategy = tf.distribute.experimental.TPUStrategy(tpu) 15 | return strategy, tpu 16 | -------------------------------------------------------------------------------- /quest/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sklearn.metrics 3 | from scipy.stats import spearmanr 4 | from scipy.special import expit 5 | from tf_helper_bot import Metric 6 | 7 | 8 | class SpearmanCorr(Metric): 9 | name = "spearman" 10 | 11 | def __init__(self, add_sigmoid: bool = False): 12 | self.add_sigmoid = add_sigmoid 13 | 14 | def 
__call__(self, truth: np.ndarray, pred: np.ndarray): 15 | if self.add_sigmoid: 16 | pred = expit(pred) 17 | corrs = [] 18 | for i in range(pred.shape[1]): 19 | if len(np.unique(truth[:, i])) == 1: 20 | continue 21 | corrs.append( 22 | spearmanr( 23 | truth[:, i], 24 | pred[:, i] 25 | ).correlation 26 | 27 | ) 28 | score = np.mean(corrs) 29 | return score * -1, f"{score * 100:.2f}" 30 | -------------------------------------------------------------------------------- /tf-helper-bot/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='tf_helper_bot', 5 | version='0.0.1', 6 | packages=['tf_helper_bot'], 7 | install_requires=[], 8 | classifiers=[ # Optional 9 | # How mature is this project? Common values are 10 | # 3 - Alpha 11 | # 4 - Beta 12 | # 5 - Production/Stable 13 | 'Development Status :: 3 - Alpha', 14 | 15 | # Indicate who your project is intended for 16 | 'Intended Audience :: Developers', 17 | 18 | # Pick your license as you wish 19 | 'License :: OSI Approved :: MIT License', 20 | 21 | # Specify the Python versions you support here. In particular, ensure 22 | # that you indicate whether you support Python 2, Python 3 or both. 23 | 'Programming Language :: Python :: 3.7', 24 | 'Programming Language :: Python :: 3.8' 25 | ], 26 | ) 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2020 Ceshine Lee 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /tf-helper-bot/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2020 Ceshine Lee 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
6 | 
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8 | 
--------------------------------------------------------------------------------
/quest/post_processing.py:
--------------------------------------------------------------------------------
1 | import copy
2 | 
3 | import numpy as np
4 | from scipy.stats import spearmanr
5 | from sklearn.preprocessing import MinMaxScaler
6 | 
7 | 
8 | def prevent_nan(pred):
9 |     for i in range(pred.shape[1]):
10 |         if len(np.unique(pred[:, i])) == 1:
11 |             pred[0, i] = np.random.rand()
12 |             pred[-1, i] = np.random.rand()
13 |     return pred
14 | 
15 | 
16 | def find_best_bins(y_true, y_pred):
17 |     scaler = MinMaxScaler()
18 |     y_pred = scaler.fit_transform(y_pred)
19 |     y = np.copy(y_pred)
20 |     list_of_bins = []
21 |     for i in range(y_pred.shape[1]):
22 |         best_score = 0  # initialize the score for column i
23 |         best_bins = 1
24 |         history_score = []
25 |         for max_voters in range(2, 200):
26 |             y[:, i] = np.round(
27 |                 y_pred[:, i] * max_voters
28 |             ) / max_voters
29 |             y[:, i] = prevent_nan(y[:, i:i+1])[:, 0]
30 |             score = spearmanr(y_true[:, i], y[:, i]).correlation
31 |             history_score.append(score)
32 |             if score > best_score:
33 |                 best_score = score
34 |                 best_bins = max_voters
35 |         list_of_bins.append(best_bins)
36 |         y[:, i] = np.round(y_pred[:, i] * best_bins) / best_bins
37 |     return np.mean([
38 |         spearmanr(y_true[:, ind], y[:, ind]).correlation
39 |         for ind in range(y.shape[1])
40 |     ]), list_of_bins, scaler
--------------------------------------------------------------------------------
/tf-helper-bot/tf_helper_bot/logger.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | import logging
3 | from pathlib import Path
4 | 
5 | 
6 | class Logger:
7 |     def __init__(self, model_name, log_dir: Path, level=logging.INFO, echo=False):
8 |         self.model_name = model_name
9 |         (log_dir / "summaries").mkdir(parents=True, exist_ok=True)
10 |         date_str = datetime.now().strftime('%Y%m%d_%H%M')
11 |         log_file = 'log_{}.txt'.format(date_str)
12 |         formatter = logging.Formatter(
13 |             '[%(levelname)s][%(asctime)s] %(message)s',
14 |             datefmt='%m/%d/%Y %H:%M:%S'
15 |         )
16 |         self.logger = logging.getLogger("bot")
17 |         # Remove all existing handlers
18 |         self.logger.handlers = []
19 |         # Initialize handlers
20 |         fh = logging.FileHandler(log_dir / log_file)
21 |         fh.setFormatter(formatter)
22 |         self.logger.addHandler(fh)
23 |         if echo:
24 |             sh = logging.StreamHandler()
25 |             sh.setFormatter(formatter)
26 |             self.logger.addHandler(sh)
27 |         self.logger.setLevel(level)
28 |         self.logger.propagate = False
29 | 
30 |     def info(self, msg, *args):
31 |         self.logger.info(msg, *args)
32 | 
33 |     def warning(self, msg, *args):
34 |         self.logger.warning(msg, *args)
35 | 
36 |     def debug(self, msg, *args):
37 |         self.logger.debug(msg, *args)
38 | 
39 |     def error(self, msg, *args):
40 |         self.logger.error(msg, *args)
41 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # TPU-Ready TF 2.1 Solution to Google QUEST Q&A Labeling Using a Siamese RoBERTa Encoder Model
2 | 
3 | The 5-fold models can be trained in about an hour on a Colab TPU. The model performance after post-processing the predictions (to optimize the Spearman correlation with the target):
4 | 
5 | ![Submission Score](imgs/submission.png)
6 | 
7 | This lands at around 65th place on the private leaderboard. The post-processing (which unfortunately I did not use during the competition) gives a score boost of almost 0.03.
8 | 
9 | [Inference Kernel on Kaggle](https://www.kaggle.com/ceshine/quest-roberta-inference?scriptVersionId=28553401)
10 | 
11 | ## Train on Colab TPU
12 | 
13 | [The notebook](https://gist.github.com/ceshine/752c77742973a013320a9f20384528a1) used to generate the above submission is on GitHub Gist, and can be opened in Colab.
14 | 
15 | ### Preparation
16 | 
17 | #### Build the wheels
18 | 
19 | Run this command in the project root directory and in the `tf-helper-bot` subdirectory:
20 | 
21 | `python setup.py sdist bdist_wheel`
22 | 
23 | Then upload the `.whl` files from both `dist` directories to Google Cloud Storage.
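24 | 
25 | For example, assuming the [Cloud SDK](https://cloud.google.com/sdk) is installed and authenticated (the bucket path below is a placeholder — use your own):
26 | 
27 | `gsutil cp dist/*.whl tf-helper-bot/dist/*.whl gs://<your-bucket>/wheels/`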
28 | 
29 | #### Create the TFRecord files
30 | 
31 | Run this command and then upload the content of `cache/tfrecords` to Google Cloud Storage:
32 | 
33 | `python -m quest.prepare_tfrecords --model-name roberta-base --n-folds 5`
34 | 
35 | (Note: check [requirements.txt](requirements.txt) for missing dependencies.)
36 | 
37 | ## Acknowledgements
38 | 
39 | Some of the TPU resources used in this project were generously sponsored by the [TensorFlow Research Cloud](https://www.tensorflow.org/tfrc).
40 | 
--------------------------------------------------------------------------------
/tf-helper-bot/tf_helper_bot/mixup.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import tensorflow_probability as tfp
3 | 
4 | 
5 | def mixup_augment(alpha: float):
6 |     """
7 |     Adapted from:
8 |     https://github.com/tensorpack/tensorpack/blob/master/examples/ResNet/cifar10-preact18-mixup.py
9 |     """
10 |     dist = tfp.distributions.Beta(alpha, alpha)
11 | 
12 |     def _mixup_augment(images, labels):
13 |         batch_size = tf.shape(images)[0]
14 |         lambd = dist.sample([batch_size])
15 |         lambd = tf.math.reduce_max(
16 |             tf.stack([lambd, 1-lambd]), axis=0
17 |         )
18 |         lambd = tf.reshape(lambd, [batch_size, 1, 1, 1])
19 |         index = tf.random.shuffle(tf.range(batch_size))
20 |         new_images = images * lambd + tf.gather(images, index) * (1 - lambd)
21 |         return new_images, {"labels_1": labels, "labels_2": tf.gather(labels, index), "lambd": lambd[:, 0, 0, 0]}
22 |     return _mixup_augment
23 | 
24 | 
25 | def mixup_loss_fn(y_true, y_pred):
26 |     if isinstance(y_true, dict):
27 |         loss_1 = tf.keras.losses.sparse_categorical_crossentropy(
28 |             y_true["labels_1"],
29 |             y_pred
30 |         )
31 |         loss_2 = tf.keras.losses.sparse_categorical_crossentropy(
32 |             y_true["labels_2"],
33 |             y_pred
34 |         )
35 |         loss = tf.reduce_mean(
36 |             y_true["lambd"] * loss_1 + (1 - y_true["lambd"]) * loss_2
37 |         )
38 |     else:
39 |         loss = tf.reduce_mean(
40 |             tf.keras.losses.sparse_categorical_crossentropy(
41 |                 y_true,
42 |                 y_pred
43 |             )
44 |         )
45 |     return loss
46 | 
--------------------------------------------------------------------------------
/quest/train_folds.py:
--------------------------------------------------------------------------------
1 | import fire
2 | import tensorflow as tf
3 | import numpy as np
4 | 
5 | from .train import train_model
6 | 
7 | 
8 | def main(
9 |         train_path_pattern: str = "cache/tfrecords/train-%d-*.tfrec",
10 |         valid_path_pattern: str = 
"cache/tfrecords/valid-%d-*.tfrec", 11 | model_name: str = "bert-large-uncased-whole-word-masking", 12 | output_path_pattern: str = "cache/bert-fold-%d/", 13 | batch_size: int = 8, grad_accu: int = 2, 14 | log_interval: int = 200, steps: int = 1000, 15 | checkpoint_interval: int = 500, 16 | min_lr: float = 1e-6, max_lr: float = 3e-5, 17 | n_folds: int = 5, freeze: int = 0 18 | ): 19 | scores = [] 20 | for fold in range(n_folds): 21 | tmp = list(tf.io.gfile.glob(train_path_pattern % fold)) 22 | assert len(tmp) == 1 23 | train_path = tmp[0] 24 | tmp = list(tf.io.gfile.glob(valid_path_pattern % fold)) 25 | assert len(tmp) == 1 26 | valid_path = tmp[0] 27 | output_path = output_path_pattern % fold 28 | print("=" * 20) 29 | print(f"Training Fold {fold+1}") 30 | print("=" * 20) 31 | best_score = train_model( 32 | train_path=train_path, 33 | valid_path=valid_path, 34 | model_name=model_name, 35 | output_path=output_path, 36 | batch_size=batch_size, 37 | grad_accu=grad_accu, 38 | log_interval=log_interval, 39 | steps=steps, 40 | checkpoint_interval=checkpoint_interval, 41 | min_lr=min_lr, 42 | max_lr=max_lr, 43 | freeze=freeze 44 | ) 45 | scores.append(best_score) 46 | print(f"Scores: {np.mean(scores)} +- {np.std(scores)}") 47 | 48 | 49 | if __name__ == '__main__': 50 | fire.Fire(main) 51 | -------------------------------------------------------------------------------- /tf-helper-bot/tf_helper_bot/metrics.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from typing import Tuple, Union 3 | 4 | import numpy as np 5 | # import tensorflow as tf 6 | from sklearn.metrics import fbeta_score, roc_auc_score 7 | from sklearn.exceptions import UndefinedMetricWarning 8 | 9 | 10 | class Metric: 11 | name = "metric" 12 | 13 | def __call__(self, truth: np.ndarray, pred: np.ndarray) -> Tuple[float, str]: 14 | """Calculate the metric from truth and prediction tensors 15 | 16 | Parameters 17 | ---------- 18 | truth : numpy.ndarray 19 | pred : numpy.ndarray 20 | 21 | Returns 22 | ------- 23 | Tuple[float, str] 24 | (metric value(to be minimized), formatted string) 25 | """ 26 | raise NotImplementedError() 27 | 28 | 29 | class FBeta(Metric): 30 | """FBeta for binary targets""" 31 | name = "fbeta" 32 | 33 | def __init__(self, step, beta=2, average="binary"): 34 | self.step = step 35 | self.beta = beta 36 | self.average = average 37 | 38 | def __call__(self, truth: np.ndarray, pred: np.ndarray) -> Tuple[float, str]: 39 | best_fbeta, best_thres = self.find_best_fbeta_threshold( 40 | truth, pred, 41 | step=self.step, beta=self.beta) 42 | return best_fbeta * -1, f"{best_fbeta:.4f} @ {best_thres:.2f}" 43 | 44 | def find_best_fbeta_threshold(self, truth, probs, beta=2, step=0.05): 45 | best, best_thres = 0, -1 46 | with warnings.catch_warnings(): 47 | warnings.simplefilter('ignore', category=UndefinedMetricWarning) 48 | for thres in np.arange(step, 1, step): 49 | current = fbeta_score( 50 | truth, (probs >= thres).astype("int8"), 51 | beta=beta, average=self.average) 52 | if current > best: 53 | best = current 54 | best_thres = thres 55 | return best, best_thres 56 | 57 | 58 | class AUC(Metric): 59 | """AUC for binary targets""" 60 | name = "auc" 61 | 62 | def __call__(self, truth: np.ndarray, pred: np.ndarray) -> Tuple[float, str]: 63 | auc_score = roc_auc_score( 64 | truth.astype("int"), pred) 65 | return auc_score * -1, f"{auc_score * 100:.2f}" 66 | -------------------------------------------------------------------------------- /quest/dataset.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | AUTOTUNE = tf.data.experimental.AUTOTUNE 5 | 6 | 7 | def tfrecord_dataset(filename, batch_size, strategy, is_train: bool = True): 8 | opt = tf.data.Options() 9 | opt.experimental_deterministic = False 10 | 11 | name = filename.split("/")[-1] 12 | max_q_len = int(name.split("-")[3].split(".")[0]) 13 | max_a_len = int(name.split("-")[4].split(".")[0]) 14 | cnt = int(name.split("-")[2]) 15 | 16 | features_description = { 17 | "input_ids_question": tf.io.FixedLenFeature([max_q_len], tf.int64), 18 | "input_mask_question": tf.io.FixedLenFeature([max_q_len], tf.int64), 19 | "input_ids_answer": tf.io.FixedLenFeature([max_a_len], tf.int64), 20 | "input_mask_answer": tf.io.FixedLenFeature([max_a_len], tf.int64), 21 | "labels": tf.io.FixedLenFeature([30], tf.float32), 22 | } 23 | 24 | def _parse_function(example_proto): 25 | # Parse the input `tf.Example` proto using the dictionary above. 26 | example = tf.io.parse_single_example( 27 | example_proto, features_description) 28 | return ( 29 | { 30 | 'input_ids_question': tf.cast(example['input_ids_question'], tf.int32), 31 | 'attention_mask_question': tf.cast(example['input_mask_question'], tf.int32), 32 | 'input_ids_answer': tf.cast(example['input_ids_answer'], tf.int32), 33 | 'attention_mask_answer': tf.cast(example['input_mask_answer'], tf.int32), 34 | }, 35 | example["labels"] 36 | ) 37 | 38 | raw_dataset = tf.data.TFRecordDataset( 39 | filename, num_parallel_reads=4 40 | ).with_options(opt) 41 | dataset = raw_dataset.map( 42 | _parse_function, num_parallel_calls=AUTOTUNE 43 | ).cache() 44 | if is_train: 45 | dataset = dataset.shuffle( 46 | 2048, reshuffle_each_iteration=True 47 | ).repeat() 48 | else: 49 | # usually fewer validation files than workers so disable FILE auto-sharding on validation 50 | # option not useful if there is no sharding (not harmful either) 51 | if strategy.num_replicas_in_sync > 1: 52 | opt = tf.data.Options() 53 | opt.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA 54 | dataset = dataset.with_options(opt) 55 | dataset = dataset.batch( 56 | batch_size 57 | # drop_remainder=is_train 58 | ) 59 | dataset = dataset.prefetch(AUTOTUNE) 60 | print("cnt:", cnt, "batch size:", batch_size) 61 | return dataset, int(np.ceil(cnt / batch_size)) 62 | -------------------------------------------------------------------------------- /tf-helper-bot/tf_helper_bot/lr_schedulers.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from tensorflow.python.framework import constant_op 4 | from tensorflow.python.framework import ops 5 | from tensorflow.python.ops import control_flow_ops 6 | from tensorflow.python.ops import math_ops 7 | from tensorflow.python.keras.optimizer_v2.learning_rate_schedule import LearningRateSchedule 8 | 9 | 10 | class CosineDecayWithWarmup(LearningRateSchedule): 11 | """A LearningRateSchedule that uses a cosine decay schedule.""" 12 | 13 | def __init__( 14 | self, 15 | initial_learning_rate, 16 | max_learning_rate, 17 | warmup_steps, 18 | decay_steps, 19 | alpha=0.0, 20 | name=None): 21 | super().__init__() 22 | self.initial_learning_rate = initial_learning_rate 23 | self.max_learning_rate = max_learning_rate 24 | self.warmup_steps = warmup_steps 25 | self.decay_steps = decay_steps 26 | self.alpha = alpha 27 | self.name = name 28 | 29 | @staticmethod 30 | def lr_warmup(steps, warmup_steps, 
                  max_learning_rate, initial_learning_rate):
31 |         return initial_learning_rate + (
32 |             max_learning_rate - initial_learning_rate
33 |         ) * (steps / warmup_steps)
34 | 
35 |     @staticmethod
36 |     def cosine_decay(steps, warmup_steps, decay_steps, max_learning_rate, alpha):
37 |         completed_fraction = (
38 |             steps - warmup_steps) / decay_steps
39 |         cosine_decayed = 0.5 * (1.0 + math_ops.cos(
40 |             constant_op.constant(math.pi) * completed_fraction))
41 |         decayed = (1 - alpha) * cosine_decayed + alpha
42 |         return math_ops.multiply(max_learning_rate, decayed)
43 | 
44 |     def __call__(self, step):
45 |         with ops.name_scope_v2(self.name or "CosineDecayWithWarmup"):
46 |             initial_learning_rate = ops.convert_to_tensor(
47 |                 self.initial_learning_rate, name="initial_learning_rate")
48 |             max_learning_rate = ops.convert_to_tensor(
49 |                 self.max_learning_rate, name="max_learning_rate")
50 |             dtype = initial_learning_rate.dtype
51 |             decay_steps = math_ops.cast(self.decay_steps, dtype)
52 |             warmup_steps = math_ops.cast(self.warmup_steps, dtype)
53 |             total_steps = decay_steps + warmup_steps
54 | 
55 |             global_step_recomp = math_ops.cast(step, dtype)
56 |             global_step_recomp = math_ops.minimum(
57 |                 global_step_recomp, total_steps)
58 | 
59 |             return control_flow_ops.cond(
60 |                 math_ops.less_equal(global_step_recomp, warmup_steps),
61 |                 lambda: self.lr_warmup(
62 |                     global_step_recomp, warmup_steps, max_learning_rate,
63 |                     initial_learning_rate
64 |                 ),
65 |                 lambda: self.cosine_decay(
66 |                     global_step_recomp, warmup_steps, decay_steps,
67 |                     max_learning_rate, self.alpha
68 |                 )
69 |             )
70 | 
71 |     def get_config(self):
72 |         return {
73 |             "initial_learning_rate": self.initial_learning_rate,
74 |             "max_learning_rate": self.max_learning_rate,
75 |             "warmup_steps": self.warmup_steps,
76 |             "decay_steps": self.decay_steps,
77 |             "alpha": self.alpha,
78 |             "name": self.name
79 |         }
80 | 
--------------------------------------------------------------------------------
/quest/eval_tpu.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | 
3 | import fire
4 | import joblib
5 | import numpy as np
6 | import pandas as pd
7 | from tqdm import tqdm
8 | import tensorflow as tf
9 | from scipy.special import expit
10 | from transformers import AutoTokenizer, RobertaConfig
11 | from tf_helper_bot.utils import prepare_tpu
12 | 
13 | from .models import DualRobertaModel
14 | from .metrics import SpearmanCorr
15 | from .prepare_tfrecords import Preprocessor, OUTPUT_COLUMNS, INPUT_COLUMNS
16 | from .inference import ROBERTA_CONFIG, get_batch
17 | from .post_processing import find_best_bins
18 | from .dataset import tfrecord_dataset
19 | 
20 | 
21 | def eval_fold(
22 |         valid_path: str,
23 |         model_path: str = "cache/roberta-base-fold-0.h5",
24 |         batch_size: int = 8
25 | ):
26 |     strategy, tpu = prepare_tpu()
27 |     if tpu:
28 |         batch_size *= strategy.num_replicas_in_sync
29 |     valid_ds, valid_steps = tfrecord_dataset(
30 |         valid_path, batch_size, strategy, is_train=False)
31 |     valid_dist_ds = strategy.experimental_distribute_dataset(
32 |         valid_ds)
33 | 
34 |     model_name = Path(model_path).name
35 |     if model_name.lower().startswith("roberta-base"):
36 |         config = RobertaConfig.from_dict(
37 |             ROBERTA_CONFIG)
38 |         model = DualRobertaModel(
39 |             model_name="roberta-base", config=config, pretrained=False
40 |         )
41 |         # build
42 |         model(next(iter(valid_ds))[0], training=False)
43 |         model.load_weights(model_path)
44 |     else:
45 |         raise ValueError("Unknown model.")
46 |     spearman = SpearmanCorr()
47 | 
48 |     @tf.function
49 |     def predict_batch(inputs):
50 |         return model(inputs, training=False)[0]
51 | 
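    # With TPUStrategy, predict_batch runs once per replica; `.values`
    # unpacks the PerReplica result and tf.concat stitches the shards back
    # into a single batch (the labels are gathered the same way below via
    # strategy.experimental_local_results).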
52 | preds, labels = [], [] 53 | for batch_, labels_ in tqdm(valid_dist_ds, total=valid_steps, ncols=100): 54 | tmp = strategy.experimental_run_v2( 55 | predict_batch, 56 | args=(batch_,) 57 | ).values 58 | preds.append( 59 | tf.concat( 60 | tmp, axis=0 61 | ).numpy() 62 | ) 63 | labels.append(tf.concat( 64 | strategy.experimental_local_results(labels_), 65 | axis=0 66 | ).numpy()) 67 | preds = np.concatenate(preds) 68 | labels = np.concatenate(labels) 69 | 70 | score = spearman(labels, preds)[0] * -1 71 | print(f"Raw Spearman: {score * 100 : .2f}") 72 | return labels, preds 73 | 74 | 75 | def eval_folds( 76 | n_folds: int = 5, 77 | valid_pattern: str = "gs://ceshine-colab-tmp-2/quest/valid-%d-*.tfrec", 78 | model_pattern: str = "cache/roberta-base-fold-%d.h5", 79 | batch_size: int = 8 80 | ): 81 | if Path("cache/oof.jl").exists(): 82 | labels, preds = joblib.load("cache/oof.jl") 83 | else: 84 | labels, preds = [], [] 85 | for fold in range(n_folds): 86 | matches = list(tf.io.gfile.glob(valid_pattern % fold)) 87 | assert len(matches) == 1 88 | labels_tmp, preds_tmp = eval_fold( 89 | matches[0], 90 | model_pattern % fold, 91 | batch_size 92 | ) 93 | labels.append(labels_tmp) 94 | preds.append(preds_tmp) 95 | 96 | labels = np.concatenate(labels) 97 | preds = np.concatenate(preds) 98 | joblib.dump([labels, preds], "cache/oof.jl") 99 | spearman = SpearmanCorr() 100 | score = spearman(labels, preds)[0] * -1 101 | print(f"Raw Spearman: {score * 100 : .2f}") 102 | best_score, best_bins, scaler = find_best_bins(labels, expit(preds)) 103 | print(f"Optimized Spearman: {best_score * 100 : .2f}") 104 | print(best_bins) 105 | joblib.dump([best_bins, scaler], "cache/best_bins.jl") 106 | 107 | 108 | if __name__ == '__main__': 109 | fire.Fire(eval_folds) 110 | -------------------------------------------------------------------------------- /quest/eval.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | 4 | import fire 5 | import joblib 6 | import numpy as np 7 | import pandas as pd 8 | from tqdm import tqdm 9 | import tensorflow as tf 10 | from scipy.special import expit 11 | from transformers import AutoTokenizer, RobertaConfig 12 | 13 | from .models import DualRobertaModel 14 | from .metrics import SpearmanCorr 15 | from .prepare_tfrecords import Preprocessor, OUTPUT_COLUMNS, INPUT_COLUMNS 16 | from .inference import ROBERTA_CONFIG, get_batch 17 | from .post_processing import find_best_bins 18 | 19 | 20 | def eval_fold( 21 | input_path: str = "data/", 22 | fold_path: str = "cache/tfrecords/fold_0.jl", 23 | model_path: str = "cache/roberta-base-fold-0.h5", 24 | tokenizer_path: str = "cache/tfrecords/tokenizer_roberta-base/", 25 | batch_size: int = 8 26 | ): 27 | df_train = pd.read_csv(input_path + 'train.csv') 28 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) 29 | processor = Preprocessor(tokenizer) 30 | labels = df_train.loc[:, OUTPUT_COLUMNS].values 31 | inputs = df_train.loc[:, INPUT_COLUMNS].values 32 | _, valid_idx = joblib.load(fold_path) 33 | # valid_idx = valid_idx[:100] # For faster debug 34 | labels, inputs = labels[valid_idx], inputs[valid_idx] 35 | tmp = [] 36 | for i in tqdm(range(labels.shape[0]), ncols=100): 37 | tmp.append(processor.process_one_example( 38 | inputs[i, 0], 39 | inputs[i, 1], 40 | inputs[i, 2]) 41 | ) 42 | processed_inputs = np.array(tmp) 43 | del tmp, inputs 44 | 45 | model_name = Path(model_path).name 46 | if model_name.lower().startswith("roberta-base"): 47 | config = 
RobertaConfig.from_dict( 48 | ROBERTA_CONFIG) 49 | model = DualRobertaModel( 50 | model_name="roberta-base", config=config, pretrained=False 51 | ) 52 | # build 53 | model(get_batch(processed_inputs[:2]), training=False) 54 | model.load_weights(model_path) 55 | else: 56 | raise ValueError("Unknown model.") 57 | spearman = SpearmanCorr() 58 | 59 | @tf.function 60 | def predict_batch(inputs): 61 | return model(inputs, training=False)[0] 62 | 63 | preds = [] 64 | for i in tqdm(range(0, len(labels), batch_size), ncols=100): 65 | input_dicts = processed_inputs[i:i+batch_size] 66 | preds.append(predict_batch(get_batch(input_dicts)).numpy()) 67 | preds = np.concatenate(preds) 68 | 69 | score = spearman(labels, preds)[0] * -1 70 | print(f"Raw Spearman: {score * 100 : .2f}") 71 | return labels, preds 72 | 73 | 74 | def eval_folds( 75 | n_folds: int = 5, 76 | input_path: str = "data/", 77 | fold_pattern: str = "cache/tfrecords/fold_%d.jl", 78 | model_pattern: str = "cache/roberta-base-fold-%d.h5", 79 | tokenizer_path: str = "cache/tfrecords/tokenizer_roberta-base/", 80 | batch_size: int = 8 81 | ): 82 | if Path("cache/oof.jl").exists(): 83 | labels, preds = joblib.load("cache/oof.jl") 84 | else: 85 | labels, preds = [], [] 86 | for fold in range(n_folds): 87 | labels_tmp, preds_tmp = eval_fold( 88 | input_path, fold_pattern % fold, 89 | model_pattern % fold, 90 | tokenizer_path, 91 | batch_size 92 | ) 93 | labels.append(labels_tmp) 94 | preds.append(preds_tmp) 95 | 96 | labels = np.concatenate(labels) 97 | preds = np.concatenate(preds) 98 | joblib.dump([labels, preds], "cache/oof.jl") 99 | spearman = SpearmanCorr() 100 | score = spearman(labels, preds)[0] * -1 101 | print(f"Raw Spearman: {score * 100 : .2f}") 102 | best_score, best_bins, scaler = find_best_bins(labels, expit(preds)) 103 | print(f"Optimized Spearman: {best_score * 100 : .2f}") 104 | print(best_bins) 105 | joblib.dump([best_bins, scaler], "cache/best_bins.jl") 106 | 107 | 108 | if __name__ == '__main__': 109 | fire.Fire(eval_folds) 110 | -------------------------------------------------------------------------------- /quest/inference.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import logging 3 | from pathlib import Path 4 | 5 | import fire 6 | import joblib 7 | import numpy as np 8 | import pandas as pd 9 | from tqdm import tqdm 10 | import tensorflow as tf 11 | from scipy.special import expit 12 | from transformers import AutoTokenizer 13 | from transformers import RobertaConfig 14 | 15 | from .models import DualRobertaModel 16 | from .prepare_tfrecords import Preprocessor, INPUT_COLUMNS, OUTPUT_COLUMNS 17 | from .post_processing import prevent_nan 18 | 19 | ROBERTA_CONFIG = { 20 | "architectures": [ 21 | "RobertaForMaskedLM" 22 | ], 23 | "attention_probs_dropout_prob": 0.1, 24 | "finetuning_task": None, 25 | "hidden_act": "gelu", 26 | "hidden_dropout_prob": 0.1, 27 | "hidden_size": 768, 28 | "id2label": { 29 | "0": "LABEL_0", 30 | "1": "LABEL_1" 31 | }, 32 | "initializer_range": 0.02, 33 | "intermediate_size": 3072, 34 | "is_decoder": False, 35 | "label2id": { 36 | "LABEL_0": 0, 37 | "LABEL_1": 1 38 | }, 39 | "layer_norm_eps": 1e-05, 40 | "max_position_embeddings": 514, 41 | "num_attention_heads": 12, 42 | "num_hidden_layers": 12, 43 | "num_labels": 30, 44 | "output_attentions": False, 45 | "output_hidden_states": False, 46 | "output_past": True, 47 | "pruned_heads": {}, 48 | "torchscript": False, 49 | "type_vocab_size": 1, 50 | "use_bfloat16": False, 51 | "vocab_size": 
50265 52 | } 53 | 54 | 55 | def get_batch(input_dicts): 56 | return { 57 | "input_ids_question": tf.convert_to_tensor(np.stack([ 58 | x["input_ids_question"] for x in input_dicts 59 | ], axis=0)), 60 | "attention_mask_question": tf.convert_to_tensor(np.stack([ 61 | x["input_mask_question"] for x in input_dicts 62 | ], axis=0)), 63 | "input_ids_answer": tf.convert_to_tensor(np.stack([ 64 | x["input_ids_answer"] for x in input_dicts 65 | ], axis=0)), 66 | "attention_mask_answer": tf.convert_to_tensor(np.stack([ 67 | x["input_mask_answer"] for x in input_dicts 68 | ], axis=0)), 69 | } 70 | 71 | 72 | def main( 73 | input_path: str = "data/", 74 | tokenizer_path: str = "cache/tfrecords/tokenizer_roberta-base/", 75 | model_path_pattern: str = "cache/roberta-base-fold-*", 76 | best_bins_path: str = "cache/best_bins.jl", 77 | batch_size: int = 8, progress_bar: bool = True, 78 | add_sigmoid: bool = False, rank: bool = False 79 | ): 80 | df_valid = pd.read_csv(input_path + 'test.csv') 81 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) 82 | processor = Preprocessor(tokenizer) 83 | inputs = df_valid.loc[:, INPUT_COLUMNS].values 84 | tmp = [] 85 | for i in tqdm(range(inputs.shape[0]), ncols=100, disable=not progress_bar): 86 | tmp.append(processor.process_one_example( 87 | inputs[i, 0], 88 | inputs[i, 1], 89 | inputs[i, 2]) 90 | ) 91 | processed_inputs = np.array(tmp) 92 | del tmp, inputs 93 | 94 | buffer = [] 95 | for model_path in glob.glob(model_path_pattern): 96 | model_name = Path(model_path).name 97 | print(model_path, model_name) 98 | if model_name.lower().startswith("roberta-base"): 99 | config = RobertaConfig.from_dict( 100 | ROBERTA_CONFIG) 101 | model = DualRobertaModel( 102 | model_name="roberta-base", config=config, pretrained=False) 103 | # build 104 | model(get_batch(processed_inputs[:2]), training=False) 105 | model.load_weights(model_path) 106 | else: 107 | raise ValueError("Unknown model.") 108 | 109 | @tf.function 110 | def predict_batch(inputs): 111 | return model(inputs, training=False)[0] 112 | 113 | preds = [] 114 | for i in tqdm(range( 115 | 0, len(processed_inputs), batch_size 116 | ), ncols=100, disable=not progress_bar): 117 | input_dicts = processed_inputs[i:i+batch_size] 118 | preds.append(predict_batch(get_batch(input_dicts)).numpy()) 119 | if add_sigmoid and not rank: 120 | buffer.append(expit(np.concatenate(preds))) 121 | elif rank: 122 | tmp = np.concatenate(preds) 123 | buffer.append( 124 | tmp.argsort(axis=0).argsort(axis=0) / tmp.shape[0] 125 | ) 126 | else: 127 | buffer.append(np.concatenate(preds)) 128 | 129 | final_preds = np.mean(buffer, axis=0) 130 | if add_sigmoid and not rank: 131 | best_bins, scaler = joblib.load(best_bins_path) 132 | best_bins = np.array(best_bins)[None, :] 133 | # post-process 134 | final_preds = np.clip(scaler.transform(final_preds), 0., 1.) 
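        # Snap each column onto a grid of k evenly spaced values, where k is
        # the per-column bin count tuned on out-of-fold predictions by
        # post_processing.find_best_bins; prevent_nan then re-randomizes two
        # entries of any column that collapsed to a constant, so the Spearman
        # correlation stays defined.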
135 | final_preds = prevent_nan( 136 | np.round(final_preds * best_bins) / best_bins 137 | ) 138 | 139 | df_sub = pd.DataFrame(final_preds, columns=OUTPUT_COLUMNS) 140 | df_sub["qa_id"] = df_valid["qa_id"].values 141 | df_sub.to_csv("submission.csv", index=False) 142 | 143 | 144 | if __name__ == '__main__': 145 | fire.Fire(main) 146 | -------------------------------------------------------------------------------- /quest/models.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from transformers import TFRobertaPreTrainedModel 4 | from transformers.modeling_tf_roberta import TFRobertaMainLayer, TFRobertaClassificationHead 5 | 6 | from .prepare_tfrecords import QUESTION_COLUMNS, ANSWER_COLUMNS, JOINT_COLUMNS 7 | 8 | 9 | class AveragePooling(tf.keras.layers.Layer): 10 | def call(self, states, mask): 11 | mask = tf.cast(tf.expand_dims(mask, 2), tf.float32) 12 | pooled = tf.reduce_sum(states * mask, axis=1) 13 | return pooled / tf.reduce_sum(mask, axis=1) 14 | 15 | 16 | class SELayer(tf.keras.layers.Layer): 17 | def __init__(self, channels, reduction): 18 | super().__init__() 19 | self.fc1 = tf.keras.layers.Dense( 20 | channels // reduction, 21 | kernel_initializer=tf.keras.initializers.he_normal(seed=None), 22 | name="fc1", 23 | activation="relu" 24 | ) 25 | self.fc2 = tf.keras.layers.Dense( 26 | channels, 27 | kernel_initializer=tf.keras.initializers.he_normal(seed=None), 28 | name="fc2", 29 | activation="sigmoid" 30 | ) 31 | 32 | def call(self, x): 33 | tmp = self.fc1(x) 34 | tmp = self.fc2(tmp) 35 | return tmp * x 36 | 37 | 38 | class RobertaEncoder(TFRobertaPreTrainedModel): 39 | def __init__(self, config, *inputs, **kwargs): 40 | super().__init__(config, *inputs, **kwargs) 41 | self.num_labels = config.num_labels 42 | self.roberta = TFRobertaMainLayer(config, name="roberta") 43 | self.pooling = AveragePooling() 44 | 45 | def call(self, inputs, **kwargs): 46 | if "attention_mask" not in inputs: 47 | inputs["attention_mask"] = tf.ones( 48 | tf.shape(inputs["input_ids"])[:2], tf.int32 49 | ) 50 | outputs = self.roberta(inputs, **kwargs)[0] 51 | return self.pooling(outputs, inputs["attention_mask"]) 52 | 53 | 54 | class DualRobertaModel(tf.keras.Model): 55 | def __init__(self, config, model_name, pretrained: bool = True): 56 | super().__init__() 57 | self.num_labels = config.num_labels 58 | 59 | if pretrained: 60 | self.roberta = RobertaEncoder.from_pretrained( 61 | model_name, config=config, name="roberta_question") 62 | else: 63 | self.roberta = RobertaEncoder( 64 | config=config, name="roberta_question") 65 | self.dropout = tf.keras.layers.Dropout(0.5) 66 | self.q_classifier = tf.keras.layers.Dense( 67 | len(QUESTION_COLUMNS), 68 | kernel_initializer=tf.keras.initializers.he_normal(seed=None), 69 | name="q_classifier", 70 | activation="linear" 71 | ) 72 | self.a_classifier = tf.keras.layers.Dense( 73 | len(ANSWER_COLUMNS), 74 | kernel_initializer=tf.keras.initializers.he_normal(seed=None), 75 | name="a_classifier", 76 | activation="linear" 77 | ) 78 | self.j_classifier = tf.keras.layers.Dense( 79 | len(JOINT_COLUMNS), 80 | kernel_initializer=tf.keras.initializers.he_normal(seed=None), 81 | name="j_classifier", 82 | activation="linear" 83 | ) 84 | self.gating_q = SELayer(config.hidden_size, 4) 85 | self.gating_a = SELayer(config.hidden_size, 4) 86 | self.gating_j = SELayer(config.hidden_size * 3, 4) 87 | 88 | def freeze(self): 89 | self.roberta.trainable = False 90 | 91 | def unfreeze(self): 92 | self.roberta.trainable = 
True 93 | 94 | def call(self, inputs, **kwargs): 95 | pooled_output_question = self.roberta( 96 | { 97 | "input_ids": inputs["input_ids_question"], 98 | "attention_mask": inputs["attention_mask_question"] 99 | }, **kwargs 100 | ) 101 | pooled_output_answer = self.roberta( 102 | { 103 | "input_ids": inputs["input_ids_answer"], 104 | "attention_mask": inputs["attention_mask_answer"] 105 | }, **kwargs 106 | ) 107 | combined = tf.concat( 108 | [ 109 | pooled_output_question, pooled_output_answer, 110 | pooled_output_answer * pooled_output_question 111 | ], 112 | axis=1 113 | ) 114 | q_logit = self.q_classifier(self.dropout( 115 | self.gating_q( 116 | pooled_output_question 117 | ), training=kwargs.get("training", False) 118 | )) 119 | a_logit = self.a_classifier(self.dropout( 120 | self.gating_a( 121 | pooled_output_answer 122 | ), training=kwargs.get("training", False) 123 | )) 124 | j_logit = self.j_classifier(self.dropout( 125 | self.gating_j( 126 | combined 127 | ), training=kwargs.get("training", False) 128 | )) 129 | logits = tf.concat( 130 | [q_logit, a_logit, j_logit], 131 | axis=1 132 | ) 133 | # add hidden states and attention if they are here 134 | outputs = (logits,) 135 | return outputs 136 | -------------------------------------------------------------------------------- /quest/prepare_reference_data.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from math import floor, ceil 3 | 4 | import fire 5 | import joblib 6 | import numpy as np 7 | import pandas as pd 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer 10 | from sklearn.model_selection import GroupKFold 11 | 12 | tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") 13 | 14 | logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR) 15 | 16 | 17 | def _get_masks(tokens, max_seq_length): 18 | """Mask for padding""" 19 | if len(tokens) > max_seq_length: 20 | raise IndexError("Token length more than max seq length!") 21 | return [1]*len(tokens) + [0] * (max_seq_length - len(tokens)) 22 | 23 | 24 | def _get_segments(tokens, max_seq_length): 25 | """Segments: 0 for the first sequence, 1 for the second""" 26 | if len(tokens) > max_seq_length: 27 | raise IndexError("Token length more than max seq length!") 28 | segments = [] 29 | first_sep = True 30 | current_segment_id = 0 31 | for token in tokens: 32 | segments.append(current_segment_id) 33 | if token == "[SEP]": 34 | if first_sep: 35 | first_sep = False 36 | else: 37 | current_segment_id = 1 38 | return segments + [0] * (max_seq_length - len(tokens)) 39 | 40 | 41 | def _get_ids(tokens, tokenizer, max_seq_length): 42 | """Token ids from Tokenizer vocab""" 43 | token_ids = tokenizer.convert_tokens_to_ids(tokens) 44 | input_ids = token_ids + [0] * (max_seq_length-len(token_ids)) 45 | return input_ids 46 | 47 | 48 | def _trim_input(title, question, answer, max_sequence_length, 49 | t_max_len=30, q_max_len=239, a_max_len=239): 50 | 51 | t = tokenizer.tokenize(title) 52 | q = tokenizer.tokenize(question) 53 | a = tokenizer.tokenize(answer) 54 | 55 | t_len = len(t) 56 | q_len = len(q) 57 | a_len = len(a) 58 | 59 | if (t_len+q_len+a_len+4) > max_sequence_length: 60 | 61 | if t_max_len > t_len: 62 | t_new_len = t_len 63 | a_max_len = a_max_len + floor((t_max_len - t_len)/2) 64 | q_max_len = q_max_len + ceil((t_max_len - t_len)/2) 65 | else: 66 | t_new_len = t_max_len 67 | 68 | if a_max_len > a_len: 69 | a_new_len = a_len 70 | q_new_len = q_max_len + (a_max_len - a_len) 71 | elif 
q_max_len > q_len:
72 |             a_new_len = a_max_len + (q_max_len - q_len)
73 |             q_new_len = q_len
74 |         else:
75 |             a_new_len = a_max_len
76 |             q_new_len = q_max_len
77 | 
78 |         if t_new_len+a_new_len+q_new_len+4 != max_sequence_length:
79 |             raise ValueError("New sequence length should be %d, but is %d"
80 |                              % (max_sequence_length, (t_new_len+a_new_len+q_new_len+4)))
81 | 
82 |         t = t[:t_new_len]
83 |         q = q[:q_new_len]
84 |         a = a[:a_new_len]
85 | 
86 |     return t, q, a
87 | 
88 | 
89 | def _convert_to_bert_inputs(title, question, answer, tokenizer, max_sequence_length):
90 |     """Converts tokenized input to ids, masks and segments for BERT"""
91 | 
92 |     stoken = ["[CLS]"] + title + ["[SEP]"] + \
93 |         question + ["[SEP]"] + answer + ["[SEP]"]
94 | 
95 |     input_ids = _get_ids(stoken, tokenizer, max_sequence_length)
96 |     input_masks = _get_masks(stoken, max_sequence_length)
97 |     input_segments = _get_segments(stoken, max_sequence_length)
98 | 
99 |     return [input_ids, input_masks, input_segments]
100 | 
101 | 
102 | def compute_input_arrays(df, columns, tokenizer, max_sequence_length):
103 |     input_ids, input_masks, input_segments = [], [], []
104 |     for _, instance in tqdm(df[columns].iterrows(), total=df.shape[0]):
105 |         t, q, a = instance.question_title, instance.question_body, instance.answer
106 | 
107 |         t, q, a = _trim_input(t, q, a, max_sequence_length)
108 | 
109 |         ids, masks, segments = _convert_to_bert_inputs(
110 |             t, q, a, tokenizer, max_sequence_length)
111 |         input_ids.append(ids)
112 |         input_masks.append(masks)
113 |         input_segments.append(segments)
114 | 
115 |     return [np.asarray(input_ids, dtype=np.int32),
116 |             np.asarray(input_masks, dtype=np.int32),
117 |             np.asarray(input_segments, dtype=np.int32)]
118 | 
119 | 
120 | def compute_output_arrays(df, columns):
121 |     return np.asarray(df[columns])
122 | 
123 | 
124 | def main(
125 |         input_path: str = "data/",
126 |         max_sequence_length: int = 512
127 | ):
128 |     df_train = pd.read_csv(input_path + 'train.csv')
129 |     output_categories = list(df_train.columns[11:])
130 |     input_categories = list(df_train.columns[[1, 2, 5]])
131 | 
132 |     gkf = GroupKFold(n_splits=5).split(
133 |         X=df_train.question_body, groups=df_train.question_body)
134 |     outputs = compute_output_arrays(df_train, output_categories)
135 |     inputs = compute_input_arrays(
136 |         df_train, input_categories, tokenizer, max_sequence_length)
137 | 
138 |     for fold, (train_idx, valid_idx) in enumerate(gkf):
139 |         joblib.dump(
140 |             [inputs[0][train_idx], inputs[1][train_idx],
141 |              inputs[2][train_idx], outputs[train_idx]],
142 |             f"cache/tfrecords/train-{fold}.jl"
143 |         )
144 |         joblib.dump(
145 |             [inputs[0][valid_idx], inputs[1][valid_idx],
146 |              inputs[2][valid_idx], outputs[valid_idx]],
147 |             f"cache/tfrecords/valid-{fold}.jl"
148 |         )
149 | 
150 | 
151 | if __name__ == '__main__':
152 |     fire.Fire(main)
153 | 
--------------------------------------------------------------------------------
/quest/train.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | from pathlib import Path
4 | 
5 | import fire
6 | import tensorflow as tf
7 | from transformers import BertConfig, RobertaConfig
8 | from tf_helper_bot import (
9 |     BaseBot, BaseDistributedBot,
10 |     MovingAverageStatsTrackerCallback, CheckpointCallback, TelegramCallback,
11 |     CosineDecayWithWarmup
12 | )
13 | from tf_helper_bot.utils import prepare_tpu
14 | from tf_helper_bot.optimizers import RAdam
15 | 
16 | from .models import DualRobertaModel
17 | from .dataset import tfrecord_dataset
18 | from 
.metrics import SpearmanCorr 19 | 20 | TELEGRAM_TOKEN = os.environ.get("TG_TOKEN", "") 21 | TELEGRAM_CHAT_ID = os.environ.get("TG_CHAT_ID", "") 22 | 23 | 24 | class QuestBot(BaseBot): 25 | def _extract_prediction(self, x): 26 | if isinstance(x, tuple): 27 | # the model returns a tuple when run inside tf.function 28 | return x[0] 29 | return x 30 | 31 | 32 | class QuestDistributedBot(BaseDistributedBot): 33 | def _extract_prediction(self, x): 34 | if isinstance(x, tuple): 35 | return x[0] 36 | return x 37 | 38 | 39 | def loss_fn(labels, predictions): 40 | return tf.math.reduce_mean( 41 | tf.keras.losses.binary_crossentropy( 42 | labels, predictions, from_logits=True 43 | # tf.keras.losses.mean_absolute_error( 44 | # tf.keras.losses.mean_squared_error( 45 | # labels, predictions, 46 | ), 47 | axis=0 48 | ) 49 | 50 | 51 | def train_model( 52 | train_path: str = "cache/tfrecords/train-0-4863-288-320.tfrec", 53 | valid_path: str = "cache/tfrecords/valid-0-1216-288-320.tfrec", 54 | model_name: str = "bert-large-uncased-whole-word-masking", 55 | output_path: str = "cache/model", 56 | batch_size: int = 8, grad_accu: int = 2, 57 | log_interval: int = 200, steps: int = 1000, 58 | checkpoint_interval: int = 500, 59 | min_lr: float = 1e-6, max_lr: float = 3e-5, 60 | freeze: int = 0 61 | ): 62 | # Path(output_path).mkdir(exist_ok=True, parents=True) 63 | strategy, tpu = prepare_tpu() 64 | print("REPLICAS: ", strategy.num_replicas_in_sync) 65 | 66 | valid_batch_size = batch_size * 2 67 | if strategy.num_replicas_in_sync == 8: # single TPU 68 | valid_batch_size = batch_size * strategy.num_replicas_in_sync * 2 69 | batch_size = batch_size * strategy.num_replicas_in_sync 70 | logging.getLogger("tensorflow").setLevel(logging.WARNING) 71 | 72 | with strategy.scope(): 73 | train_ds, train_steps = tfrecord_dataset( 74 | train_path, batch_size, strategy, is_train=True) 75 | valid_ds, valid_steps = tfrecord_dataset( 76 | valid_path, valid_batch_size, strategy, is_train=False) 77 | if model_name.lower().startswith("roberta"): 78 | config = RobertaConfig.from_pretrained(model_name, num_labels=30) 79 | model = DualRobertaModel( 80 | model_name=model_name, config=config) 81 | else: 82 | raise ValueError("Unknown model!") 83 | lr_schedule = CosineDecayWithWarmup( 84 | initial_learning_rate=min_lr, max_learning_rate=max_lr, 85 | warmup_steps=int(steps * 0.1), 86 | decay_steps=steps - int(steps * 0.1), 87 | alpha=1e-4 88 | ) 89 | # optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule, epsilon=1e-6) 90 | optimizer_1 = RAdam(learning_rate=1e-3, epsilon=1e-6) 91 | optimizer = RAdam(learning_rate=lr_schedule, epsilon=1e-6) 92 | # build the model 93 | model(next(iter(train_ds))[0]) 94 | 95 | if freeze > 0: 96 | model.freeze() 97 | model.compile( 98 | optimizer=optimizer_1, 99 | loss=loss_fn 100 | ) 101 | print(model.summary()) 102 | model.fit( 103 | train_ds, epochs=1, 104 | steps_per_epoch=train_steps * freeze 105 | ) 106 | model.unfreeze() 107 | model.compile( 108 | optimizer=optimizer, 109 | loss=loss_fn 110 | ) 111 | print(model.summary()) 112 | 113 | train_dist_ds = strategy.experimental_distribute_dataset( 114 | train_ds) 115 | valid_dist_ds = strategy.experimental_distribute_dataset( 116 | valid_ds) 117 | 118 | checkpoints = CheckpointCallback( 119 | keep_n_checkpoints=1, 120 | checkpoint_dir="cache/model_cache/", 121 | monitor_metric="spearman" 122 | ) 123 | callbacks = [ 124 | MovingAverageStatsTrackerCallback( 125 | avg_window=int(log_interval * 1.25), 126 | log_interval=log_interval, 127 | ), 128 | 
checkpoints 129 | ] 130 | if TELEGRAM_TOKEN and TELEGRAM_CHAT_ID: 131 | callbacks += [ 132 | TelegramCallback( 133 | token=TELEGRAM_TOKEN, 134 | chat_id=TELEGRAM_CHAT_ID, 135 | name="QuestFinetune", 136 | report_evals=False 137 | ) 138 | ] 139 | metrics = (SpearmanCorr(add_sigmoid=True),) 140 | if tpu: 141 | bot = QuestDistributedBot( 142 | model=model, 143 | criterion=loss_fn, 144 | optimizer=optimizer, 145 | train_dataset=train_dist_ds, 146 | valid_dataset=valid_dist_ds, 147 | steps_per_epoch=train_steps, 148 | strategy=strategy, 149 | gradient_accumulation_steps=1, 150 | callbacks=callbacks, 151 | metrics=metrics, 152 | valid_steps=valid_steps, 153 | ) 154 | else: 155 | bot = QuestBot( 156 | model=model, 157 | criterion=loss_fn, 158 | optimizer=optimizer, 159 | train_dataset=train_dist_ds, 160 | valid_dataset=valid_dist_ds, 161 | steps_per_epoch=train_steps, 162 | gradient_accumulation_steps=grad_accu, 163 | callbacks=callbacks, 164 | metrics=metrics, 165 | valid_steps=valid_steps 166 | ) 167 | print(f"Steps per epoch: {train_steps} | {valid_steps}") 168 | 169 | bot.train(checkpoint_interval=checkpoint_interval, n_steps=steps) 170 | best_score = checkpoints.best_performers[0][0] 171 | bot.model.load_weights(str(checkpoints.best_performers[0][1])) 172 | checkpoints.remove_checkpoints(keep=0) 173 | bot.model.save_weights(output_path + ".h5") 174 | return best_score 175 | 176 | 177 | if __name__ == '__main__': 178 | fire.Fire(train_model) 179 | -------------------------------------------------------------------------------- /quest/prepare_tfrecords.py: -------------------------------------------------------------------------------- 1 | import math 2 | import logging 3 | from pathlib import Path 4 | 5 | import fire 6 | import joblib 7 | import numpy as np 8 | import pandas as pd 9 | from tqdm import tqdm 10 | import tensorflow as tf 11 | from transformers import AutoTokenizer 12 | from sklearn.model_selection import GroupKFold 13 | 14 | QUESTION_COLUMNS = ( 15 | 'question_asker_intent_understanding', 'question_body_critical', 'question_conversational', 16 | 'question_expect_short_answer', 'question_fact_seeking', 'question_has_commonly_accepted_answer', 17 | 'question_interestingness_others', 'question_interestingness_self', 'question_multi_intent', 18 | 'question_not_really_a_question', 'question_opinion_seeking', 'question_type_choice', 19 | 'question_type_compare', 'question_type_consequence', 'question_type_definition', 20 | 'question_type_entity', 'question_type_instructions', 'question_type_procedure', 21 | 'question_type_reason_explanation', 'question_type_spelling', 'question_well_written', 22 | ) 23 | ANSWER_COLUMNS = ( 24 | 'answer_type_instructions', 'answer_type_procedure', 'answer_type_reason_explanation', 25 | 'answer_well_written', 'answer_level_of_information' 26 | ) 27 | JOINT_COLUMNS = ( 28 | 'answer_helpful', 'answer_plausible', 'answer_relevance', 29 | 'answer_satisfaction' 30 | ) 31 | INPUT_COLUMNS = ('question_title', 'question_body', 'answer') 32 | OUTPUT_COLUMNS = QUESTION_COLUMNS + ANSWER_COLUMNS + JOINT_COLUMNS 33 | 34 | logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR) 35 | 36 | 37 | class Preprocessor: 38 | def __init__(self, tokenizer, title_max_len=64, question_max_len=352-7, answer_max_len=480-5): 39 | self.tokenizer = tokenizer 40 | self.title_max_len = title_max_len 41 | self.question_max_len = question_max_len 42 | self.answer_max_len = answer_max_len 43 | self.bos_token_id = self.tokenizer.convert_tokens_to_ids( 44 | 
[self.tokenizer.bos_token])[0] 45 | self.eos_token_id = self.tokenizer.convert_tokens_to_ids( 46 | [self.tokenizer.eos_token])[0] 47 | self.pad_token_id = self.tokenizer.convert_tokens_to_ids( 48 | [self.tokenizer.pad_token])[0] 49 | self.question_head = self.tokenizer.encode( 50 | "question", add_special_tokens=False) 51 | self.answer_head = self.tokenizer.encode( 52 | "answer", add_special_tokens=False) 53 | self.max_q_len = ( 54 | self.title_max_len + self.question_max_len + 55 | 6 + len(self.question_head) 56 | ) 57 | self.max_a_len = ( 58 | self.answer_max_len + 4 + len(self.answer_head) 59 | ) 60 | 61 | def _trim_input(self, title, question, answer): 62 | t = self.tokenizer.encode(title, add_special_tokens=False) 63 | q = self.tokenizer.encode(question, add_special_tokens=False) 64 | a = self.tokenizer.encode(answer, add_special_tokens=False) 65 | t = t[:self.title_max_len] 66 | q = q[:self.question_max_len] 67 | a = a[:self.answer_max_len] 68 | return t, q, a 69 | 70 | def process_one_example(self, title, question, answer): 71 | t_tokens, q_tokens, a_tokens = self._trim_input( 72 | title, question, answer) 73 | 74 | input_ids_question = np.zeros( 75 | self.max_q_len, dtype=np.int 76 | ) + self.pad_token_id 77 | input_ids_answer = np.zeros( 78 | self.max_a_len, dtype=np.int 79 | ) + self.pad_token_id 80 | question_tokens = np.asarray( 81 | [self.bos_token_id] + self.question_head + 82 | [self.eos_token_id, self.bos_token_id] + 83 | t_tokens + [self.eos_token_id, self.bos_token_id] + 84 | q_tokens + [self.eos_token_id] 85 | ) 86 | answer_tokens = np.asarray( 87 | [self.bos_token_id] + self.answer_head + 88 | [self.eos_token_id, self.bos_token_id] + 89 | a_tokens + [self.eos_token_id] 90 | ) 91 | assert len(question_tokens) <= len(input_ids_question) 92 | assert len(answer_tokens) <= len(input_ids_answer) 93 | input_ids_question[:len(question_tokens)] = question_tokens 94 | input_ids_answer[:len(answer_tokens)] = answer_tokens 95 | input_mask_question = np.zeros(len(input_ids_question), dtype=np.int) 96 | input_mask_question[:len(question_tokens)] = 1 97 | input_mask_answer = np.zeros(len(input_ids_answer), dtype=np.int) 98 | input_mask_answer[:len(answer_tokens)] = 1 99 | return { 100 | "input_ids_question": input_ids_question, 101 | "input_mask_question": input_mask_question, 102 | "input_ids_answer": input_ids_answer, 103 | "input_mask_answer": input_mask_answer 104 | } 105 | 106 | 107 | def to_example(input_dict, labels): 108 | feature = { 109 | "input_ids_question": tf.train.Feature( 110 | int64_list=tf.train.Int64List( 111 | value=input_dict["input_ids_question"]) 112 | ), 113 | "input_mask_question": tf.train.Feature( 114 | int64_list=tf.train.Int64List( 115 | value=input_dict["input_mask_question"]) 116 | ), 117 | "input_ids_answer": tf.train.Feature( 118 | int64_list=tf.train.Int64List(value=input_dict["input_ids_answer"]) 119 | ), 120 | "input_mask_answer": tf.train.Feature( 121 | int64_list=tf.train.Int64List( 122 | value=input_dict["input_mask_answer"]) 123 | ), 124 | "labels": tf.train.Feature( 125 | float_list=tf.train.FloatList(value=labels) 126 | ) 127 | } 128 | return tf.train.Example(features=tf.train.Features(feature=feature)) 129 | 130 | 131 | def _write_tfrecords(inputs, labels, output_filepath): 132 | with tf.io.TFRecordWriter(str(output_filepath)) as writer: 133 | for input_dict, labels_single in zip(inputs, labels): 134 | example = to_example(input_dict, labels_single) 135 | writer.write(example.SerializeToString()) 136 | print("Wrote file {} containing {} 
records".format( 137 | output_filepath, len(inputs))) 138 | 139 | 140 | def _write_arrays(inputs, labels, output_filepath): 141 | input_ids, input_mask, token_type_ids = [], [], [] 142 | for input_dict in inputs: 143 | input_ids.append(input_dict["input_ids"]) 144 | input_mask.append(input_dict["input_mask"]) 145 | token_type_ids.append(input_dict["token_type_ids"]) 146 | joblib.dump( 147 | [np.stack(input_ids), np.stack(input_mask), 148 | np.stack(token_type_ids), labels], 149 | output_filepath) 150 | 151 | 152 | def main( 153 | input_path: str = "data/", model_name: str = "roberta-base", 154 | output_path: str = "cache/tfrecords/", n_folds: int = 5 155 | ): 156 | output_path_ = Path(output_path) 157 | output_path_.mkdir(exist_ok=True, parents=True) 158 | (output_path_ / f"tokenizer_{model_name}").mkdir(exist_ok=True) 159 | 160 | df_train = pd.read_csv(input_path + 'train.csv') 161 | tokenizer = AutoTokenizer.from_pretrained(model_name) 162 | tokenizer.save_pretrained(str(output_path_ / f"tokenizer_{model_name}")) 163 | print(tokenizer) 164 | processor = Preprocessor(tokenizer) 165 | labels = df_train.loc[ 166 | :, OUTPUT_COLUMNS 167 | ].values 168 | inputs = df_train.loc[:, INPUT_COLUMNS].values 169 | tmp = [] 170 | for i in tqdm(range(df_train.shape[0]), ncols=100): 171 | tmp.append(processor.process_one_example( 172 | inputs[i, 0], 173 | inputs[i, 1], 174 | inputs[i, 2]) 175 | ) 176 | processed_inputs = np.array(tmp) 177 | print(processed_inputs[0]["input_ids_question"]) 178 | del tmp 179 | 180 | gkf = GroupKFold(n_splits=n_folds).split( 181 | X=df_train.question_body, groups=df_train.question_body) 182 | for fold, (train_idx, valid_idx) in enumerate(gkf): 183 | joblib.dump([train_idx, valid_idx], output_path_ / f"fold_{fold}.jl") 184 | filepath = ( 185 | output_path_ / 186 | f"train-{fold}-{len(train_idx)}-{processor.max_q_len}-{processor.max_a_len}.tfrec" 187 | ) 188 | _write_tfrecords( 189 | processed_inputs[train_idx], labels[train_idx], filepath) 190 | filepath = ( 191 | output_path_ / 192 | f"valid-{fold}-{len(valid_idx)}-{processor.max_q_len}-{processor.max_a_len}.tfrec" 193 | ) 194 | _write_tfrecords( 195 | processed_inputs[valid_idx], labels[valid_idx], filepath) 196 | 197 | 198 | if __name__ == '__main__': 199 | fire.Fire(main) 200 | -------------------------------------------------------------------------------- /tf-helper-bot/tf_helper_bot/callbacks.py: -------------------------------------------------------------------------------- 1 | import socket 2 | from datetime import datetime, timedelta 3 | from time import time 4 | from collections import deque, defaultdict 5 | from typing import Dict, Tuple, List, Optional, Union 6 | from pathlib import Path 7 | 8 | import numpy as np 9 | import tensorflow as tf 10 | try: 11 | import wandb 12 | WANDB = True 13 | except ImportError: 14 | WANDB = False 15 | 16 | from .bot import BaseBot 17 | 18 | __all__ = [ 19 | "Callback", "MovingAverageStatsTrackerCallback", 20 | "CheckpointCallback", "TelegramCallback", "WandbCallback" 21 | ] 22 | 23 | 24 | class Callback: 25 | def on_batch_inputs(self, bot: BaseBot, input_tensors: tf.Tensor, targets: tf.Tensor): 26 | return input_tensors, targets 27 | 28 | def on_train_starts(self, bot: BaseBot): 29 | return 30 | 31 | def on_train_ends(self, bot: BaseBot): 32 | return 33 | 34 | def on_epoch_ends(self, bot: BaseBot, epoch: int): 35 | return 36 | 37 | def on_eval_ends(self, bot: BaseBot, metrics: Dict[str, Tuple[float, str]]): 38 | return 39 | 40 | def on_step_ends(self, bot: BaseBot, train_loss: 
41 |         return
42 | 
43 |     def on_load_checkpoint(self, **kwargs):
44 |         return
45 | 
46 |     def on_save_checkpoint(self):
47 |         return
48 | 
49 |     def reset(self):
50 |         return
51 | 
52 | 
53 | class MovingAverageStatsTrackerCallback(Callback):
54 |     """Log moving average of training losses, and report evaluation metrics.
55 |     """
56 | 
57 |     def __init__(self, avg_window: int, log_interval: int):
58 |         super().__init__()
59 |         self.avg_window = avg_window
60 |         self.log_interval = log_interval
61 |         self.reset()
62 |         self.timer: float = 0.0
63 | 
64 |     def on_train_starts(self, bot: BaseBot):
65 |         self.timer = time()
66 | 
67 |     def on_step_ends(self, bot: BaseBot, train_loss, train_weight):
68 |         self.train_losses.append(train_loss)
69 |         self.train_weights.append(train_weight)
70 |         if bot.step % self.log_interval == 0:
71 |             # moving average over the last `avg_window` steps, weighted by batch size
72 |             train_loss_avg = np.average(
73 |                 self.train_losses, weights=self.train_weights, axis=0)
74 |             lr = (
75 |                 bot.optimizer.lr(bot.step) if callable(bot.optimizer.lr)
76 |                 else bot.optimizer.lr
77 |             )
78 |             if not isinstance(lr, float):
79 |                 lr = lr.numpy()
80 |             speed = (time() - self.timer) / self.log_interval
81 |             # reset timer
82 |             self.timer = time()
83 |             bot.logger.info(
84 |                 f"Step %5d | loss {bot.loss_format} | lr %.2e | %.3fs per step",
85 |                 bot.step, train_loss_avg, lr, speed)
86 |             self.train_logs.append(train_loss_avg)
87 | 
88 |     def on_eval_ends(self, bot: BaseBot, metrics: Dict[str, Tuple[float, str]]):
89 |         self.metrics["step"].append(bot.step)
90 |         history_length = len(self.metrics["step"])
91 |         bot.logger.info(f"Metrics at step {bot.step}:")
92 |         for metric_name, (metric_value, metric_string) in metrics.items():
93 |             self.metrics[metric_name].append((metric_value, metric_string))
94 |             assert history_length == len(
95 |                 self.metrics[metric_name]), "Inconsistent metric found!"
96 |             bot.logger.info(f"{metric_name}: {metric_string}")
97 | 
98 |     def on_train_ends(self, bot: BaseBot):
99 |         if self.metrics["step"]:
100 |             bot.logger.info("Training finished. Best step(s):")
101 |             for metric_name, metric_values in self.metrics.items():
102 |                 if metric_name == "step":
103 |                     continue
104 |                 best_idx = np.argmin(
105 |                     np.array([x[0] for x in metric_values]))
106 |                 bot.logger.info(
107 |                     "%s: %s @ step %d",
108 |                     metric_name, metric_values[best_idx][1],
109 |                     self.metrics["step"][best_idx]
110 |                 )
111 | 
112 |     def reset(self):
113 |         self.train_losses = deque(maxlen=self.avg_window)
114 |         self.train_weights = deque(maxlen=self.avg_window)
115 |         self.metrics = defaultdict(list)
116 |         self.train_logs = []
117 | 
118 | 
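# [Editor's note] Typical wiring, mirroring quest/train.py earlier in this
# repo (the values are illustrative only):
#
#     callbacks = [
#         MovingAverageStatsTrackerCallback(avg_window=200, log_interval=100),
#         checkpoints,  # the CheckpointCallback instance defined below
#     ]
#
# The tracker reports the weighted moving average of the training loss every
# `log_interval` steps and keeps a per-metric history for the end-of-training
# summary; note that "best" is taken via argmin, i.e. lower is better.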
119 | class CheckpointCallback(Callback):
120 |     """Save and manage checkpoints.
121 | 
122 |     TODO: Checkpoints that can be used to resume training
123 |     """
124 | 
125 |     def __init__(
126 |             self, keep_n_checkpoints: int = 1,
127 |             checkpoint_dir: Union[Path, str] = Path("./data/cache/model_cache/"),
128 |             monitor_metric: str = "loss"):
129 |         super().__init__()
130 |         assert keep_n_checkpoints > 0
131 |         self.keep_n_checkpoints = keep_n_checkpoints
132 |         self.checkpoint_dir = Path(checkpoint_dir)
133 |         self.monitor_metric = monitor_metric
134 |         self.best_performers: List[Tuple[float, Path, int]] = []
135 |         self.checkpoint_dir.mkdir(exist_ok=True, parents=True)
136 | 
137 |     def on_eval_ends(self, bot: BaseBot, metrics: Dict[str, Tuple[float, str]]):
138 |         target_value, target_string = metrics[self.monitor_metric]
139 |         target_path = (
140 |             self.checkpoint_dir /
141 |             "ckpt_{}_{}_{}_{}.h5".format(
142 |                 bot.name, target_string, bot.step,
143 |                 datetime.now().strftime("%m%d%H%M"))
144 |         )
145 |         bot.logger.debug("Saving checkpoint %s...", target_path)
146 |         if (
147 |             len(self.best_performers) < self.keep_n_checkpoints or
148 |             target_value < self.best_performers[-1][0]
149 |         ):
150 |             self.best_performers.append((target_value, target_path, bot.step))
151 |             self.remove_checkpoints(keep=self.keep_n_checkpoints)
152 |             bot.model.save_weights(str(target_path))
153 |             assert target_path.exists()
154 | 
155 |     def remove_checkpoints(self, keep):
156 |         self.best_performers = sorted(self.best_performers, key=lambda x: x[0])
157 |         for checkpoint in np.unique([
158 |                 x[1] for x in self.best_performers[keep:]]):
159 |             Path(checkpoint).unlink()
160 |         self.best_performers = self.best_performers[:keep]
161 | 
162 |     def reset(self, ignore_previous=False):
163 |         if ignore_previous:
164 |             self.best_performers = []
165 |         else:
166 |             self.remove_checkpoints(0)
167 | 
168 | 
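# [Editor's sketch] How a training script consumes this callback once
# training finishes; this mirrors the tail of quest/train.py shown earlier
# (the `bot`/`checkpoints` arguments are the caller's, values illustrative):
def _demo_checkpoint_flow(bot, checkpoints, output_path="cache/model"):
    bot.train(checkpoint_interval=1000, n_steps=10000)
    # best_performers holds (metric_value, path, step) tuples, kept sorted so
    # the lowest (best) monitored value comes first
    best_score, best_path, _ = checkpoints.best_performers[0]
    bot.model.load_weights(str(best_path))
    checkpoints.remove_checkpoints(keep=0)  # clean up intermediate files
    bot.model.save_weights(output_path + ".h5")
    return best_score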
169 | class TelegramCallback(Callback):
170 |     """A Telegram notification callback
171 | 
172 |     Reference: https://github.com/huggingface/knockknock
173 |     """
174 |     DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
175 | 
176 |     def __init__(self, token: str, chat_id: int, name: str, report_evals: bool = False):
177 |         try:
178 |             import telegram
179 |         except ImportError:
180 |             raise ImportError(
181 |                 "Please install 'python-telegram-bot' before using TelegramCallback.")
182 |         self.telegram_bot = telegram.Bot(token=token)
183 |         self.host_name = socket.gethostname()
184 |         self.report_evals = report_evals
185 |         self.chat_id = chat_id
186 |         self.name = name
187 |         self.start_time = datetime.now()
188 | 
189 |     def on_train_starts(self, bot: BaseBot):
190 |         self.start_time = datetime.now()
191 |         contents = [
192 |             f'{self.name} has started training 🎬',
193 |             'Machine name: %s' % self.host_name,
194 |             'Starting date: %s' % self.start_time.strftime(
195 |                 TelegramCallback.DATE_FORMAT)
196 |         ]
197 |         text = '\n'.join(contents)
198 |         self.telegram_bot.send_message(chat_id=self.chat_id, text=text)
199 | 
200 |     def on_train_ends(self, bot: BaseBot):
201 |         end_time = datetime.now()
202 |         elapsed_time = end_time - self.start_time
203 |         contents = [
204 |             f'{self.name} has finished training 🎉',
205 |             'Machine name: %s' % self.host_name,
206 |             'Starting date: %s' % self.start_time.strftime(
207 |                 TelegramCallback.DATE_FORMAT),
208 |             'End date: %s' % end_time.strftime(
209 |                 TelegramCallback.DATE_FORMAT),
210 |             'Training duration: %s' % str(elapsed_time)
211 |         ]
212 |         text = '\n'.join(contents)
213 |         self.telegram_bot.send_message(chat_id=self.chat_id, text=text)
214 | 
215 |     def on_eval_ends(self, bot: BaseBot, metrics: Dict[str, Tuple[float, str]]):
216 |         if not self.report_evals:
217 |             return
218 |         contents = [
219 |             f"Metrics from {self.name} at step {bot.step}:"
220 |         ]
221 |         contents += [
222 |             f"{metric_name}: {metric_string}"
223 |             for metric_name, (metric_value, metric_string) in metrics.items()
224 |         ]
225 |         text = '\n'.join(contents)
226 |         self.telegram_bot.send_message(chat_id=self.chat_id, text=text)
227 | 
228 | 
229 | class WandbCallback(Callback):
230 |     """ Callback for the Weights and Biases service
231 | 
232 |     Prerequisites: install `wandb` and run `wandb login`.
233 | 
234 |     Note: train a few more steps after the last eval to make sure the log is complete.
235 | 
236 |     WARNING: Resuming is not fully supported yet.
237 | 
238 |     Reference: https://github.com/wandb/client/raw/ef0911c47beebab0db8749d764802057d3480e69/wandb/fastai/__init__.py
239 |     """
240 | 
241 |     def __init__(self, config: Dict, name: str):
242 |         if not WANDB:
243 |             raise ImportError(
244 |                 "Please install 'wandb' before using WandbCallback.")
245 |         # project name can only be in lower case
246 |         wandb.init(config=config, project=name.lower())
247 | 
248 |     def on_step_ends(self, bot: BaseBot, train_loss: float, train_weight: int):
249 |         wandb.log({"train_loss": train_loss}, step=bot.step)
250 | 
251 |     def on_eval_ends(self, bot: BaseBot, metrics: Dict[str, Tuple[float, str]]):
252 |         metrics_ = {
253 |             metric_name: metric_value
254 |             for metric_name, (metric_value, _) in metrics.items()
255 |         }
256 |         # Rename to avoid conflicts
257 |         metrics_["val_loss"] = metrics_["loss"]
258 |         del metrics_["loss"]
259 |         # NOTE: remember to train one more step to sync the final eval metrics to the server
260 |         wandb.log(metrics_, step=bot.step)
261 | 
--------------------------------------------------------------------------------
/tf-helper-bot/tf_helper_bot/optimizers.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow.python.keras.optimizer_v2.optimizer_v2 import OptimizerV2
3 | from tensorflow.python import ops, math_ops, state_ops, control_flow_ops
4 | from tensorflow.python.keras import backend as K
5 | 
6 | __all__ = ['RAdam']
7 | 
8 | 
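# [Editor's sketch] Constructing the optimizer below with its built-in warmup
# plus linear decay enabled (hypothetical hyper-parameters; leaving
# total_steps at 0 disables the schedule and uses the plain learning rate):
def _demo_radam():
    return RAdam(
        learning_rate=3e-5, weight_decay=0.01,
        total_steps=10000, warmup_proportion=0.1, min_lr=1e-6)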
9 | class RAdam(OptimizerV2):
10 |     """RAdam optimizer.
11 | 
12 |     According to the paper
13 |     [On The Variance Of The Adaptive Learning Rate And Beyond](https://arxiv.org/pdf/1908.03265v1.pdf).
14 |     """
15 | 
16 |     def __init__(self,
17 |                  learning_rate=0.001,
18 |                  beta_1=0.9,
19 |                  beta_2=0.999,
20 |                  epsilon=1e-7,
21 |                  weight_decay=0.,
22 |                  amsgrad=False,
23 |                  total_steps=0,
24 |                  warmup_proportion=0.1,
25 |                  min_lr=0.,
26 |                  name='RAdam',
27 |                  **kwargs):
28 |         r"""Construct a new RAdam optimizer.
29 | 
30 |         Args:
31 |             learning_rate: A Tensor or a floating point value. The learning rate.
32 |             beta_1: A float value or a constant float tensor. The exponential decay
33 |                 rate for the 1st moment estimates.
34 |             beta_2: A float value or a constant float tensor. The exponential decay
35 |                 rate for the 2nd moment estimates.
36 |             epsilon: A small constant for numerical stability. This epsilon is
37 |                 "epsilon hat" in the Kingma and Ba paper (in the formula just before
38 |                 Section 2.1), not the epsilon in Algorithm 1 of the paper.
39 |             weight_decay: A floating point value. Weight decay for each param.
40 |             amsgrad: boolean. Whether to apply AMSGrad variant of this algorithm from
41 |                 the paper "On the Convergence of Adam and beyond".
42 |             total_steps: An integer. Total number of training steps.
43 |                 Enable warmup by setting a positive value.
44 |             warmup_proportion: A floating point value. The proportion of increasing steps.
45 |             min_lr: A floating point value. Minimum learning rate after warmup.
46 |             name: Optional name for the operations created when applying gradients.
47 |                 Defaults to "RAdam". @compatibility(eager) When eager execution is
48 |                 enabled, `learning_rate`, `beta_1`, `beta_2`, and `epsilon` can each be
49 |                 a callable that takes no arguments and returns the actual value to use.
50 |                 This can be useful for changing these values across different
51 |                 invocations of optimizer functions. @end_compatibility
52 |             **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
53 |                 `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
54 |                 gradients by value, `decay` is included for backward compatibility to
55 |                 allow time inverse decay of learning rate. `lr` is included for backward
56 |                 compatibility, recommended to use `learning_rate` instead.
57 |         """
58 | 
59 |         super(RAdam, self).__init__(name, **kwargs)
60 |         self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
61 |         self._set_hyper('beta_1', beta_1)
62 |         self._set_hyper('beta_2', beta_2)
63 |         self._set_hyper('decay', self._initial_decay)
64 |         self._set_hyper('weight_decay', weight_decay)
65 |         self._set_hyper('total_steps', float(total_steps))
66 |         self._set_hyper('warmup_proportion', warmup_proportion)
67 |         self._set_hyper('min_lr', min_lr)
68 |         self.epsilon = epsilon or K.epsilon()
69 |         self.amsgrad = amsgrad
70 |         self._initial_weight_decay = weight_decay
71 |         self._initial_total_steps = total_steps
72 | 
73 |     def _create_slots(self, var_list):
74 |         for var in var_list:
75 |             self.add_slot(var, 'm')
76 |         for var in var_list:
77 |             self.add_slot(var, 'v')
78 |         if self.amsgrad:
79 |             for var in var_list:
80 |                 self.add_slot(var, 'vhat')
81 | 
82 |     def set_weights(self, weights):
83 |         params = self.weights
84 |         num_vars = int((len(params) - 1) / 2)
85 |         if len(weights) == 3 * num_vars + 1:
86 |             weights = weights[:len(params)]
87 |         super(RAdam, self).set_weights(weights)
88 | 
89 |     def _resource_apply_dense(self, grad, var):
90 |         var_dtype = var.dtype.base_dtype
91 |         lr_t = self._decayed_lr(var_dtype)
92 |         m = self.get_slot(var, 'm')
93 |         v = self.get_slot(var, 'v')
94 |         beta_1_t = self._get_hyper('beta_1', var_dtype)
95 |         beta_2_t = self._get_hyper('beta_2', var_dtype)
96 |         epsilon_t = ops.convert_to_tensor(self.epsilon, var_dtype)
97 |         local_step = math_ops.cast(self.iterations + 1, var_dtype)
98 |         beta_1_power = math_ops.pow(beta_1_t, local_step)
99 |         beta_2_power = math_ops.pow(beta_2_t, local_step)
100 | 
101 |         if self._initial_total_steps > 0:
102 |             total_steps = self._get_hyper('total_steps', var_dtype)
103 |             warmup_steps = total_steps * \
104 |                 self._get_hyper('warmup_proportion', var_dtype)
105 |             min_lr = self._get_hyper('min_lr', var_dtype)
106 |             decay_steps = K.maximum(total_steps - warmup_steps, 1)
107 |             decay_rate = (min_lr - lr_t) / decay_steps
108 |             lr_t = tf.where(
109 |                 local_step <= warmup_steps,
110 |                 lr_t * (local_step / warmup_steps),
111 |                 lr_t + decay_rate *
112 |                 K.minimum(local_step - warmup_steps, decay_steps),
113 |             )
114 | 
115 |         sma_inf = 2.0 / (1.0 - beta_2_t) - 1.0
116 |         sma_t = sma_inf - 2.0 * local_step * \
117 |             beta_2_power / (1.0 - beta_2_power)
118 | 
119 |         m_t = state_ops.assign(m,
120 |                                beta_1_t * m + (1.0 - beta_1_t) * grad,
121 |                                use_locking=self._use_locking)
122 |         m_corr_t = m_t / (1.0 - beta_1_power)
123 | 
124 |         v_t = state_ops.assign(v,
125 |                                beta_2_t * v + (1.0 - beta_2_t) *
126 |                                math_ops.square(grad),
127 |                                use_locking=self._use_locking)
128 |         if self.amsgrad:
129 |             vhat = self.get_slot(var, 'vhat')
130 |             vhat_t = state_ops.assign(vhat,
131 |                                       math_ops.maximum(vhat, v_t),
132 |                                       use_locking=self._use_locking)
133 |             v_corr_t = math_ops.sqrt(vhat_t / (1.0 - beta_2_power))
134 |         else:
135 |             vhat_t = None
136 |             v_corr_t = math_ops.sqrt(v_t / (1.0 - beta_2_power))
137 | 
138 |         r_t = math_ops.sqrt((sma_t - 4.0) / (sma_inf - 4.0) *
139 |                             (sma_t - 2.0) / (sma_inf - 2.0) *
140 |                             sma_inf / sma_t)
141 | 
142 |         var_t = tf.where(sma_t >= 5.0, r_t * m_corr_t /
143 |                          (v_corr_t + epsilon_t), m_corr_t)
144 | 
145 |         if self._initial_weight_decay > 0.0:
146 |             var_t += self._get_hyper('weight_decay', var_dtype) * var
147 | 
148 |         var_update = state_ops.assign_sub(var,
149 |                                           lr_t * var_t,
150 |                                           use_locking=self._use_locking)
151 | 
152 |         updates = [var_update, m_t, v_t]
153 |         if self.amsgrad:
154 |             updates.append(vhat_t)
155 |         return control_flow_ops.group(*updates)
156 | 
157 |     def _resource_apply_sparse(self, grad, var, indices):
158 |         var_dtype = var.dtype.base_dtype
159 |         lr_t = self._decayed_lr(var_dtype)
160 |         beta_1_t = self._get_hyper('beta_1', var_dtype)
161 |         beta_2_t = self._get_hyper('beta_2', var_dtype)
162 |         epsilon_t = ops.convert_to_tensor(self.epsilon, var_dtype)
163 |         local_step = math_ops.cast(self.iterations + 1, var_dtype)
164 |         beta_1_power = math_ops.pow(beta_1_t, local_step)
165 |         beta_2_power = math_ops.pow(beta_2_t, local_step)
166 | 
167 |         if self._initial_total_steps > 0:
168 |             total_steps = self._get_hyper('total_steps', var_dtype)
169 |             warmup_steps = total_steps * \
170 |                 self._get_hyper('warmup_proportion', var_dtype)
171 |             min_lr = self._get_hyper('min_lr', var_dtype)
172 |             decay_steps = K.maximum(total_steps - warmup_steps, 1)
173 |             decay_rate = (min_lr - lr_t) / decay_steps
174 |             lr_t = tf.where(
175 |                 local_step <= warmup_steps,
176 |                 lr_t * (local_step / warmup_steps),
177 |                 lr_t + decay_rate *
178 |                 K.minimum(local_step - warmup_steps, decay_steps),
179 |             )
180 | 
181 |         sma_inf = 2.0 / (1.0 - beta_2_t) - 1.0
182 |         sma_t = sma_inf - 2.0 * local_step * \
183 |             beta_2_power / (1.0 - beta_2_power)
184 | 
185 |         m = self.get_slot(var, 'm')
186 |         m_scaled_g_values = grad * (1 - beta_1_t)
187 |         m_t = state_ops.assign(m, m * beta_1_t, use_locking=self._use_locking)
188 |         with ops.control_dependencies([m_t]):
189 |             m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)
190 |         m_corr_t = m_t / (1.0 - beta_1_power)
191 | 
192 |         v = self.get_slot(var, 'v')
193 |         v_scaled_g_values = (grad * grad) * (1 - beta_2_t)
194 |         v_t = state_ops.assign(v, v * beta_2_t, use_locking=self._use_locking)
195 |         with ops.control_dependencies([v_t]):
196 |             v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)
197 | 
198 |         if self.amsgrad:
199 |             vhat = self.get_slot(var, 'vhat')
200 |             vhat_t = state_ops.assign(vhat,
201 |                                       math_ops.maximum(vhat, v_t),
202 |                                       use_locking=self._use_locking)
203 |             v_corr_t = math_ops.sqrt(vhat_t / (1.0 - beta_2_power))
204 |         else:
205 |             vhat_t = None
206 |             v_corr_t = math_ops.sqrt(v_t / (1.0 - beta_2_power))
207 | 
208 |         r_t = math_ops.sqrt((sma_t - 4.0) / (sma_inf - 4.0) *
209 |                             (sma_t - 2.0) / (sma_inf - 2.0) *
210 |                             sma_inf / sma_t)
211 | 
212 |         var_t = tf.where(sma_t >= 5.0, r_t * m_corr_t /
213 |                          (v_corr_t + epsilon_t), m_corr_t)
214 | 
215 |         if self._initial_weight_decay > 0.0:
216 |             var_t += self._get_hyper('weight_decay', var_dtype) * var
217 | 
218 |         var_update = self._resource_scatter_add(
219 |             var, indices, tf.gather(-lr_t * var_t, indices))
220 | 
221 |         updates = [var_update, m_t, v_t]
222 |         if self.amsgrad:
223 |             updates.append(vhat_t)
224 |         return control_flow_ops.group(*updates)
225 | 
226 |     def get_config(self):
227 |         config = super(RAdam, self).get_config()
228 |         config.update({
229 |             'learning_rate': self._serialize_hyperparameter('learning_rate'),
230 |             'beta_1': self._serialize_hyperparameter('beta_1'),
231 |             'beta_2': self._serialize_hyperparameter('beta_2'),
232 |             'decay': self._serialize_hyperparameter('decay'),
233 |             'weight_decay': self._serialize_hyperparameter('weight_decay'),
234 |             'epsilon': self.epsilon,
235 |             'amsgrad': self.amsgrad,
236 |             'total_steps': self._serialize_hyperparameter('total_steps'),
237 |             'warmup_proportion': self._serialize_hyperparameter('warmup_proportion'),
238 |             'min_lr': self._serialize_hyperparameter('min_lr'),
239 |         })
240 |         return config
241 | 
--------------------------------------------------------------------------------
/tf-helper-bot/tf_helper_bot/bot.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from pathlib import Path
3 | from typing import Callable, Sequence, Union, Optional
4 | 
5 | import numpy as np
6 | import tensorflow as tf
7 | from dataclasses import dataclass
8 | from tqdm.autonotebook import tqdm
9 | 
10 | from .logger import Logger
11 | 
12 | 
13 | @dataclass
14 | class BaseBot:
15 |     """Base Interface to Model Training and Inference"""
16 |     train_dataset: tf.data.Dataset
17 |     valid_dataset: tf.data.Dataset
18 |     steps_per_epoch: int
19 |     criterion: Callable
20 |     model: tf.keras.Model
21 |     optimizer: tf.keras.optimizers.Optimizer
22 |     name: str = "basebot"
23 |     log_dir: Union[Path, str] = "./logs"
24 |     log_level: int = logging.INFO
25 |     loss_format: str = "%.4f"
26 |     echo: bool = True
27 |     pbar: bool = True
28 |     step: int = 0
29 |     total_steps: int = 0
30 |     valid_steps: Optional[int] = None
31 |     gradient_accumulation_steps: int = 1
32 |     metrics: Sequence = ()
33 |     callbacks: Sequence = ()
34 |     mixed_precision: bool = False
35 | 
36 |     def __post_init__(self):
37 |         self._gradients = []
38 |         self.logger = Logger(
39 |             self.name, Path(self.log_dir), self.log_level,
40 |             echo=self.echo
41 |         )
42 | 
43 |         @tf.function
44 |         def get_gradient(input_tensors, target):
45 |             with tf.GradientTape() as tape:
46 |                 output = self.model(
47 |                     input_tensors, training=True)
48 |                 loss_raw = self.criterion(
49 |                     target, self._extract_prediction(output)
50 |                 )
51 |                 loss_ = (
52 |                     self.optimizer.get_scaled_loss(loss_raw)
53 |                     if self.mixed_precision else loss_raw
54 |                 )
55 |             gradients_ = tape.gradient(
56 |                 loss_, self.model.trainable_variables)
57 |             if self.mixed_precision:
58 |                 gradients_ = self.optimizer.get_unscaled_gradients(gradients_)
59 |             return loss_raw, gradients_
60 | 
61 |         @tf.function
62 |         def step_optimizer(gradients):
63 |             self.optimizer.apply_gradients(
64 |                 zip(
65 |                     gradients,
66 |                     self.model.trainable_variables
67 |                 )
68 |             )
69 | 
70 |         @tf.function
71 |         def predict_batch(input_tensors):
72 |             return self.model(input_tensors, training=False)
73 | 
74 |         self._get_gradient = get_gradient
75 |         self._step_optimizer = step_optimizer
76 |         self._predict_batch = predict_batch
77 | 
78 |     @staticmethod
79 |     def _sum_indexed_slice(grad_1, grad_2, div_):
80 |         values = tf.concat([grad_1.values, grad_2.values / div_], 0)
81 |         indices = tf.concat([grad_1.indices, grad_2.indices], 0)
82 |         return tf.IndexedSlices(values, indices)
83 | 
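    # [Editor's note] Gradient accumulation in train_one_step below averages
    # micro-batch gradients: each of the `gradient_accumulation_steps`
    # gradients is divided by that count before summing, e.g. with 2 steps
    # and raw gradients g1, g2 the applied update uses g1/2 + g2/2, which
    # equals (g1 + g2)/2. `_sum_indexed_slice` (above) handles the sparse
    # IndexedSlices case, where the division is applied to the slice values
    # before concatenation.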
84 |     def train_one_step(self, input_tensor_list, target):
85 |         loss, gradients = self._get_gradient(
86 |             input_tensor_list[0], target)
87 |         if self.gradient_accumulation_steps > 1:
88 |             div_ = tf.constant(
89 |                 self.gradient_accumulation_steps,
90 |                 dtype=tf.float32
91 |             )
92 |             gradients = [x / div_ for x in gradients]
93 |             # NOTE: a redundant second call to self._get_gradient() used to
94 |             # sit here; it overwrote the scaled gradients from the line above
95 |             for i in range(1, self.gradient_accumulation_steps):
96 |                 loss_, gradients_ = self._get_gradient(
97 |                     input_tensor_list[i], target)
98 |                 gradients = [
99 |                     grad_1 + grad_2 / div_ if not isinstance(grad_1, tf.IndexedSlices)
100 |                     else self._sum_indexed_slice(grad_1, grad_2, div_)
101 |                     for grad_1, grad_2 in zip(gradients, gradients_)
102 |                 ]
103 |                 loss = loss + loss_
104 |             loss = loss / tf.constant(
105 |                 self.gradient_accumulation_steps,
106 |                 dtype=tf.float32
107 |             )
108 |         self._step_optimizer(gradients)
109 |         return loss
110 | 
111 |     @staticmethod
112 |     def _extract_prediction(output):
113 |         """Can be overridden to act as a shortcut to transform model outputs.
114 | 
115 |         Useful when using a pretrained model whose outputs are not in the desired format.
116 |         """
117 |         return output
118 | 
119 |     def train(self, *, checkpoint_interval, n_steps=None, total_steps=None):
120 |         if total_steps:
121 |             self.total_steps = total_steps
122 |         if n_steps is None:
123 |             if self.total_steps is None:
124 |                 raise ValueError("n_steps and total_steps cannot both be None")
125 |             n_steps = self.total_steps - self.step
126 |         elif self.total_steps is None:
127 |             self.total_steps = n_steps
128 |         target_step = self.step + n_steps
129 |         input_tensor_list, cnt = [], 0
130 |         # Train starts
131 |         self.run_train_starts_callbacks()
132 |         try:
133 |             while self.step < target_step:
134 |                 for input_tensors, targets in self.train_dataset:
135 |                     self.step += 1
136 |                     input_tensors, targets = self.run_batch_inputs_callbacks(
137 |                         input_tensors, targets)
138 |                     input_tensor_list.append(input_tensors)
139 |                     cnt += self.get_batch_size(input_tensors)
140 |                     if len(input_tensor_list) == self.gradient_accumulation_steps:
141 |                         loss = self.train_one_step(
142 |                             input_tensor_list, targets
143 |                         )
144 |                         # Step ends
145 |                         self.run_step_ends_callbacks(loss.numpy(), cnt)
146 |                         input_tensor_list, cnt = [], 0
147 |                     if (
148 |                         (callable(checkpoint_interval) and checkpoint_interval(self.step)) or
149 |                         (
150 |                             not callable(checkpoint_interval) and
151 |                             self.step % checkpoint_interval == 0
152 |                         )
153 |                     ):
154 |                         # Eval starts
155 |                         metrics = self.eval(self.valid_dataset)
156 |                         # Eval ends
157 |                         self.run_eval_ends_callbacks(metrics)
158 |                     if self.step >= target_step:
159 |                         break
160 |                     # Epoch ends
161 |                     if self.step % self.steps_per_epoch == 0:
162 |                         self.run_epoch_ends_callbacks(
163 |                             self.step // self.steps_per_epoch)
164 |         except KeyboardInterrupt:
165 |             pass
166 |         finally:
167 |             # Train ends
168 |             self.run_train_ends_callbacks()
169 | 
170 |     def predict_batch(self, input_tensors):
171 |         """To be overridden in distributed modes"""
172 |         return self._extract_prediction(
173 |             self._predict_batch(input_tensors)
174 |         )
175 | 
176 |     def _extract_target_for_eval(self, target):
177 |         return target
178 | 
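    # [Editor's note] Typical inference usage (a sketch; names illustrative):
    #
    #     preds = bot.predict(valid_dataset)
    #     preds, ys = bot.predict(valid_dataset, return_y=True)
    #
    # Models whose call() returns a tuple (e.g. transformers-style
    # (logits, ...) outputs) can be adapted by overriding _extract_prediction
    # in a subclass:
    #
    #     class TupleOutputBot(BaseBot):
    #         @staticmethod
    #         def _extract_prediction(output):
    #             return output[0]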
179 |     def predict(self, dataset, *, return_y=False):
180 |         # NOTE: removed a stray `self.model.eval()` here (a PyTorch idiom); predict_batch already runs the model with training=False
181 |         outputs, y_global = [], []
182 |         for *input_tensors, y_local in tqdm(dataset, disable=not self.pbar):
183 |             outputs.append(self.predict_batch(input_tensors).numpy())
184 |             if return_y:
185 |                 y_global.append(
186 |                     self._extract_target_for_eval(y_local).numpy())
187 |         outputs = np.concatenate(outputs, axis=0)
188 |         if return_y:
189 |             y_global = np.concatenate(y_global, axis=0)
190 |             return outputs, y_global
191 |         return outputs
192 | 
193 |     def eval(self, dataset):
194 |         """Warning: Only support datasets whose predictions and labels together fit in memory."""
195 |         preds, ys = [], []
196 |         losses, weights = [], []
197 |         self.logger.debug("Evaluating...")
198 |         for input_tensors, y_local in tqdm(dataset, disable=not self.pbar, total=self.valid_steps, ncols=100):
199 |             output = self.predict_batch(input_tensors)
200 |             y_local = self._extract_target_for_eval(y_local)
201 |             batch_loss = self.criterion(y_local, output)
202 |             losses.append(batch_loss.numpy())
203 |             weights.append(y_local.shape[0])
204 |             # Save batch labels and predictions
205 |             preds.append(output.numpy())
206 |             ys.append(y_local.numpy())
207 |         loss = np.average(losses, weights=weights)
208 |         metrics = {"loss": (loss, self.loss_format % loss)}
209 |         global_ys, global_preds = np.concatenate(ys), np.concatenate(preds)
210 |         for metric in self.metrics:
211 |             metric_loss, metric_string = metric(global_ys, global_preds)
212 |             metrics[metric.name] = (metric_loss, metric_string)
213 |         return metrics
214 | 
215 |     def get_batch_size(self, input_tensors):
216 |         if isinstance(input_tensors, list):
217 |             return self.get_batch_size(input_tensors[0])
218 |         elif isinstance(input_tensors, dict):
219 |             return self.get_batch_size(list(input_tensors.values())[0])
220 |         return input_tensors.shape[0]
221 | 
222 |     def run_batch_inputs_callbacks(self, input_tensors, targets):
223 |         for callback in self.callbacks:
224 |             input_tensors, targets = callback.on_batch_inputs(
225 |                 self, input_tensors, targets)
226 |         return input_tensors, targets
227 | 
228 |     def run_step_ends_callbacks(self, train_loss, train_weight):
229 |         for callback in self.callbacks:
230 |             callback.on_step_ends(self, train_loss, train_weight)
231 | 
232 |     def run_train_starts_callbacks(self):
233 |         for callback in self.callbacks:
234 |             callback.on_train_starts(self)
235 | 
236 |     def run_train_ends_callbacks(self):
237 |         for callback in self.callbacks:
238 |             callback.on_train_ends(self)
239 | 
240 |     def run_epoch_ends_callbacks(self, epoch):
241 |         for callback in self.callbacks:
242 |             callback.on_epoch_ends(self, epoch)
243 | 
244 |     def run_eval_ends_callbacks(self, metrics):
245 |         for callback in self.callbacks:
246 |             callback.on_eval_ends(self, metrics)
247 | 
248 | 
249 | @dataclass
250 | class BaseDistributedBot(BaseBot):
251 |     """Adds tf.distribute (e.g. TPU) support on top of BaseBot"""
252 |     strategy: tf.distribute.Strategy = None
253 | 
254 |     def __post_init__(self):
255 |         assert self.strategy is not None
256 |         assert self.gradient_accumulation_steps == 1, (
257 |             "Distribution mode doesn't support gradient accumulation"
258 |         )
259 |         super().__post_init__()
260 |         @tf.function
261 |         def train_one_step(input_tensor_list, target):
262 |             loss, gradients = self._get_gradient(
263 |                 input_tensor_list[0], target)
264 |             self._step_optimizer(gradients)
265 |             return loss
266 | 
267 |         self._train_one_step = train_one_step
268 | 
269 |     def train_one_step(self, input_tensors, target):
270 |         loss = self.strategy.experimental_run_v2(
271 |             self._train_one_step,
272 |             args=(input_tensors, target)
273 |         )
274 |         return self.strategy.reduce(
275 |             tf.distribute.ReduceOp.MEAN, loss, axis=None
276 |         )
277 | 
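    # [Editor's note] `Strategy.experimental_run_v2` matches the TF 2.1-era
    # API targeted by requirements.txt; later TF 2.x releases renamed it to
    # `Strategy.run`, so these call sites need updating when upgrading.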
278 |     def get_batch_size(self, input_tensors):
279 |         # Just use a rough estimate for speed
280 |         return 1
281 |         # the following can be slow (and unnecessary in most cases)
282 |         # if isinstance(input_tensors, list):
283 |         #     x_per_gpu_as_list = self.strategy.experimental_local_results(
284 |         #         input_tensors[0])
285 |         # else:
286 |         #     x_per_gpu_as_list = self.strategy.experimental_local_results(
287 |         #         input_tensors)
288 |         # batch_sizes = [tf.shape(x_gpu)[0] for x_gpu in x_per_gpu_as_list]
289 |         # return tf.reduce_sum(tf.stack(batch_sizes)).numpy()
290 | 
291 |     def _extract_target_for_eval(self, target):
292 |         return tf.concat(
293 |             self.strategy.experimental_local_results(target),
294 |             axis=0
295 |         )
296 | 
297 |     def predict_batch(self, input_tensors):
298 |         preds = self.strategy.experimental_run_v2(
299 |             self._predict_batch,
300 |             args=(input_tensors,)
301 |         )
302 |         if isinstance(preds, tuple):
303 |             # WARNING: This might not be applicable in all situations
304 |             preds = preds[0]
305 |         preds_local = tf.concat(
306 |             preds.values, axis=0
307 |         )
308 |         return preds_local
309 | 
--------------------------------------------------------------------------------