├── cache
│   └── .keepme
├── data
│   └── .keepme
├── quest
│   ├── __init__.py
│   ├── metrics.py
│   ├── post_processing.py
│   ├── train_folds.py
│   ├── dataset.py
│   ├── eval_tpu.py
│   ├── eval.py
│   ├── inference.py
│   ├── models.py
│   ├── prepare_reference_data.py
│   ├── train.py
│   └── prepare_tfrecords.py
├── imgs
│   └── submission.png
├── requirements.txt
├── tf-helper-bot
│   ├── README.md
│   ├── tf_helper_bot
│   │   ├── __init__.py
│   │   ├── utils.py
│   │   ├── logger.py
│   │   ├── mixup.py
│   │   ├── metrics.py
│   │   ├── lr_schedulers.py
│   │   ├── callbacks.py
│   │   ├── optimizers.py
│   │   └── bot.py
│   ├── .gitignore
│   ├── setup.py
│   └── LICENSE
├── .gitignore
├── setup.py
├── LICENSE
└── README.md
/cache/.keepme:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/data/.keepme:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/quest/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/imgs/submission.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ceshine/kaggle-quest/master/imgs/submission.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | fire
2 | joblib
3 | transformers==2.3.0
4 | python-telegram-bot
5 | tensorflow>=2.1.0
--------------------------------------------------------------------------------
/tf-helper-bot/README.md:
--------------------------------------------------------------------------------
1 | # TF Helper Bot (WIP)
2 | 
3 | Write powerful custom training loops for TensorFlow 2.x with less code.
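A minimal usage sketch, pieced together from how `quest/train.py` in this repository drives the bot — the model, loss function, optimizer, and `tf.data` datasets below are placeholders you supply yourself, and the keyword defaults are assumptions:

```python
from tf_helper_bot import (
    BaseBot, MovingAverageStatsTrackerCallback, CheckpointCallback, AUC
)

checkpoints = CheckpointCallback(
    keep_n_checkpoints=1,
    checkpoint_dir="cache/model_cache/",
    monitor_metric="auc"  # matches the `name` attribute of the AUC metric
)
bot = BaseBot(
    model=model,                # your tf.keras.Model
    criterion=loss_fn,          # your loss callable
    optimizer=optimizer,        # your tf.keras optimizer
    train_dataset=train_ds,     # your tf.data datasets
    valid_dataset=valid_ds,
    steps_per_epoch=train_steps,
    gradient_accumulation_steps=1,
    callbacks=[
        MovingAverageStatsTrackerCallback(avg_window=250, log_interval=200),
        checkpoints,
    ],
    metrics=(AUC(),),
    valid_steps=valid_steps,
)
bot.train(n_steps=1000, checkpoint_interval=500)
```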
4 | 5 | (This is basically a TF port of [pytorch-helper-bot](https://github.com/ceshine/pytorch-helper-bot)) 6 | -------------------------------------------------------------------------------- /tf-helper-bot/tf_helper_bot/__init__.py: -------------------------------------------------------------------------------- 1 | from .bot import BaseBot, BaseDistributedBot 2 | from .callbacks import * 3 | from .logger import Logger 4 | from .metrics import (Metric, FBeta, AUC) 5 | from .lr_schedulers import CosineDecayWithWarmup 6 | -------------------------------------------------------------------------------- /tf-helper-bot/.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | *# 3 | *~ 4 | cache 5 | __pycache__ 6 | .dir-locals.el 7 | .idea/ 8 | .vscode/ 9 | .ipynb_checkpoints/ 10 | *.7z 11 | *.html 12 | *.gz 13 | *.out 14 | runs/ 15 | data/ 16 | plots 17 | *.zip 18 | .mypy_cache 19 | pylintrc 20 | *.egg-info/ 21 | .cache/ 22 | core 23 | .nv/ 24 | .bash_history 25 | data 26 | wandb/ 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | 7 | # submission file 8 | *.csv 9 | 10 | # pyCharm files 11 | .idea/ 12 | 13 | .mypy_cache/ 14 | .vscode/ 15 | 16 | *.7z 17 | *.zip 18 | 19 | bot 20 | input 21 | 22 | data/ 23 | notebooks/.ipynb_checkpoints/ 24 | docs/ 25 | build/ 26 | dist/ 27 | *.egg-info/ 28 | 29 | cache/ 30 | logs/ 31 | 32 | env.fish -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="quest", 5 | version='0.0.1', 6 | author="Ceshine Lee", 7 | author_email="ceshine@ceshine.net", 8 | description="", 9 | license="MIT", 10 | url="", 11 | packages=["quest"], 12 | install_requires=[], 13 | classifiers=[ 14 | "Development Status :: 4 - Beta", 15 | "Intended Audience :: Science/Research", 16 | "Programming Language :: Python :: 3.6", 17 | "Topic :: Scientific/Engineering :: Artificial Intelligence" 18 | ], 19 | keywords="" 20 | ) 21 | -------------------------------------------------------------------------------- /tf-helper-bot/tf_helper_bot/utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def prepare_tpu(): 5 | try: 6 | tpu = tf.distribute.cluster_resolver.TPUClusterResolver() # TPU detection 7 | print('Running on TPU ', tpu.cluster_spec().as_dict()['worker']) 8 | except ValueError: 9 | tpu = None 10 | strategy = tf.distribute.get_strategy() 11 | if tpu: 12 | tf.config.experimental_connect_to_cluster(tpu) 13 | tf.tpu.experimental.initialize_tpu_system(tpu) 14 | strategy = tf.distribute.experimental.TPUStrategy(tpu) 15 | return strategy, tpu 16 | -------------------------------------------------------------------------------- /quest/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sklearn.metrics 3 | from scipy.stats import spearmanr 4 | from scipy.special import expit 5 | from tf_helper_bot import Metric 6 | 7 | 8 | class SpearmanCorr(Metric): 9 | name = "spearman" 10 | 11 | def __init__(self, add_sigmoid: bool = False): 12 | self.add_sigmoid = add_sigmoid 13 | 14 | def 
__call__(self, truth: np.ndarray, pred: np.ndarray): 15 | if self.add_sigmoid: 16 | pred = expit(pred) 17 | corrs = [] 18 | for i in range(pred.shape[1]): 19 | if len(np.unique(truth[:, i])) == 1: 20 | continue 21 | corrs.append( 22 | spearmanr( 23 | truth[:, i], 24 | pred[:, i] 25 | ).correlation 26 | 27 | ) 28 | score = np.mean(corrs) 29 | return score * -1, f"{score * 100:.2f}" 30 | -------------------------------------------------------------------------------- /tf-helper-bot/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='tf_helper_bot', 5 | version='0.0.1', 6 | packages=['tf_helper_bot'], 7 | install_requires=[], 8 | classifiers=[ # Optional 9 | # How mature is this project? Common values are 10 | # 3 - Alpha 11 | # 4 - Beta 12 | # 5 - Production/Stable 13 | 'Development Status :: 3 - Alpha', 14 | 15 | # Indicate who your project is intended for 16 | 'Intended Audience :: Developers', 17 | 18 | # Pick your license as you wish 19 | 'License :: OSI Approved :: MIT License', 20 | 21 | # Specify the Python versions you support here. In particular, ensure 22 | # that you indicate whether you support Python 2, Python 3 or both. 23 | 'Programming Language :: Python :: 3.7', 24 | 'Programming Language :: Python :: 3.8' 25 | ], 26 | ) 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2020 Ceshine Lee 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /tf-helper-bot/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2020 Ceshine Lee 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
6 | 
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8 | 
--------------------------------------------------------------------------------
/quest/post_processing.py:
--------------------------------------------------------------------------------
1 | import copy
2 | 
3 | import numpy as np
4 | from scipy.stats import spearmanr
5 | from sklearn.preprocessing import MinMaxScaler
6 | 
7 | 
8 | def prevent_nan(pred):
9 |     for i in range(pred.shape[1]):
10 |         if len(np.unique(pred[:, i])) == 1:
11 |             pred[0, i] = np.random.rand()
12 |             pred[-1, i] = np.random.rand()
13 |     return pred
14 | 
15 | 
16 | def find_best_bins(y_true, y_pred):
17 |     scaler = MinMaxScaler()
18 |     y_pred = scaler.fit_transform(y_pred)
19 |     y = np.copy(y_pred)
20 |     list_of_bins = []
21 |     for i in range(y_pred.shape[1]):
22 |         best_score = 0  # initialize the score for column i
23 |         best_bins = 1
24 |         history_score = []
25 |         for max_voters in range(2, 200):
26 |             y[:, i] = np.round(
27 |                 y_pred[:, i] * max_voters
28 |             ) / max_voters
29 |             y[:, i] = prevent_nan(y[:, i:i+1])[:, 0]
30 |             score = spearmanr(y_true[:, i], y[:, i]).correlation
31 |             history_score.append(score)
32 |             if score > best_score:
33 |                 best_score = score
34 |                 best_bins = max_voters
35 |         list_of_bins.append(best_bins)
36 |         y[:, i] = np.round(y_pred[:, i] * best_bins) / best_bins
37 |     return np.mean([
38 |         spearmanr(y_true[:, ind], y[:, ind]).correlation
39 |         for ind in range(y.shape[1])
40 |     ]), list_of_bins, scaler
--------------------------------------------------------------------------------
/tf-helper-bot/tf_helper_bot/logger.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | import logging
3 | from pathlib import Path
4 | 
5 | 
6 | class Logger:
7 |     def __init__(self, model_name, log_dir: Path, level=logging.INFO, echo=False):
8 |         self.model_name = model_name
9 |         (log_dir / "summaries").mkdir(parents=True, exist_ok=True)
10 |         date_str = datetime.now().strftime('%Y%m%d_%H%M')
11 |         log_file = 'log_{}.txt'.format(date_str)
12 |         formatter = logging.Formatter(
13 |             '[%(levelname)s][%(asctime)s] %(message)s',
14 |             datefmt='%m/%d/%Y %H:%M:%S'
15 |         )
16 |         self.logger = logging.getLogger("bot")
17 |         # Remove all existing handlers
18 |         self.logger.handlers = []
19 |         # Initialize handlers
20 |         fh = logging.FileHandler(log_dir / log_file)
21 |         fh.setFormatter(formatter)
22 |         self.logger.addHandler(fh)
23 |         if echo:
24 |             sh = logging.StreamHandler()
25 |             sh.setFormatter(formatter)
26 |             self.logger.addHandler(sh)
27 |         self.logger.setLevel(level)
28 |         self.logger.propagate = False
29 | 
30 |     def info(self, msg, *args):
31 |         self.logger.info(msg, *args)
32 | 
33 |     def warning(self, msg, *args):
34 |         self.logger.warning(msg, *args)
35 | 
36 |     def debug(self, msg, *args):
37 |         self.logger.debug(msg, *args)
38 | 
39 |     def error(self, msg, *args):
40 |         self.logger.error(msg, *args)
41 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # TPU-Ready TF 2.1 Solution to Google QUEST Q&A Labeling Using a Siamese RoBERTa Encoder Model
2 | 
3 | The 5-fold models can be trained in about an hour on a Colab TPU. The model performance after post-processing the predictions (to optimize the Spearman correlation with the target):
4 | 
5 | ![Submission Score](imgs/submission.png)
6 | 
7 | This lands at around 65th place on the private leaderboard. The post-processing (which unfortunately I did not use during the competition) gives a score boost of almost 0.03.
8 | 
9 | [Inference Kernel on Kaggle](https://www.kaggle.com/ceshine/quest-roberta-inference?scriptVersionId=28553401)
10 | 
11 | ## Train on Colab TPU
12 | 
13 | [The notebook](https://gist.github.com/ceshine/752c77742973a013320a9f20384528a1) used to generate the above submission is on GitHub Gist, and can be opened in Colab.
14 | 
15 | ### Preparation
16 | 
17 | #### Build the wheels
18 | 
19 | Run this command in the project root directory and in the `tf-helper-bot` subdirectory:
20 | 
21 | `python setup.py sdist bdist_wheel`
22 | 
23 | Then upload the `.whl` files from both `dist` directories to Google Cloud Storage.
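24 | 
25 | For example, assuming the [Cloud SDK](https://cloud.google.com/sdk) is installed and authenticated (the bucket path below is a placeholder — use your own):
26 | 
27 | `gsutil cp dist/*.whl tf-helper-bot/dist/*.whl gs://<your-bucket>/wheels/`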
28 | 
29 | #### Create the TFRecord files
30 | 
31 | Run this command and then upload the content of `cache/tfrecords` to Google Cloud Storage:
32 | 
33 | `python -m quest.prepare_tfrecords --model-name roberta-base --n-folds 5`
34 | 
35 | (Note: check [requirements.txt](requirements.txt) for missing dependencies.)
36 | 
37 | ## Acknowledgements
38 | 
39 | Some of the TPU resources used in this project were generously sponsored by the [TensorFlow Research Cloud](https://www.tensorflow.org/tfrc).
40 | 
--------------------------------------------------------------------------------
/tf-helper-bot/tf_helper_bot/mixup.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import tensorflow_probability as tfp
3 | 
4 | 
5 | def mixup_augment(alpha: float):
6 |     """
7 |     Adapted from:
8 |     https://github.com/tensorpack/tensorpack/blob/master/examples/ResNet/cifar10-preact18-mixup.py
9 |     """
10 |     dist = tfp.distributions.Beta(alpha, alpha)
11 | 
12 |     def _mixup_augment(images, labels):
13 |         batch_size = tf.shape(images)[0]
14 |         lambd = dist.sample([batch_size])
15 |         lambd = tf.math.reduce_max(
16 |             tf.stack([lambd, 1-lambd]), axis=0
17 |         )
18 |         lambd = tf.reshape(lambd, [batch_size, 1, 1, 1])
19 |         index = tf.random.shuffle(tf.range(batch_size))
20 |         new_images = images * lambd + tf.gather(images, index) * (1 - lambd)
21 |         return new_images, {"labels_1": labels, "labels_2": tf.gather(labels, index), "lambd": lambd[:, 0, 0, 0]}
22 |     return _mixup_augment
23 | 
24 | 
25 | def mixup_loss_fn(y_true, y_pred):
26 |     if isinstance(y_true, dict):
27 |         loss_1 = tf.keras.losses.sparse_categorical_crossentropy(
28 |             y_true["labels_1"],
29 |             y_pred
30 |         )
31 |         loss_2 = tf.keras.losses.sparse_categorical_crossentropy(
32 |             y_true["labels_2"],
33 |             y_pred
34 |         )
35 |         loss = tf.reduce_mean(
36 |             y_true["lambd"] * loss_1 + (1 - y_true["lambd"]) * loss_2
37 |         )
38 |     else:
39 |         loss = tf.reduce_mean(
40 |             tf.keras.losses.sparse_categorical_crossentropy(
41 |                 y_true,
42 |                 y_pred
43 |             )
44 |         )
45 |     return loss
46 | 
--------------------------------------------------------------------------------
/quest/train_folds.py:
--------------------------------------------------------------------------------
1 | import fire
2 | import tensorflow as tf
3 | import numpy as np
4 | 
5 | from .train import train_model
6 | 
7 | 
8 | def main(
9 |         train_path_pattern: str = "cache/tfrecords/train-%d-*.tfrec",
10 |         valid_path_pattern: str = 
"cache/tfrecords/valid-%d-*.tfrec", 11 | model_name: str = "bert-large-uncased-whole-word-masking", 12 | output_path_pattern: str = "cache/bert-fold-%d/", 13 | batch_size: int = 8, grad_accu: int = 2, 14 | log_interval: int = 200, steps: int = 1000, 15 | checkpoint_interval: int = 500, 16 | min_lr: float = 1e-6, max_lr: float = 3e-5, 17 | n_folds: int = 5, freeze: int = 0 18 | ): 19 | scores = [] 20 | for fold in range(n_folds): 21 | tmp = list(tf.io.gfile.glob(train_path_pattern % fold)) 22 | assert len(tmp) == 1 23 | train_path = tmp[0] 24 | tmp = list(tf.io.gfile.glob(valid_path_pattern % fold)) 25 | assert len(tmp) == 1 26 | valid_path = tmp[0] 27 | output_path = output_path_pattern % fold 28 | print("=" * 20) 29 | print(f"Training Fold {fold+1}") 30 | print("=" * 20) 31 | best_score = train_model( 32 | train_path=train_path, 33 | valid_path=valid_path, 34 | model_name=model_name, 35 | output_path=output_path, 36 | batch_size=batch_size, 37 | grad_accu=grad_accu, 38 | log_interval=log_interval, 39 | steps=steps, 40 | checkpoint_interval=checkpoint_interval, 41 | min_lr=min_lr, 42 | max_lr=max_lr, 43 | freeze=freeze 44 | ) 45 | scores.append(best_score) 46 | print(f"Scores: {np.mean(scores)} +- {np.std(scores)}") 47 | 48 | 49 | if __name__ == '__main__': 50 | fire.Fire(main) 51 | -------------------------------------------------------------------------------- /tf-helper-bot/tf_helper_bot/metrics.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from typing import Tuple, Union 3 | 4 | import numpy as np 5 | # import tensorflow as tf 6 | from sklearn.metrics import fbeta_score, roc_auc_score 7 | from sklearn.exceptions import UndefinedMetricWarning 8 | 9 | 10 | class Metric: 11 | name = "metric" 12 | 13 | def __call__(self, truth: np.ndarray, pred: np.ndarray) -> Tuple[float, str]: 14 | """Calculate the metric from truth and prediction tensors 15 | 16 | Parameters 17 | ---------- 18 | truth : numpy.ndarray 19 | pred : numpy.ndarray 20 | 21 | Returns 22 | ------- 23 | Tuple[float, str] 24 | (metric value(to be minimized), formatted string) 25 | """ 26 | raise NotImplementedError() 27 | 28 | 29 | class FBeta(Metric): 30 | """FBeta for binary targets""" 31 | name = "fbeta" 32 | 33 | def __init__(self, step, beta=2, average="binary"): 34 | self.step = step 35 | self.beta = beta 36 | self.average = average 37 | 38 | def __call__(self, truth: np.ndarray, pred: np.ndarray) -> Tuple[float, str]: 39 | best_fbeta, best_thres = self.find_best_fbeta_threshold( 40 | truth, pred, 41 | step=self.step, beta=self.beta) 42 | return best_fbeta * -1, f"{best_fbeta:.4f} @ {best_thres:.2f}" 43 | 44 | def find_best_fbeta_threshold(self, truth, probs, beta=2, step=0.05): 45 | best, best_thres = 0, -1 46 | with warnings.catch_warnings(): 47 | warnings.simplefilter('ignore', category=UndefinedMetricWarning) 48 | for thres in np.arange(step, 1, step): 49 | current = fbeta_score( 50 | truth, (probs >= thres).astype("int8"), 51 | beta=beta, average=self.average) 52 | if current > best: 53 | best = current 54 | best_thres = thres 55 | return best, best_thres 56 | 57 | 58 | class AUC(Metric): 59 | """AUC for binary targets""" 60 | name = "auc" 61 | 62 | def __call__(self, truth: np.ndarray, pred: np.ndarray) -> Tuple[float, str]: 63 | auc_score = roc_auc_score( 64 | truth.astype("int"), pred) 65 | return auc_score * -1, f"{auc_score * 100:.2f}" 66 | -------------------------------------------------------------------------------- /quest/dataset.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | AUTOTUNE = tf.data.experimental.AUTOTUNE 5 | 6 | 7 | def tfrecord_dataset(filename, batch_size, strategy, is_train: bool = True): 8 | opt = tf.data.Options() 9 | opt.experimental_deterministic = False 10 | 11 | name = filename.split("/")[-1] 12 | max_q_len = int(name.split("-")[3].split(".")[0]) 13 | max_a_len = int(name.split("-")[4].split(".")[0]) 14 | cnt = int(name.split("-")[2]) 15 | 16 | features_description = { 17 | "input_ids_question": tf.io.FixedLenFeature([max_q_len], tf.int64), 18 | "input_mask_question": tf.io.FixedLenFeature([max_q_len], tf.int64), 19 | "input_ids_answer": tf.io.FixedLenFeature([max_a_len], tf.int64), 20 | "input_mask_answer": tf.io.FixedLenFeature([max_a_len], tf.int64), 21 | "labels": tf.io.FixedLenFeature([30], tf.float32), 22 | } 23 | 24 | def _parse_function(example_proto): 25 | # Parse the input `tf.Example` proto using the dictionary above. 26 | example = tf.io.parse_single_example( 27 | example_proto, features_description) 28 | return ( 29 | { 30 | 'input_ids_question': tf.cast(example['input_ids_question'], tf.int32), 31 | 'attention_mask_question': tf.cast(example['input_mask_question'], tf.int32), 32 | 'input_ids_answer': tf.cast(example['input_ids_answer'], tf.int32), 33 | 'attention_mask_answer': tf.cast(example['input_mask_answer'], tf.int32), 34 | }, 35 | example["labels"] 36 | ) 37 | 38 | raw_dataset = tf.data.TFRecordDataset( 39 | filename, num_parallel_reads=4 40 | ).with_options(opt) 41 | dataset = raw_dataset.map( 42 | _parse_function, num_parallel_calls=AUTOTUNE 43 | ).cache() 44 | if is_train: 45 | dataset = dataset.shuffle( 46 | 2048, reshuffle_each_iteration=True 47 | ).repeat() 48 | else: 49 | # usually fewer validation files than workers so disable FILE auto-sharding on validation 50 | # option not useful if there is no sharding (not harmful either) 51 | if strategy.num_replicas_in_sync > 1: 52 | opt = tf.data.Options() 53 | opt.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA 54 | dataset = dataset.with_options(opt) 55 | dataset = dataset.batch( 56 | batch_size 57 | # drop_remainder=is_train 58 | ) 59 | dataset = dataset.prefetch(AUTOTUNE) 60 | print("cnt:", cnt, "batch size:", batch_size) 61 | return dataset, int(np.ceil(cnt / batch_size)) 62 | -------------------------------------------------------------------------------- /tf-helper-bot/tf_helper_bot/lr_schedulers.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from tensorflow.python.framework import constant_op 4 | from tensorflow.python.framework import ops 5 | from tensorflow.python.ops import control_flow_ops 6 | from tensorflow.python.ops import math_ops 7 | from tensorflow.python.keras.optimizer_v2.learning_rate_schedule import LearningRateSchedule 8 | 9 | 10 | class CosineDecayWithWarmup(LearningRateSchedule): 11 | """A LearningRateSchedule that uses a cosine decay schedule.""" 12 | 13 | def __init__( 14 | self, 15 | initial_learning_rate, 16 | max_learning_rate, 17 | warmup_steps, 18 | decay_steps, 19 | alpha=0.0, 20 | name=None): 21 | super().__init__() 22 | self.initial_learning_rate = initial_learning_rate 23 | self.max_learning_rate = max_learning_rate 24 | self.warmup_steps = warmup_steps 25 | self.decay_steps = decay_steps 26 | self.alpha = alpha 27 | self.name = name 28 | 29 | @staticmethod 30 | def lr_warmup(steps, warmup_steps, 
                  max_learning_rate, initial_learning_rate):
31 |         return initial_learning_rate + (
32 |             max_learning_rate - initial_learning_rate
33 |         ) * (steps / warmup_steps)
34 | 
35 |     @staticmethod
36 |     def cosine_decay(steps, warmup_steps, decay_steps, max_learning_rate, alpha):
37 |         completed_fraction = (
38 |             steps - warmup_steps) / decay_steps
39 |         cosine_decayed = 0.5 * (1.0 + math_ops.cos(
40 |             constant_op.constant(math.pi) * completed_fraction))
41 |         decayed = (1 - alpha) * cosine_decayed + alpha
42 |         return math_ops.multiply(max_learning_rate, decayed)
43 | 
44 |     def __call__(self, step):
45 |         with ops.name_scope_v2(self.name or "CosineDecayWithWarmup"):
46 |             initial_learning_rate = ops.convert_to_tensor(
47 |                 self.initial_learning_rate, name="initial_learning_rate")
48 |             max_learning_rate = ops.convert_to_tensor(
49 |                 self.max_learning_rate, name="max_learning_rate")
50 |             dtype = initial_learning_rate.dtype
51 |             decay_steps = math_ops.cast(self.decay_steps, dtype)
52 |             warmup_steps = math_ops.cast(self.warmup_steps, dtype)
53 |             total_steps = decay_steps + warmup_steps
54 | 
55 |             global_step_recomp = math_ops.cast(step, dtype)
56 |             global_step_recomp = math_ops.minimum(
57 |                 global_step_recomp, total_steps)
58 | 
59 |             return control_flow_ops.cond(
60 |                 math_ops.less_equal(global_step_recomp, warmup_steps),
61 |                 lambda: self.lr_warmup(
62 |                     global_step_recomp, warmup_steps, max_learning_rate,
63 |                     initial_learning_rate
64 |                 ),
65 |                 lambda: self.cosine_decay(
66 |                     global_step_recomp, warmup_steps, decay_steps,
67 |                     max_learning_rate, self.alpha
68 |                 )
69 |             )
70 | 
71 |     def get_config(self):
72 |         return {
73 |             "initial_learning_rate": self.initial_learning_rate,
74 |             "max_learning_rate": self.max_learning_rate,
75 |             "warmup_steps": self.warmup_steps,
76 |             "decay_steps": self.decay_steps,
77 |             "alpha": self.alpha,
78 |             "name": self.name
79 |         }
80 | 
--------------------------------------------------------------------------------
/quest/eval_tpu.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | 
3 | import fire
4 | import joblib
5 | import numpy as np
6 | import pandas as pd
7 | from tqdm import tqdm
8 | import tensorflow as tf
9 | from scipy.special import expit
10 | from transformers import AutoTokenizer, RobertaConfig
11 | from tf_helper_bot.utils import prepare_tpu
12 | 
13 | from .models import DualRobertaModel
14 | from .metrics import SpearmanCorr
15 | from .prepare_tfrecords import Preprocessor, OUTPUT_COLUMNS, INPUT_COLUMNS
16 | from .inference import ROBERTA_CONFIG, get_batch
17 | from .post_processing import find_best_bins
18 | from .dataset import tfrecord_dataset
19 | 
20 | 
21 | def eval_fold(
22 |         valid_path: str,
23 |         model_path: str = "cache/roberta-base-fold-0.h5",
24 |         batch_size: int = 8
25 | ):
26 |     strategy, tpu = prepare_tpu()
27 |     if tpu:
28 |         batch_size *= strategy.num_replicas_in_sync
29 |     valid_ds, valid_steps = tfrecord_dataset(
30 |         valid_path, batch_size, strategy, is_train=False)
31 |     valid_dist_ds = strategy.experimental_distribute_dataset(
32 |         valid_ds)
33 | 
34 |     model_name = Path(model_path).name
35 |     if model_name.lower().startswith("roberta-base"):
36 |         config = RobertaConfig.from_dict(
37 |             ROBERTA_CONFIG)
38 |         model = DualRobertaModel(
39 |             model_name="roberta-base", config=config, pretrained=False
40 |         )
41 |         # build
42 |         model(next(iter(valid_ds))[0], training=False)
43 |         model.load_weights(model_path)
44 |     else:
45 |         raise ValueError("Unknown model.")
46 |     spearman = SpearmanCorr()
47 | 
48 |     @tf.function
49 |     def predict_batch(inputs):
50 |         return model(inputs, training=False)[0]
51 | 
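    # With TPUStrategy, predict_batch runs once per replica; `.values`
    # unpacks the PerReplica result and tf.concat stitches the shards back
    # into a single batch (the labels are gathered the same way below via
    # strategy.experimental_local_results).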
52 | preds, labels = [], [] 53 | for batch_, labels_ in tqdm(valid_dist_ds, total=valid_steps, ncols=100): 54 | tmp = strategy.experimental_run_v2( 55 | predict_batch, 56 | args=(batch_,) 57 | ).values 58 | preds.append( 59 | tf.concat( 60 | tmp, axis=0 61 | ).numpy() 62 | ) 63 | labels.append(tf.concat( 64 | strategy.experimental_local_results(labels_), 65 | axis=0 66 | ).numpy()) 67 | preds = np.concatenate(preds) 68 | labels = np.concatenate(labels) 69 | 70 | score = spearman(labels, preds)[0] * -1 71 | print(f"Raw Spearman: {score * 100 : .2f}") 72 | return labels, preds 73 | 74 | 75 | def eval_folds( 76 | n_folds: int = 5, 77 | valid_pattern: str = "gs://ceshine-colab-tmp-2/quest/valid-%d-*.tfrec", 78 | model_pattern: str = "cache/roberta-base-fold-%d.h5", 79 | batch_size: int = 8 80 | ): 81 | if Path("cache/oof.jl").exists(): 82 | labels, preds = joblib.load("cache/oof.jl") 83 | else: 84 | labels, preds = [], [] 85 | for fold in range(n_folds): 86 | matches = list(tf.io.gfile.glob(valid_pattern % fold)) 87 | assert len(matches) == 1 88 | labels_tmp, preds_tmp = eval_fold( 89 | matches[0], 90 | model_pattern % fold, 91 | batch_size 92 | ) 93 | labels.append(labels_tmp) 94 | preds.append(preds_tmp) 95 | 96 | labels = np.concatenate(labels) 97 | preds = np.concatenate(preds) 98 | joblib.dump([labels, preds], "cache/oof.jl") 99 | spearman = SpearmanCorr() 100 | score = spearman(labels, preds)[0] * -1 101 | print(f"Raw Spearman: {score * 100 : .2f}") 102 | best_score, best_bins, scaler = find_best_bins(labels, expit(preds)) 103 | print(f"Optimized Spearman: {best_score * 100 : .2f}") 104 | print(best_bins) 105 | joblib.dump([best_bins, scaler], "cache/best_bins.jl") 106 | 107 | 108 | if __name__ == '__main__': 109 | fire.Fire(eval_folds) 110 | -------------------------------------------------------------------------------- /quest/eval.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | 4 | import fire 5 | import joblib 6 | import numpy as np 7 | import pandas as pd 8 | from tqdm import tqdm 9 | import tensorflow as tf 10 | from scipy.special import expit 11 | from transformers import AutoTokenizer, RobertaConfig 12 | 13 | from .models import DualRobertaModel 14 | from .metrics import SpearmanCorr 15 | from .prepare_tfrecords import Preprocessor, OUTPUT_COLUMNS, INPUT_COLUMNS 16 | from .inference import ROBERTA_CONFIG, get_batch 17 | from .post_processing import find_best_bins 18 | 19 | 20 | def eval_fold( 21 | input_path: str = "data/", 22 | fold_path: str = "cache/tfrecords/fold_0.jl", 23 | model_path: str = "cache/roberta-base-fold-0.h5", 24 | tokenizer_path: str = "cache/tfrecords/tokenizer_roberta-base/", 25 | batch_size: int = 8 26 | ): 27 | df_train = pd.read_csv(input_path + 'train.csv') 28 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) 29 | processor = Preprocessor(tokenizer) 30 | labels = df_train.loc[:, OUTPUT_COLUMNS].values 31 | inputs = df_train.loc[:, INPUT_COLUMNS].values 32 | _, valid_idx = joblib.load(fold_path) 33 | # valid_idx = valid_idx[:100] # For faster debug 34 | labels, inputs = labels[valid_idx], inputs[valid_idx] 35 | tmp = [] 36 | for i in tqdm(range(labels.shape[0]), ncols=100): 37 | tmp.append(processor.process_one_example( 38 | inputs[i, 0], 39 | inputs[i, 1], 40 | inputs[i, 2]) 41 | ) 42 | processed_inputs = np.array(tmp) 43 | del tmp, inputs 44 | 45 | model_name = Path(model_path).name 46 | if model_name.lower().startswith("roberta-base"): 47 | config = 
RobertaConfig.from_dict( 48 | ROBERTA_CONFIG) 49 | model = DualRobertaModel( 50 | model_name="roberta-base", config=config, pretrained=False 51 | ) 52 | # build 53 | model(get_batch(processed_inputs[:2]), training=False) 54 | model.load_weights(model_path) 55 | else: 56 | raise ValueError("Unknown model.") 57 | spearman = SpearmanCorr() 58 | 59 | @tf.function 60 | def predict_batch(inputs): 61 | return model(inputs, training=False)[0] 62 | 63 | preds = [] 64 | for i in tqdm(range(0, len(labels), batch_size), ncols=100): 65 | input_dicts = processed_inputs[i:i+batch_size] 66 | preds.append(predict_batch(get_batch(input_dicts)).numpy()) 67 | preds = np.concatenate(preds) 68 | 69 | score = spearman(labels, preds)[0] * -1 70 | print(f"Raw Spearman: {score * 100 : .2f}") 71 | return labels, preds 72 | 73 | 74 | def eval_folds( 75 | n_folds: int = 5, 76 | input_path: str = "data/", 77 | fold_pattern: str = "cache/tfrecords/fold_%d.jl", 78 | model_pattern: str = "cache/roberta-base-fold-%d.h5", 79 | tokenizer_path: str = "cache/tfrecords/tokenizer_roberta-base/", 80 | batch_size: int = 8 81 | ): 82 | if Path("cache/oof.jl").exists(): 83 | labels, preds = joblib.load("cache/oof.jl") 84 | else: 85 | labels, preds = [], [] 86 | for fold in range(n_folds): 87 | labels_tmp, preds_tmp = eval_fold( 88 | input_path, fold_pattern % fold, 89 | model_pattern % fold, 90 | tokenizer_path, 91 | batch_size 92 | ) 93 | labels.append(labels_tmp) 94 | preds.append(preds_tmp) 95 | 96 | labels = np.concatenate(labels) 97 | preds = np.concatenate(preds) 98 | joblib.dump([labels, preds], "cache/oof.jl") 99 | spearman = SpearmanCorr() 100 | score = spearman(labels, preds)[0] * -1 101 | print(f"Raw Spearman: {score * 100 : .2f}") 102 | best_score, best_bins, scaler = find_best_bins(labels, expit(preds)) 103 | print(f"Optimized Spearman: {best_score * 100 : .2f}") 104 | print(best_bins) 105 | joblib.dump([best_bins, scaler], "cache/best_bins.jl") 106 | 107 | 108 | if __name__ == '__main__': 109 | fire.Fire(eval_folds) 110 | -------------------------------------------------------------------------------- /quest/inference.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import logging 3 | from pathlib import Path 4 | 5 | import fire 6 | import joblib 7 | import numpy as np 8 | import pandas as pd 9 | from tqdm import tqdm 10 | import tensorflow as tf 11 | from scipy.special import expit 12 | from transformers import AutoTokenizer 13 | from transformers import RobertaConfig 14 | 15 | from .models import DualRobertaModel 16 | from .prepare_tfrecords import Preprocessor, INPUT_COLUMNS, OUTPUT_COLUMNS 17 | from .post_processing import prevent_nan 18 | 19 | ROBERTA_CONFIG = { 20 | "architectures": [ 21 | "RobertaForMaskedLM" 22 | ], 23 | "attention_probs_dropout_prob": 0.1, 24 | "finetuning_task": None, 25 | "hidden_act": "gelu", 26 | "hidden_dropout_prob": 0.1, 27 | "hidden_size": 768, 28 | "id2label": { 29 | "0": "LABEL_0", 30 | "1": "LABEL_1" 31 | }, 32 | "initializer_range": 0.02, 33 | "intermediate_size": 3072, 34 | "is_decoder": False, 35 | "label2id": { 36 | "LABEL_0": 0, 37 | "LABEL_1": 1 38 | }, 39 | "layer_norm_eps": 1e-05, 40 | "max_position_embeddings": 514, 41 | "num_attention_heads": 12, 42 | "num_hidden_layers": 12, 43 | "num_labels": 30, 44 | "output_attentions": False, 45 | "output_hidden_states": False, 46 | "output_past": True, 47 | "pruned_heads": {}, 48 | "torchscript": False, 49 | "type_vocab_size": 1, 50 | "use_bfloat16": False, 51 | "vocab_size": 
50265 52 | } 53 | 54 | 55 | def get_batch(input_dicts): 56 | return { 57 | "input_ids_question": tf.convert_to_tensor(np.stack([ 58 | x["input_ids_question"] for x in input_dicts 59 | ], axis=0)), 60 | "attention_mask_question": tf.convert_to_tensor(np.stack([ 61 | x["input_mask_question"] for x in input_dicts 62 | ], axis=0)), 63 | "input_ids_answer": tf.convert_to_tensor(np.stack([ 64 | x["input_ids_answer"] for x in input_dicts 65 | ], axis=0)), 66 | "attention_mask_answer": tf.convert_to_tensor(np.stack([ 67 | x["input_mask_answer"] for x in input_dicts 68 | ], axis=0)), 69 | } 70 | 71 | 72 | def main( 73 | input_path: str = "data/", 74 | tokenizer_path: str = "cache/tfrecords/tokenizer_roberta-base/", 75 | model_path_pattern: str = "cache/roberta-base-fold-*", 76 | best_bins_path: str = "cache/best_bins.jl", 77 | batch_size: int = 8, progress_bar: bool = True, 78 | add_sigmoid: bool = False, rank: bool = False 79 | ): 80 | df_valid = pd.read_csv(input_path + 'test.csv') 81 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) 82 | processor = Preprocessor(tokenizer) 83 | inputs = df_valid.loc[:, INPUT_COLUMNS].values 84 | tmp = [] 85 | for i in tqdm(range(inputs.shape[0]), ncols=100, disable=not progress_bar): 86 | tmp.append(processor.process_one_example( 87 | inputs[i, 0], 88 | inputs[i, 1], 89 | inputs[i, 2]) 90 | ) 91 | processed_inputs = np.array(tmp) 92 | del tmp, inputs 93 | 94 | buffer = [] 95 | for model_path in glob.glob(model_path_pattern): 96 | model_name = Path(model_path).name 97 | print(model_path, model_name) 98 | if model_name.lower().startswith("roberta-base"): 99 | config = RobertaConfig.from_dict( 100 | ROBERTA_CONFIG) 101 | model = DualRobertaModel( 102 | model_name="roberta-base", config=config, pretrained=False) 103 | # build 104 | model(get_batch(processed_inputs[:2]), training=False) 105 | model.load_weights(model_path) 106 | else: 107 | raise ValueError("Unknown model.") 108 | 109 | @tf.function 110 | def predict_batch(inputs): 111 | return model(inputs, training=False)[0] 112 | 113 | preds = [] 114 | for i in tqdm(range( 115 | 0, len(processed_inputs), batch_size 116 | ), ncols=100, disable=not progress_bar): 117 | input_dicts = processed_inputs[i:i+batch_size] 118 | preds.append(predict_batch(get_batch(input_dicts)).numpy()) 119 | if add_sigmoid and not rank: 120 | buffer.append(expit(np.concatenate(preds))) 121 | elif rank: 122 | tmp = np.concatenate(preds) 123 | buffer.append( 124 | tmp.argsort(axis=0).argsort(axis=0) / tmp.shape[0] 125 | ) 126 | else: 127 | buffer.append(np.concatenate(preds)) 128 | 129 | final_preds = np.mean(buffer, axis=0) 130 | if add_sigmoid and not rank: 131 | best_bins, scaler = joblib.load(best_bins_path) 132 | best_bins = np.array(best_bins)[None, :] 133 | # post-process 134 | final_preds = np.clip(scaler.transform(final_preds), 0., 1.) 
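        # Snap each column onto a grid of k evenly spaced values, where k is
        # the per-column bin count tuned on out-of-fold predictions by
        # post_processing.find_best_bins; prevent_nan then re-randomizes two
        # entries of any column that collapsed to a constant, so the Spearman
        # correlation stays defined.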
135 | final_preds = prevent_nan( 136 | np.round(final_preds * best_bins) / best_bins 137 | ) 138 | 139 | df_sub = pd.DataFrame(final_preds, columns=OUTPUT_COLUMNS) 140 | df_sub["qa_id"] = df_valid["qa_id"].values 141 | df_sub.to_csv("submission.csv", index=False) 142 | 143 | 144 | if __name__ == '__main__': 145 | fire.Fire(main) 146 | -------------------------------------------------------------------------------- /quest/models.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from transformers import TFRobertaPreTrainedModel 4 | from transformers.modeling_tf_roberta import TFRobertaMainLayer, TFRobertaClassificationHead 5 | 6 | from .prepare_tfrecords import QUESTION_COLUMNS, ANSWER_COLUMNS, JOINT_COLUMNS 7 | 8 | 9 | class AveragePooling(tf.keras.layers.Layer): 10 | def call(self, states, mask): 11 | mask = tf.cast(tf.expand_dims(mask, 2), tf.float32) 12 | pooled = tf.reduce_sum(states * mask, axis=1) 13 | return pooled / tf.reduce_sum(mask, axis=1) 14 | 15 | 16 | class SELayer(tf.keras.layers.Layer): 17 | def __init__(self, channels, reduction): 18 | super().__init__() 19 | self.fc1 = tf.keras.layers.Dense( 20 | channels // reduction, 21 | kernel_initializer=tf.keras.initializers.he_normal(seed=None), 22 | name="fc1", 23 | activation="relu" 24 | ) 25 | self.fc2 = tf.keras.layers.Dense( 26 | channels, 27 | kernel_initializer=tf.keras.initializers.he_normal(seed=None), 28 | name="fc2", 29 | activation="sigmoid" 30 | ) 31 | 32 | def call(self, x): 33 | tmp = self.fc1(x) 34 | tmp = self.fc2(tmp) 35 | return tmp * x 36 | 37 | 38 | class RobertaEncoder(TFRobertaPreTrainedModel): 39 | def __init__(self, config, *inputs, **kwargs): 40 | super().__init__(config, *inputs, **kwargs) 41 | self.num_labels = config.num_labels 42 | self.roberta = TFRobertaMainLayer(config, name="roberta") 43 | self.pooling = AveragePooling() 44 | 45 | def call(self, inputs, **kwargs): 46 | if "attention_mask" not in inputs: 47 | inputs["attention_mask"] = tf.ones( 48 | tf.shape(inputs["input_ids"])[:2], tf.int32 49 | ) 50 | outputs = self.roberta(inputs, **kwargs)[0] 51 | return self.pooling(outputs, inputs["attention_mask"]) 52 | 53 | 54 | class DualRobertaModel(tf.keras.Model): 55 | def __init__(self, config, model_name, pretrained: bool = True): 56 | super().__init__() 57 | self.num_labels = config.num_labels 58 | 59 | if pretrained: 60 | self.roberta = RobertaEncoder.from_pretrained( 61 | model_name, config=config, name="roberta_question") 62 | else: 63 | self.roberta = RobertaEncoder( 64 | config=config, name="roberta_question") 65 | self.dropout = tf.keras.layers.Dropout(0.5) 66 | self.q_classifier = tf.keras.layers.Dense( 67 | len(QUESTION_COLUMNS), 68 | kernel_initializer=tf.keras.initializers.he_normal(seed=None), 69 | name="q_classifier", 70 | activation="linear" 71 | ) 72 | self.a_classifier = tf.keras.layers.Dense( 73 | len(ANSWER_COLUMNS), 74 | kernel_initializer=tf.keras.initializers.he_normal(seed=None), 75 | name="a_classifier", 76 | activation="linear" 77 | ) 78 | self.j_classifier = tf.keras.layers.Dense( 79 | len(JOINT_COLUMNS), 80 | kernel_initializer=tf.keras.initializers.he_normal(seed=None), 81 | name="j_classifier", 82 | activation="linear" 83 | ) 84 | self.gating_q = SELayer(config.hidden_size, 4) 85 | self.gating_a = SELayer(config.hidden_size, 4) 86 | self.gating_j = SELayer(config.hidden_size * 3, 4) 87 | 88 | def freeze(self): 89 | self.roberta.trainable = False 90 | 91 | def unfreeze(self): 92 | self.roberta.trainable = 
True 93 | 94 | def call(self, inputs, **kwargs): 95 | pooled_output_question = self.roberta( 96 | { 97 | "input_ids": inputs["input_ids_question"], 98 | "attention_mask": inputs["attention_mask_question"] 99 | }, **kwargs 100 | ) 101 | pooled_output_answer = self.roberta( 102 | { 103 | "input_ids": inputs["input_ids_answer"], 104 | "attention_mask": inputs["attention_mask_answer"] 105 | }, **kwargs 106 | ) 107 | combined = tf.concat( 108 | [ 109 | pooled_output_question, pooled_output_answer, 110 | pooled_output_answer * pooled_output_question 111 | ], 112 | axis=1 113 | ) 114 | q_logit = self.q_classifier(self.dropout( 115 | self.gating_q( 116 | pooled_output_question 117 | ), training=kwargs.get("training", False) 118 | )) 119 | a_logit = self.a_classifier(self.dropout( 120 | self.gating_a( 121 | pooled_output_answer 122 | ), training=kwargs.get("training", False) 123 | )) 124 | j_logit = self.j_classifier(self.dropout( 125 | self.gating_j( 126 | combined 127 | ), training=kwargs.get("training", False) 128 | )) 129 | logits = tf.concat( 130 | [q_logit, a_logit, j_logit], 131 | axis=1 132 | ) 133 | # add hidden states and attention if they are here 134 | outputs = (logits,) 135 | return outputs 136 | -------------------------------------------------------------------------------- /quest/prepare_reference_data.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from math import floor, ceil 3 | 4 | import fire 5 | import joblib 6 | import numpy as np 7 | import pandas as pd 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer 10 | from sklearn.model_selection import GroupKFold 11 | 12 | tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") 13 | 14 | logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR) 15 | 16 | 17 | def _get_masks(tokens, max_seq_length): 18 | """Mask for padding""" 19 | if len(tokens) > max_seq_length: 20 | raise IndexError("Token length more than max seq length!") 21 | return [1]*len(tokens) + [0] * (max_seq_length - len(tokens)) 22 | 23 | 24 | def _get_segments(tokens, max_seq_length): 25 | """Segments: 0 for the first sequence, 1 for the second""" 26 | if len(tokens) > max_seq_length: 27 | raise IndexError("Token length more than max seq length!") 28 | segments = [] 29 | first_sep = True 30 | current_segment_id = 0 31 | for token in tokens: 32 | segments.append(current_segment_id) 33 | if token == "[SEP]": 34 | if first_sep: 35 | first_sep = False 36 | else: 37 | current_segment_id = 1 38 | return segments + [0] * (max_seq_length - len(tokens)) 39 | 40 | 41 | def _get_ids(tokens, tokenizer, max_seq_length): 42 | """Token ids from Tokenizer vocab""" 43 | token_ids = tokenizer.convert_tokens_to_ids(tokens) 44 | input_ids = token_ids + [0] * (max_seq_length-len(token_ids)) 45 | return input_ids 46 | 47 | 48 | def _trim_input(title, question, answer, max_sequence_length, 49 | t_max_len=30, q_max_len=239, a_max_len=239): 50 | 51 | t = tokenizer.tokenize(title) 52 | q = tokenizer.tokenize(question) 53 | a = tokenizer.tokenize(answer) 54 | 55 | t_len = len(t) 56 | q_len = len(q) 57 | a_len = len(a) 58 | 59 | if (t_len+q_len+a_len+4) > max_sequence_length: 60 | 61 | if t_max_len > t_len: 62 | t_new_len = t_len 63 | a_max_len = a_max_len + floor((t_max_len - t_len)/2) 64 | q_max_len = q_max_len + ceil((t_max_len - t_len)/2) 65 | else: 66 | t_new_len = t_max_len 67 | 68 | if a_max_len > a_len: 69 | a_new_len = a_len 70 | q_new_len = q_max_len + (a_max_len - a_len) 71 | elif 
q_max_len > q_len:
72 |             a_new_len = a_max_len + (q_max_len - q_len)
73 |             q_new_len = q_len
74 |         else:
75 |             a_new_len = a_max_len
76 |             q_new_len = q_max_len
77 | 
78 |         if t_new_len+a_new_len+q_new_len+4 != max_sequence_length:
79 |             raise ValueError("New sequence length should be %d, but is %d"
80 |                              % (max_sequence_length, (t_new_len+a_new_len+q_new_len+4)))
81 | 
82 |         t = t[:t_new_len]
83 |         q = q[:q_new_len]
84 |         a = a[:a_new_len]
85 | 
86 |     return t, q, a
87 | 
88 | 
89 | def _convert_to_bert_inputs(title, question, answer, tokenizer, max_sequence_length):
90 |     """Converts tokenized input to ids, masks and segments for BERT"""
91 | 
92 |     stoken = ["[CLS]"] + title + ["[SEP]"] + \
93 |         question + ["[SEP]"] + answer + ["[SEP]"]
94 | 
95 |     input_ids = _get_ids(stoken, tokenizer, max_sequence_length)
96 |     input_masks = _get_masks(stoken, max_sequence_length)
97 |     input_segments = _get_segments(stoken, max_sequence_length)
98 | 
99 |     return [input_ids, input_masks, input_segments]
100 | 
101 | 
102 | def compute_input_arrays(df, columns, tokenizer, max_sequence_length):
103 |     input_ids, input_masks, input_segments = [], [], []
104 |     for _, instance in tqdm(df[columns].iterrows(), total=df.shape[0]):
105 |         t, q, a = instance.question_title, instance.question_body, instance.answer
106 | 
107 |         t, q, a = _trim_input(t, q, a, max_sequence_length)
108 | 
109 |         ids, masks, segments = _convert_to_bert_inputs(
110 |             t, q, a, tokenizer, max_sequence_length)
111 |         input_ids.append(ids)
112 |         input_masks.append(masks)
113 |         input_segments.append(segments)
114 | 
115 |     return [np.asarray(input_ids, dtype=np.int32),
116 |             np.asarray(input_masks, dtype=np.int32),
117 |             np.asarray(input_segments, dtype=np.int32)]
118 | 
119 | 
120 | def compute_output_arrays(df, columns):
121 |     return np.asarray(df[columns])
122 | 
123 | 
124 | def main(
125 |         input_path: str = "data/",
126 |         max_sequence_length: int = 512
127 | ):
128 |     df_train = pd.read_csv(input_path + 'train.csv')
129 |     output_categories = list(df_train.columns[11:])
130 |     input_categories = list(df_train.columns[[1, 2, 5]])
131 | 
132 |     gkf = GroupKFold(n_splits=5).split(
133 |         X=df_train.question_body, groups=df_train.question_body)
134 |     outputs = compute_output_arrays(df_train, output_categories)
135 |     inputs = compute_input_arrays(
136 |         df_train, input_categories, tokenizer, max_sequence_length)
137 | 
138 |     for fold, (train_idx, valid_idx) in enumerate(gkf):
139 |         joblib.dump(
140 |             [inputs[0][train_idx], inputs[1][train_idx],
141 |              inputs[2][train_idx], outputs[train_idx]],
142 |             f"cache/tfrecords/train-{fold}.jl"
143 |         )
144 |         joblib.dump(
145 |             [inputs[0][valid_idx], inputs[1][valid_idx],
146 |              inputs[2][valid_idx], outputs[valid_idx]],
147 |             f"cache/tfrecords/valid-{fold}.jl"
148 |         )
149 | 
150 | 
151 | if __name__ == '__main__':
152 |     fire.Fire(main)
153 | 
--------------------------------------------------------------------------------
/quest/train.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | from pathlib import Path
4 | 
5 | import fire
6 | import tensorflow as tf
7 | from transformers import BertConfig, RobertaConfig
8 | from tf_helper_bot import (
9 |     BaseBot, BaseDistributedBot,
10 |     MovingAverageStatsTrackerCallback, CheckpointCallback, TelegramCallback,
11 |     CosineDecayWithWarmup
12 | )
13 | from tf_helper_bot.utils import prepare_tpu
14 | from tf_helper_bot.optimizers import RAdam
15 | 
16 | from .models import DualRobertaModel
17 | from .dataset import tfrecord_dataset
18 | from 
.metrics import SpearmanCorr 19 | 20 | TELEGRAM_TOKEN = os.environ.get("TG_TOKEN", "") 21 | TELEGRAM_CHAT_ID = os.environ.get("TG_CHAT_ID", "") 22 | 23 | 24 | class QuestBot(BaseBot): 25 | def _extract_prediction(self, x): 26 | if isinstance(x, tuple): 27 | # the model returns a tuple when run inside tf.function 28 | return x[0] 29 | return x 30 | 31 | 32 | class QuestDistributedBot(BaseDistributedBot): 33 | def _extract_prediction(self, x): 34 | if isinstance(x, tuple): 35 | return x[0] 36 | return x 37 | 38 | 39 | def loss_fn(labels, predictions): 40 | return tf.math.reduce_mean( 41 | tf.keras.losses.binary_crossentropy( 42 | labels, predictions, from_logits=True 43 | # tf.keras.losses.mean_absolute_error( 44 | # tf.keras.losses.mean_squared_error( 45 | # labels, predictions, 46 | ), 47 | axis=0 48 | ) 49 | 50 | 51 | def train_model( 52 | train_path: str = "cache/tfrecords/train-0-4863-288-320.tfrec", 53 | valid_path: str = "cache/tfrecords/valid-0-1216-288-320.tfrec", 54 | model_name: str = "bert-large-uncased-whole-word-masking", 55 | output_path: str = "cache/model", 56 | batch_size: int = 8, grad_accu: int = 2, 57 | log_interval: int = 200, steps: int = 1000, 58 | checkpoint_interval: int = 500, 59 | min_lr: float = 1e-6, max_lr: float = 3e-5, 60 | freeze: int = 0 61 | ): 62 | # Path(output_path).mkdir(exist_ok=True, parents=True) 63 | strategy, tpu = prepare_tpu() 64 | print("REPLICAS: ", strategy.num_replicas_in_sync) 65 | 66 | valid_batch_size = batch_size * 2 67 | if strategy.num_replicas_in_sync == 8: # single TPU 68 | valid_batch_size = batch_size * strategy.num_replicas_in_sync * 2 69 | batch_size = batch_size * strategy.num_replicas_in_sync 70 | logging.getLogger("tensorflow").setLevel(logging.WARNING) 71 | 72 | with strategy.scope(): 73 | train_ds, train_steps = tfrecord_dataset( 74 | train_path, batch_size, strategy, is_train=True) 75 | valid_ds, valid_steps = tfrecord_dataset( 76 | valid_path, valid_batch_size, strategy, is_train=False) 77 | if model_name.lower().startswith("roberta"): 78 | config = RobertaConfig.from_pretrained(model_name, num_labels=30) 79 | model = DualRobertaModel( 80 | model_name=model_name, config=config) 81 | else: 82 | raise ValueError("Unknown model!") 83 | lr_schedule = CosineDecayWithWarmup( 84 | initial_learning_rate=min_lr, max_learning_rate=max_lr, 85 | warmup_steps=int(steps * 0.1), 86 | decay_steps=steps - int(steps * 0.1), 87 | alpha=1e-4 88 | ) 89 | # optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule, epsilon=1e-6) 90 | optimizer_1 = RAdam(learning_rate=1e-3, epsilon=1e-6) 91 | optimizer = RAdam(learning_rate=lr_schedule, epsilon=1e-6) 92 | # build the model 93 | model(next(iter(train_ds))[0]) 94 | 95 | if freeze > 0: 96 | model.freeze() 97 | model.compile( 98 | optimizer=optimizer_1, 99 | loss=loss_fn 100 | ) 101 | print(model.summary()) 102 | model.fit( 103 | train_ds, epochs=1, 104 | steps_per_epoch=train_steps * freeze 105 | ) 106 | model.unfreeze() 107 | model.compile( 108 | optimizer=optimizer, 109 | loss=loss_fn 110 | ) 111 | print(model.summary()) 112 | 113 | train_dist_ds = strategy.experimental_distribute_dataset( 114 | train_ds) 115 | valid_dist_ds = strategy.experimental_distribute_dataset( 116 | valid_ds) 117 | 118 | checkpoints = CheckpointCallback( 119 | keep_n_checkpoints=1, 120 | checkpoint_dir="cache/model_cache/", 121 | monitor_metric="spearman" 122 | ) 123 | callbacks = [ 124 | MovingAverageStatsTrackerCallback( 125 | avg_window=int(log_interval * 1.25), 126 | log_interval=log_interval, 127 | ), 128 | 
checkpoints 129 | ] 130 | if TELEGRAM_TOKEN and TELEGRAM_CHAT_ID: 131 | callbacks += [ 132 | TelegramCallback( 133 | token=TELEGRAM_TOKEN, 134 | chat_id=TELEGRAM_CHAT_ID, 135 | name="QuestFinetune", 136 | report_evals=False 137 | ) 138 | ] 139 | metrics = (SpearmanCorr(add_sigmoid=True),) 140 | if tpu: 141 | bot = QuestDistributedBot( 142 | model=model, 143 | criterion=loss_fn, 144 | optimizer=optimizer, 145 | train_dataset=train_dist_ds, 146 | valid_dataset=valid_dist_ds, 147 | steps_per_epoch=train_steps, 148 | strategy=strategy, 149 | gradient_accumulation_steps=1, 150 | callbacks=callbacks, 151 | metrics=metrics, 152 | valid_steps=valid_steps, 153 | ) 154 | else: 155 | bot = QuestBot( 156 | model=model, 157 | criterion=loss_fn, 158 | optimizer=optimizer, 159 | train_dataset=train_dist_ds, 160 | valid_dataset=valid_dist_ds, 161 | steps_per_epoch=train_steps, 162 | gradient_accumulation_steps=grad_accu, 163 | callbacks=callbacks, 164 | metrics=metrics, 165 | valid_steps=valid_steps 166 | ) 167 | print(f"Steps per epoch: {train_steps} | {valid_steps}") 168 | 169 | bot.train(checkpoint_interval=checkpoint_interval, n_steps=steps) 170 | best_score = checkpoints.best_performers[0][0] 171 | bot.model.load_weights(str(checkpoints.best_performers[0][1])) 172 | checkpoints.remove_checkpoints(keep=0) 173 | bot.model.save_weights(output_path + ".h5") 174 | return best_score 175 | 176 | 177 | if __name__ == '__main__': 178 | fire.Fire(train_model) 179 | -------------------------------------------------------------------------------- /quest/prepare_tfrecords.py: -------------------------------------------------------------------------------- 1 | import math 2 | import logging 3 | from pathlib import Path 4 | 5 | import fire 6 | import joblib 7 | import numpy as np 8 | import pandas as pd 9 | from tqdm import tqdm 10 | import tensorflow as tf 11 | from transformers import AutoTokenizer 12 | from sklearn.model_selection import GroupKFold 13 | 14 | QUESTION_COLUMNS = ( 15 | 'question_asker_intent_understanding', 'question_body_critical', 'question_conversational', 16 | 'question_expect_short_answer', 'question_fact_seeking', 'question_has_commonly_accepted_answer', 17 | 'question_interestingness_others', 'question_interestingness_self', 'question_multi_intent', 18 | 'question_not_really_a_question', 'question_opinion_seeking', 'question_type_choice', 19 | 'question_type_compare', 'question_type_consequence', 'question_type_definition', 20 | 'question_type_entity', 'question_type_instructions', 'question_type_procedure', 21 | 'question_type_reason_explanation', 'question_type_spelling', 'question_well_written', 22 | ) 23 | ANSWER_COLUMNS = ( 24 | 'answer_type_instructions', 'answer_type_procedure', 'answer_type_reason_explanation', 25 | 'answer_well_written', 'answer_level_of_information' 26 | ) 27 | JOINT_COLUMNS = ( 28 | 'answer_helpful', 'answer_plausible', 'answer_relevance', 29 | 'answer_satisfaction' 30 | ) 31 | INPUT_COLUMNS = ('question_title', 'question_body', 'answer') 32 | OUTPUT_COLUMNS = QUESTION_COLUMNS + ANSWER_COLUMNS + JOINT_COLUMNS 33 | 34 | logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR) 35 | 36 | 37 | class Preprocessor: 38 | def __init__(self, tokenizer, title_max_len=64, question_max_len=352-7, answer_max_len=480-5): 39 | self.tokenizer = tokenizer 40 | self.title_max_len = title_max_len 41 | self.question_max_len = question_max_len 42 | self.answer_max_len = answer_max_len 43 | self.bos_token_id = self.tokenizer.convert_tokens_to_ids( 44 | 
[self.tokenizer.bos_token])[0] 45 | self.eos_token_id = self.tokenizer.convert_tokens_to_ids( 46 | [self.tokenizer.eos_token])[0] 47 | self.pad_token_id = self.tokenizer.convert_tokens_to_ids( 48 | [self.tokenizer.pad_token])[0] 49 | self.question_head = self.tokenizer.encode( 50 | "question", add_special_tokens=False) 51 | self.answer_head = self.tokenizer.encode( 52 | "answer", add_special_tokens=False) 53 | self.max_q_len = ( 54 | self.title_max_len + self.question_max_len + 55 | 6 + len(self.question_head) 56 | ) 57 | self.max_a_len = ( 58 | self.answer_max_len + 4 + len(self.answer_head) 59 | ) 60 | 61 | def _trim_input(self, title, question, answer): 62 | t = self.tokenizer.encode(title, add_special_tokens=False) 63 | q = self.tokenizer.encode(question, add_special_tokens=False) 64 | a = self.tokenizer.encode(answer, add_special_tokens=False) 65 | t = t[:self.title_max_len] 66 | q = q[:self.question_max_len] 67 | a = a[:self.answer_max_len] 68 | return t, q, a 69 | 70 | def process_one_example(self, title, question, answer): 71 | t_tokens, q_tokens, a_tokens = self._trim_input( 72 | title, question, answer) 73 | 74 | input_ids_question = np.zeros( 75 | self.max_q_len, dtype=np.int 76 | ) + self.pad_token_id 77 | input_ids_answer = np.zeros( 78 | self.max_a_len, dtype=np.int 79 | ) + self.pad_token_id 80 | question_tokens = np.asarray( 81 | [self.bos_token_id] + self.question_head + 82 | [self.eos_token_id, self.bos_token_id] + 83 | t_tokens + [self.eos_token_id, self.bos_token_id] + 84 | q_tokens + [self.eos_token_id] 85 | ) 86 | answer_tokens = np.asarray( 87 | [self.bos_token_id] + self.answer_head + 88 | [self.eos_token_id, self.bos_token_id] + 89 | a_tokens + [self.eos_token_id] 90 | ) 91 | assert len(question_tokens) <= len(input_ids_question) 92 | assert len(answer_tokens) <= len(input_ids_answer) 93 | input_ids_question[:len(question_tokens)] = question_tokens 94 | input_ids_answer[:len(answer_tokens)] = answer_tokens 95 | input_mask_question = np.zeros(len(input_ids_question), dtype=np.int) 96 | input_mask_question[:len(question_tokens)] = 1 97 | input_mask_answer = np.zeros(len(input_ids_answer), dtype=np.int) 98 | input_mask_answer[:len(answer_tokens)] = 1 99 | return { 100 | "input_ids_question": input_ids_question, 101 | "input_mask_question": input_mask_question, 102 | "input_ids_answer": input_ids_answer, 103 | "input_mask_answer": input_mask_answer 104 | } 105 | 106 | 107 | def to_example(input_dict, labels): 108 | feature = { 109 | "input_ids_question": tf.train.Feature( 110 | int64_list=tf.train.Int64List( 111 | value=input_dict["input_ids_question"]) 112 | ), 113 | "input_mask_question": tf.train.Feature( 114 | int64_list=tf.train.Int64List( 115 | value=input_dict["input_mask_question"]) 116 | ), 117 | "input_ids_answer": tf.train.Feature( 118 | int64_list=tf.train.Int64List(value=input_dict["input_ids_answer"]) 119 | ), 120 | "input_mask_answer": tf.train.Feature( 121 | int64_list=tf.train.Int64List( 122 | value=input_dict["input_mask_answer"]) 123 | ), 124 | "labels": tf.train.Feature( 125 | float_list=tf.train.FloatList(value=labels) 126 | ) 127 | } 128 | return tf.train.Example(features=tf.train.Features(feature=feature)) 129 | 130 | 131 | def _write_tfrecords(inputs, labels, output_filepath): 132 | with tf.io.TFRecordWriter(str(output_filepath)) as writer: 133 | for input_dict, labels_single in zip(inputs, labels): 134 | example = to_example(input_dict, labels_single) 135 | writer.write(example.SerializeToString()) 136 | print("Wrote file {} containing {} 
records".format( 137 | output_filepath, len(inputs))) 138 | 139 | 140 | def _write_arrays(inputs, labels, output_filepath): 141 | input_ids, input_mask, token_type_ids = [], [], [] 142 | for input_dict in inputs: 143 | input_ids.append(input_dict["input_ids"]) 144 | input_mask.append(input_dict["input_mask"]) 145 | token_type_ids.append(input_dict["token_type_ids"]) 146 | joblib.dump( 147 | [np.stack(input_ids), np.stack(input_mask), 148 | np.stack(token_type_ids), labels], 149 | output_filepath) 150 | 151 | 152 | def main( 153 | input_path: str = "data/", model_name: str = "roberta-base", 154 | output_path: str = "cache/tfrecords/", n_folds: int = 5 155 | ): 156 | output_path_ = Path(output_path) 157 | output_path_.mkdir(exist_ok=True, parents=True) 158 | (output_path_ / f"tokenizer_{model_name}").mkdir(exist_ok=True) 159 | 160 | df_train = pd.read_csv(input_path + 'train.csv') 161 | tokenizer = AutoTokenizer.from_pretrained(model_name) 162 | tokenizer.save_pretrained(str(output_path_ / f"tokenizer_{model_name}")) 163 | print(tokenizer) 164 | processor = Preprocessor(tokenizer) 165 | labels = df_train.loc[ 166 | :, OUTPUT_COLUMNS 167 | ].values 168 | inputs = df_train.loc[:, INPUT_COLUMNS].values 169 | tmp = [] 170 | for i in tqdm(range(df_train.shape[0]), ncols=100): 171 | tmp.append(processor.process_one_example( 172 | inputs[i, 0], 173 | inputs[i, 1], 174 | inputs[i, 2]) 175 | ) 176 | processed_inputs = np.array(tmp) 177 | print(processed_inputs[0]["input_ids_question"]) 178 | del tmp 179 | 180 | gkf = GroupKFold(n_splits=n_folds).split( 181 | X=df_train.question_body, groups=df_train.question_body) 182 | for fold, (train_idx, valid_idx) in enumerate(gkf): 183 | joblib.dump([train_idx, valid_idx], output_path_ / f"fold_{fold}.jl") 184 | filepath = ( 185 | output_path_ / 186 | f"train-{fold}-{len(train_idx)}-{processor.max_q_len}-{processor.max_a_len}.tfrec" 187 | ) 188 | _write_tfrecords( 189 | processed_inputs[train_idx], labels[train_idx], filepath) 190 | filepath = ( 191 | output_path_ / 192 | f"valid-{fold}-{len(valid_idx)}-{processor.max_q_len}-{processor.max_a_len}.tfrec" 193 | ) 194 | _write_tfrecords( 195 | processed_inputs[valid_idx], labels[valid_idx], filepath) 196 | 197 | 198 | if __name__ == '__main__': 199 | fire.Fire(main) 200 | -------------------------------------------------------------------------------- /tf-helper-bot/tf_helper_bot/callbacks.py: -------------------------------------------------------------------------------- 1 | import socket 2 | from datetime import datetime, timedelta 3 | from time import time 4 | from collections import deque, defaultdict 5 | from typing import Dict, Tuple, List, Optional, Union 6 | from pathlib import Path 7 | 8 | import numpy as np 9 | import tensorflow as tf 10 | try: 11 | import wandb 12 | WANDB = True 13 | except ImportError: 14 | WANDB = False 15 | 16 | from .bot import BaseBot 17 | 18 | __all__ = [ 19 | "Callback", "MovingAverageStatsTrackerCallback", 20 | "CheckpointCallback", "TelegramCallback", "WandbCallback" 21 | ] 22 | 23 | 24 | class Callback: 25 | def on_batch_inputs(self, bot: BaseBot, input_tensors: tf.Tensor, targets: tf.Tensor): 26 | return input_tensors, targets 27 | 28 | def on_train_starts(self, bot: BaseBot): 29 | return 30 | 31 | def on_train_ends(self, bot: BaseBot): 32 | return 33 | 34 | def on_epoch_ends(self, bot: BaseBot, epoch: int): 35 | return 36 | 37 | def on_eval_ends(self, bot: BaseBot, metrics: Dict[str, Tuple[float, str]]): 38 | return 39 | 40 | def on_step_ends(self, bot: BaseBot, train_loss: 
41 |         return
42 | 
43 |     def on_load_checkpoint(self, **kwargs):
44 |         return
45 | 
46 |     def on_save_checkpoint(self):
47 |         return
48 | 
49 |     def reset(self):
50 |         return
51 | 
52 | 
53 | class MovingAverageStatsTrackerCallback(Callback):
54 |     """Log moving average of training losses, and report evaluation metrics.
55 |     """
56 | 
57 |     def __init__(self, avg_window: int, log_interval: int):
58 |         super().__init__()
59 |         self.avg_window = avg_window
60 |         self.log_interval = log_interval
61 |         self.reset()
62 |         self.timer: float = 0.0
63 | 
64 |     def on_train_starts(self, bot: BaseBot):
65 |         self.timer = time()
66 | 
67 |     def on_step_ends(self, bot: BaseBot, train_loss, train_weight):
68 |         self.train_losses.append(train_loss)
69 |         self.train_weights.append(train_weight)
70 |         if bot.step % self.log_interval == 0:
71 |             # moving average over the last `avg_window` steps, weighted by batch size
72 |             train_loss_avg = np.average(
73 |                 self.train_losses, weights=self.train_weights, axis=0)
74 |             lr = (
75 |                 bot.optimizer.lr(bot.step) if callable(bot.optimizer.lr)
76 |                 else bot.optimizer.lr
77 |             )
78 |             if not isinstance(lr, float):
79 |                 lr = lr.numpy()
80 |             speed = (time() - self.timer) / self.log_interval
81 |             # reset timer
82 |             self.timer = time()
83 |             bot.logger.info(
84 |                 f"Step %5d | loss {bot.loss_format} | lr %.2e | %.3fs per step",
85 |                 bot.step, train_loss_avg, lr, speed)
86 |             self.train_logs.append(train_loss_avg)
87 | 
88 |     def on_eval_ends(self, bot: BaseBot, metrics: Dict[str, Tuple[float, str]]):
89 |         self.metrics["step"].append(bot.step)
90 |         history_length = len(self.metrics["step"])
91 |         bot.logger.info(f"Metrics at step {bot.step}:")
92 |         for metric_name, (metric_value, metric_string) in metrics.items():
93 |             self.metrics[metric_name].append((metric_value, metric_string))
94 |             assert history_length == len(
95 |                 self.metrics[metric_name]), "Inconsistent metric found!"
96 |             bot.logger.info(f"{metric_name}: {metric_string}")
97 | 
98 |     def on_train_ends(self, bot: BaseBot):
99 |         if self.metrics["step"]:
100 |             bot.logger.info("Training finished. Best step(s):")
101 |             for metric_name, metric_values in self.metrics.items():
102 |                 if metric_name == "step":
103 |                     continue
104 |                 best_idx = np.argmin(
105 |                     np.array([x[0] for x in metric_values]))
106 |                 bot.logger.info(
107 |                     "%s: %s @ step %d",
108 |                     metric_name, metric_values[best_idx][1],
109 |                     self.metrics["step"][best_idx]
110 |                 )
111 | 
112 |     def reset(self):
113 |         self.train_losses = deque(maxlen=self.avg_window)
114 |         self.train_weights = deque(maxlen=self.avg_window)
115 |         self.metrics = defaultdict(list)
116 |         self.train_logs = []
117 | 
118 | 
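# [Editor's note] Typical wiring, mirroring quest/train.py earlier in this
# repo (the values are illustrative only):
#
#     callbacks = [
#         MovingAverageStatsTrackerCallback(avg_window=200, log_interval=100),
#         checkpoints,  # the CheckpointCallback instance defined below
#     ]
#
# The tracker reports the weighted moving average of the training loss every
# `log_interval` steps and keeps a per-metric history for the end-of-training
# summary; note that "best" is taken via argmin, i.e. lower is better.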
119 | class CheckpointCallback(Callback):
120 |     """Save and manage checkpoints.
121 | 
122 |     TODO: Checkpoints that can be used to resume training
123 |     """
124 | 
125 |     def __init__(
126 |             self, keep_n_checkpoints: int = 1,
127 |             checkpoint_dir: Union[Path, str] = Path("./data/cache/model_cache/"),
128 |             monitor_metric: str = "loss"):
129 |         super().__init__()
130 |         assert keep_n_checkpoints > 0
131 |         self.keep_n_checkpoints = keep_n_checkpoints
132 |         self.checkpoint_dir = Path(checkpoint_dir)
133 |         self.monitor_metric = monitor_metric
134 |         self.best_performers: List[Tuple[float, Path, int]] = []
135 |         self.checkpoint_dir.mkdir(exist_ok=True, parents=True)
136 | 
137 |     def on_eval_ends(self, bot: BaseBot, metrics: Dict[str, Tuple[float, str]]):
138 |         target_value, target_string = metrics[self.monitor_metric]
139 |         target_path = (
140 |             self.checkpoint_dir /
141 |             "ckpt_{}_{}_{}_{}.h5".format(
142 |                 bot.name, target_string, bot.step,
143 |                 datetime.now().strftime("%m%d%H%M"))
144 |         )
145 |         bot.logger.debug("Saving checkpoint %s...", target_path)
146 |         if (
147 |             len(self.best_performers) < self.keep_n_checkpoints or
148 |             target_value < self.best_performers[-1][0]
149 |         ):
150 |             self.best_performers.append((target_value, target_path, bot.step))
151 |             self.remove_checkpoints(keep=self.keep_n_checkpoints)
152 |             bot.model.save_weights(str(target_path))
153 |             assert target_path.exists()
154 | 
155 |     def remove_checkpoints(self, keep):
156 |         self.best_performers = sorted(self.best_performers, key=lambda x: x[0])
157 |         for checkpoint in np.unique([
158 |                 x[1] for x in self.best_performers[keep:]]):
159 |             Path(checkpoint).unlink()
160 |         self.best_performers = self.best_performers[:keep]
161 | 
162 |     def reset(self, ignore_previous=False):
163 |         if ignore_previous:
164 |             self.best_performers = []
165 |         else:
166 |             self.remove_checkpoints(0)
167 | 
168 | 
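# [Editor's sketch] How a training script consumes this callback once
# training finishes; this mirrors the tail of quest/train.py shown earlier
# (the `bot`/`checkpoints` arguments are the caller's, values illustrative):
def _demo_checkpoint_flow(bot, checkpoints, output_path="cache/model"):
    bot.train(checkpoint_interval=1000, n_steps=10000)
    # best_performers holds (metric_value, path, step) tuples, kept sorted so
    # the lowest (best) monitored value comes first
    best_score, best_path, _ = checkpoints.best_performers[0]
    bot.model.load_weights(str(best_path))
    checkpoints.remove_checkpoints(keep=0)  # clean up intermediate files
    bot.model.save_weights(output_path + ".h5")
    return best_score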
169 | class TelegramCallback(Callback):
170 |     """A Telegram notification callback
171 | 
172 |     Reference: https://github.com/huggingface/knockknock
173 |     """
174 |     DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
175 | 
176 |     def __init__(self, token: str, chat_id: int, name: str, report_evals: bool = False):
177 |         try:
178 |             import telegram
179 |         except ImportError:
180 |             raise ImportError(
181 |                 "Please install 'python-telegram-bot' before using TelegramCallback.")
182 |         self.telegram_bot = telegram.Bot(token=token)
183 |         self.host_name = socket.gethostname()
184 |         self.report_evals = report_evals
185 |         self.chat_id = chat_id
186 |         self.name = name
187 |         self.start_time = datetime.now()
188 | 
189 |     def on_train_starts(self, bot: BaseBot):
190 |         self.start_time = datetime.now()
191 |         contents = [
192 |             f'{self.name} has started training 🎬',
193 |             'Machine name: %s' % self.host_name,
194 |             'Starting date: %s' % self.start_time.strftime(
195 |                 TelegramCallback.DATE_FORMAT)
196 |         ]
197 |         text = '\n'.join(contents)
198 |         self.telegram_bot.send_message(chat_id=self.chat_id, text=text)
199 | 
200 |     def on_train_ends(self, bot: BaseBot):
201 |         end_time = datetime.now()
202 |         elapsed_time = end_time - self.start_time
203 |         contents = [
204 |             f'{self.name} has finished training 🎉',
205 |             'Machine name: %s' % self.host_name,
206 |             'Starting date: %s' % self.start_time.strftime(
207 |                 TelegramCallback.DATE_FORMAT),
208 |             'End date: %s' % end_time.strftime(
209 |                 TelegramCallback.DATE_FORMAT),
210 |             'Training duration: %s' % str(elapsed_time)
211 |         ]
212 |         text = '\n'.join(contents)
213 |         self.telegram_bot.send_message(chat_id=self.chat_id, text=text)
214 | 
215 |     def on_eval_ends(self, bot: BaseBot, metrics: Dict[str, Tuple[float, str]]):
216 |         if not self.report_evals:
217 |             return
218 |         contents = [
219 |             f"Metrics from {self.name} at step {bot.step}:"
220 |         ]
221 |         contents += [
222 |             f"{metric_name}: {metric_string}"
223 |             for metric_name, (metric_value, metric_string) in metrics.items()
224 |         ]
225 |         text = '\n'.join(contents)
226 |         self.telegram_bot.send_message(chat_id=self.chat_id, text=text)
227 | 
228 | 
229 | class WandbCallback(Callback):
230 |     """ Callback for the Weights and Biases service
231 | 
232 |     Prerequisites: install `wandb` and run `wandb login`.
233 | 
234 |     Note: train a few more steps after the last eval to make sure the log is complete.
235 | 
236 |     WARNING: Resuming is not fully supported yet.
237 | 
238 |     Reference: https://github.com/wandb/client/raw/ef0911c47beebab0db8749d764802057d3480e69/wandb/fastai/__init__.py
239 |     """
240 | 
241 |     def __init__(self, config: Dict, name: str):
242 |         if not WANDB:
243 |             raise ImportError(
244 |                 "Please install 'wandb' before using WandbCallback.")
245 |         # project name can only be in lower case
246 |         wandb.init(config=config, project=name.lower())
247 | 
248 |     def on_step_ends(self, bot: BaseBot, train_loss: float, train_weight: int):
249 |         wandb.log({"train_loss": train_loss}, step=bot.step)
250 | 
251 |     def on_eval_ends(self, bot: BaseBot, metrics: Dict[str, Tuple[float, str]]):
252 |         metrics_ = {
253 |             metric_name: metric_value
254 |             for metric_name, (metric_value, _) in metrics.items()
255 |         }
256 |         # Rename to avoid conflicts
257 |         metrics_["val_loss"] = metrics_["loss"]
258 |         del metrics_["loss"]
259 |         # NOTE: remember to train one more step to sync the final eval metrics to the server
260 |         wandb.log(metrics_, step=bot.step)
261 | 
--------------------------------------------------------------------------------
/tf-helper-bot/tf_helper_bot/optimizers.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow.python.keras.optimizer_v2.optimizer_v2 import OptimizerV2
3 | from tensorflow.python import ops, math_ops, state_ops, control_flow_ops
4 | from tensorflow.python.keras import backend as K
5 | 
6 | __all__ = ['RAdam']
7 | 
8 | 
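# [Editor's sketch] Constructing the optimizer below with its built-in warmup
# plus linear decay enabled (hypothetical hyper-parameters; leaving
# total_steps at 0 disables the schedule and uses the plain learning rate):
def _demo_radam():
    return RAdam(
        learning_rate=3e-5, weight_decay=0.01,
        total_steps=10000, warmup_proportion=0.1, min_lr=1e-6)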
9 | class RAdam(OptimizerV2):
10 |     """RAdam optimizer.
11 | 
12 |     According to the paper
13 |     [On The Variance Of The Adaptive Learning Rate And Beyond](https://arxiv.org/pdf/1908.03265v1.pdf).
14 |     """
15 | 
16 |     def __init__(self,
17 |                  learning_rate=0.001,
18 |                  beta_1=0.9,
19 |                  beta_2=0.999,
20 |                  epsilon=1e-7,
21 |                  weight_decay=0.,
22 |                  amsgrad=False,
23 |                  total_steps=0,
24 |                  warmup_proportion=0.1,
25 |                  min_lr=0.,
26 |                  name='RAdam',
27 |                  **kwargs):
28 |         r"""Construct a new RAdam optimizer.
29 | 
30 |         Args:
31 |             learning_rate: A Tensor or a floating point value. The learning rate.
32 |             beta_1: A float value or a constant float tensor. The exponential decay
33 |                 rate for the 1st moment estimates.
34 |             beta_2: A float value or a constant float tensor. The exponential decay
35 |                 rate for the 2nd moment estimates.
36 |             epsilon: A small constant for numerical stability. This epsilon is
37 |                 "epsilon hat" in the Kingma and Ba paper (in the formula just before
38 |                 Section 2.1), not the epsilon in Algorithm 1 of the paper.
39 |             weight_decay: A floating point value. Weight decay for each param.
40 |             amsgrad: boolean. Whether to apply AMSGrad variant of this algorithm from
41 |                 the paper "On the Convergence of Adam and beyond".
42 |             total_steps: An integer. Total number of training steps.
43 |                 Enable warmup by setting a positive value.
44 |             warmup_proportion: A floating point value. The proportion of increasing steps.
45 |             min_lr: A floating point value. Minimum learning rate after warmup.
46 |             name: Optional name for the operations created when applying gradients.
47 |                 Defaults to "RAdam". @compatibility(eager) When eager execution is
48 |                 enabled, `learning_rate`, `beta_1`, `beta_2`, and `epsilon` can each be
49 |                 a callable that takes no arguments and returns the actual value to use.
50 |                 This can be useful for changing these values across different
51 |                 invocations of optimizer functions. @end_compatibility
52 |             **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
53 |                 `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
54 |                 gradients by value, `decay` is included for backward compatibility to
55 |                 allow time inverse decay of learning rate. `lr` is included for backward
56 |                 compatibility, recommended to use `learning_rate` instead.
57 |         """
58 | 
59 |         super(RAdam, self).__init__(name, **kwargs)
60 |         self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
61 |         self._set_hyper('beta_1', beta_1)
62 |         self._set_hyper('beta_2', beta_2)
63 |         self._set_hyper('decay', self._initial_decay)
64 |         self._set_hyper('weight_decay', weight_decay)
65 |         self._set_hyper('total_steps', float(total_steps))
66 |         self._set_hyper('warmup_proportion', warmup_proportion)
67 |         self._set_hyper('min_lr', min_lr)
68 |         self.epsilon = epsilon or K.epsilon()
69 |         self.amsgrad = amsgrad
70 |         self._initial_weight_decay = weight_decay
71 |         self._initial_total_steps = total_steps
72 | 
73 |     def _create_slots(self, var_list):
74 |         for var in var_list:
75 |             self.add_slot(var, 'm')
76 |         for var in var_list:
77 |             self.add_slot(var, 'v')
78 |         if self.amsgrad:
79 |             for var in var_list:
80 |                 self.add_slot(var, 'vhat')
81 | 
82 |     def set_weights(self, weights):
83 |         params = self.weights
84 |         num_vars = int((len(params) - 1) / 2)
85 |         if len(weights) == 3 * num_vars + 1:
86 |             weights = weights[:len(params)]
87 |         super(RAdam, self).set_weights(weights)
88 | 
89 |     def _resource_apply_dense(self, grad, var):
90 |         var_dtype = var.dtype.base_dtype
91 |         lr_t = self._decayed_lr(var_dtype)
92 |         m = self.get_slot(var, 'm')
93 |         v = self.get_slot(var, 'v')
94 |         beta_1_t = self._get_hyper('beta_1', var_dtype)
95 |         beta_2_t = self._get_hyper('beta_2', var_dtype)
96 |         epsilon_t = ops.convert_to_tensor(self.epsilon, var_dtype)
97 |         local_step = math_ops.cast(self.iterations + 1, var_dtype)
98 |         beta_1_power = math_ops.pow(beta_1_t, local_step)
99 |         beta_2_power = math_ops.pow(beta_2_t, local_step)
100 | 
101 |         if self._initial_total_steps > 0:
102 |             total_steps = self._get_hyper('total_steps', var_dtype)
103 |             warmup_steps = total_steps * \
104 |                 self._get_hyper('warmup_proportion', var_dtype)
105 |             min_lr = self._get_hyper('min_lr', var_dtype)
106 |             decay_steps = K.maximum(total_steps - warmup_steps, 1)
107 |             decay_rate = (min_lr - lr_t) / decay_steps
108 |             lr_t = tf.where(
109 |                 local_step <= warmup_steps,
110 |                 lr_t * (local_step / warmup_steps),
111 |                 lr_t + decay_rate *
112 |                 K.minimum(local_step - warmup_steps, decay_steps),
113 |             )
114 | 
115 |         sma_inf = 2.0 / (1.0 - beta_2_t) - 1.0
116 |         sma_t = sma_inf - 2.0 * local_step * \
117 |             beta_2_power / (1.0 - beta_2_power)
118 | 
119 |         m_t = state_ops.assign(m,
120 |                                beta_1_t * m + (1.0 - beta_1_t) * grad,
121 |                                use_locking=self._use_locking)
122 |         m_corr_t = m_t / (1.0 - beta_1_power)
123 | 
124 |         v_t = state_ops.assign(v,
125 |                                beta_2_t * v + (1.0 - beta_2_t) *
126 |                                math_ops.square(grad),
127 |                                use_locking=self._use_locking)
128 |         if self.amsgrad:
129 |             vhat = self.get_slot(var, 'vhat')
130 |             vhat_t = state_ops.assign(vhat,
131 |                                       math_ops.maximum(vhat, v_t),
132 |                                       use_locking=self._use_locking)
133 |             v_corr_t = math_ops.sqrt(vhat_t / (1.0 - beta_2_power))
134 |         else:
135 |             vhat_t = None
136 |             v_corr_t = math_ops.sqrt(v_t / (1.0 - beta_2_power))
137 | 
138 |         r_t = math_ops.sqrt((sma_t - 4.0) / (sma_inf - 4.0) *
139 |                             (sma_t - 2.0) / (sma_inf - 2.0) *
140 |                             sma_inf / sma_t)
141 | 
142 |         var_t = tf.where(sma_t >= 5.0, r_t * m_corr_t /
143 |                          (v_corr_t + epsilon_t), m_corr_t)
144 | 
145 |         if self._initial_weight_decay > 0.0:
146 |             var_t += self._get_hyper('weight_decay', var_dtype) * var
147 | 
148 |         var_update = state_ops.assign_sub(var,
149 |                                           lr_t * var_t,
150 |                                           use_locking=self._use_locking)
151 | 
152 |         updates = [var_update, m_t, v_t]
153 |         if self.amsgrad:
154 |             updates.append(vhat_t)
155 |         return control_flow_ops.group(*updates)
156 | 
157 |     def _resource_apply_sparse(self, grad, var, indices):
158 |         var_dtype = var.dtype.base_dtype
159 |         lr_t = self._decayed_lr(var_dtype)
160 |         beta_1_t = self._get_hyper('beta_1', var_dtype)
161 |         beta_2_t = self._get_hyper('beta_2', var_dtype)
162 |         epsilon_t = ops.convert_to_tensor(self.epsilon, var_dtype)
163 |         local_step = math_ops.cast(self.iterations + 1, var_dtype)
164 |         beta_1_power = math_ops.pow(beta_1_t, local_step)
165 |         beta_2_power = math_ops.pow(beta_2_t, local_step)
166 | 
167 |         if self._initial_total_steps > 0:
168 |             total_steps = self._get_hyper('total_steps', var_dtype)
169 |             warmup_steps = total_steps * \
170 |                 self._get_hyper('warmup_proportion', var_dtype)
171 |             min_lr = self._get_hyper('min_lr', var_dtype)
172 |             decay_steps = K.maximum(total_steps - warmup_steps, 1)
173 |             decay_rate = (min_lr - lr_t) / decay_steps
174 |             lr_t = tf.where(
175 |                 local_step <= warmup_steps,
176 |                 lr_t * (local_step / warmup_steps),
177 |                 lr_t + decay_rate *
178 |                 K.minimum(local_step - warmup_steps, decay_steps),
179 |             )
180 | 
181 |         sma_inf = 2.0 / (1.0 - beta_2_t) - 1.0
182 |         sma_t = sma_inf - 2.0 * local_step * \
183 |             beta_2_power / (1.0 - beta_2_power)
184 | 
185 |         m = self.get_slot(var, 'm')
186 |         m_scaled_g_values = grad * (1 - beta_1_t)
187 |         m_t = state_ops.assign(m, m * beta_1_t, use_locking=self._use_locking)
188 |         with ops.control_dependencies([m_t]):
189 |             m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)
190 |         m_corr_t = m_t / (1.0 - beta_1_power)
191 | 
192 |         v = self.get_slot(var, 'v')
193 |         v_scaled_g_values = (grad * grad) * (1 - beta_2_t)
194 |         v_t = state_ops.assign(v, v * beta_2_t, use_locking=self._use_locking)
195 |         with ops.control_dependencies([v_t]):
196 |             v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)
197 | 
198 |         if self.amsgrad:
199 |             vhat = self.get_slot(var, 'vhat')
200 |             vhat_t = state_ops.assign(vhat,
201 |                                       math_ops.maximum(vhat, v_t),
202 |                                       use_locking=self._use_locking)
203 |             v_corr_t = math_ops.sqrt(vhat_t / (1.0 - beta_2_power))
204 |         else:
205 |             vhat_t = None
206 |             v_corr_t = math_ops.sqrt(v_t / (1.0 - beta_2_power))
207 | 
208 |         r_t = math_ops.sqrt((sma_t - 4.0) / (sma_inf - 4.0) *
209 |                             (sma_t - 2.0) / (sma_inf - 2.0) *
210 |                             sma_inf / sma_t)
211 | 
212 |         var_t = tf.where(sma_t >= 5.0, r_t * m_corr_t /
213 |                          (v_corr_t + epsilon_t), m_corr_t)
214 | 
215 |         if self._initial_weight_decay > 0.0:
216 |             var_t += self._get_hyper('weight_decay', var_dtype) * var
217 | 
218 |         var_update = self._resource_scatter_add(
219 |             var, indices, tf.gather(-lr_t * var_t, indices))
220 | 
221 |         updates = [var_update, m_t, v_t]
222 |         if self.amsgrad:
223 |             updates.append(vhat_t)
224 |         return control_flow_ops.group(*updates)
225 | 
226 |     def get_config(self):
227 |         config = super(RAdam, self).get_config()
228 |         config.update({
229 |             'learning_rate': self._serialize_hyperparameter('learning_rate'),
230 |             'beta_1': self._serialize_hyperparameter('beta_1'),
231 |             'beta_2': self._serialize_hyperparameter('beta_2'),
232 |             'decay': self._serialize_hyperparameter('decay'),
233 |             'weight_decay': self._serialize_hyperparameter('weight_decay'),
234 |             'epsilon': self.epsilon,
235 |             'amsgrad': self.amsgrad,
236 |             'total_steps': self._serialize_hyperparameter('total_steps'),
237 |             'warmup_proportion': self._serialize_hyperparameter('warmup_proportion'),
238 |             'min_lr': self._serialize_hyperparameter('min_lr'),
239 |         })
240 |         return config
241 | 
--------------------------------------------------------------------------------
/tf-helper-bot/tf_helper_bot/bot.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from pathlib import Path
3 | from typing import Callable, Sequence, Union, Optional
4 | 
5 | import numpy as np
6 | import tensorflow as tf
7 | from dataclasses import dataclass
8 | from tqdm.autonotebook import tqdm
9 | 
10 | from .logger import Logger
11 | 
12 | 
13 | @dataclass
14 | class BaseBot:
15 |     """Base Interface to Model Training and Inference"""
16 |     train_dataset: tf.data.Dataset
17 |     valid_dataset: tf.data.Dataset
18 |     steps_per_epoch: int
19 |     criterion: Callable
20 |     model: tf.keras.Model
21 |     optimizer: tf.keras.optimizers.Optimizer
22 |     name: str = "basebot"
23 |     log_dir: Union[Path, str] = "./logs"
24 |     log_level: int = logging.INFO
25 |     loss_format: str = "%.4f"
26 |     echo: bool = True
27 |     pbar: bool = True
28 |     step: int = 0
29 |     total_steps: int = 0
30 |     valid_steps: Optional[int] = None
31 |     gradient_accumulation_steps: int = 1
32 |     metrics: Sequence = ()
33 |     callbacks: Sequence = ()
34 |     mixed_precision: bool = False
35 | 
36 |     def __post_init__(self):
37 |         self._gradients = []
38 |         self.logger = Logger(
39 |             self.name, Path(self.log_dir), self.log_level,
40 |             echo=self.echo
41 |         )
42 | 
43 |         @tf.function
44 |         def get_gradient(input_tensors, target):
45 |             with tf.GradientTape() as tape:
46 |                 output = self.model(
47 |                     input_tensors, training=True)
48 |                 loss_raw = self.criterion(
49 |                     target, self._extract_prediction(output)
50 |                 )
51 |                 loss_ = (
52 |                     self.optimizer.get_scaled_loss(loss_raw)
53 |                     if self.mixed_precision else loss_raw
54 |                 )
55 |             gradients_ = tape.gradient(
56 |                 loss_, self.model.trainable_variables)
57 |             if self.mixed_precision:
58 |                 gradients_ = self.optimizer.get_unscaled_gradients(gradients_)
59 |             return loss_raw, gradients_
60 | 
61 |         @tf.function
62 |         def step_optimizer(gradients):
63 |             self.optimizer.apply_gradients(
64 |                 zip(
65 |                     gradients,
66 |                     self.model.trainable_variables
67 |                 )
68 |             )
69 | 
70 |         @tf.function
71 |         def predict_batch(input_tensors):
72 |             return self.model(input_tensors, training=False)
73 | 
74 |         self._get_gradient = get_gradient
75 |         self._step_optimizer = step_optimizer
76 |         self._predict_batch = predict_batch
77 | 
78 |     @staticmethod
79 |     def _sum_indexed_slice(grad_1, grad_2, div_):
80 |         values = tf.concat([grad_1.values, grad_2.values / div_], 0)
81 |         indices = tf.concat([grad_1.indices, grad_2.indices], 0)
82 |         return tf.IndexedSlices(values, indices)
83 | 
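    # [Editor's note] Gradient accumulation in train_one_step below averages
    # micro-batch gradients: each of the `gradient_accumulation_steps`
    # gradients is divided by that count before summing, e.g. with 2 steps
    # and raw gradients g1, g2 the applied update uses g1/2 + g2/2, which
    # equals (g1 + g2)/2. `_sum_indexed_slice` (above) handles the sparse
    # IndexedSlices case, where the division is applied to the slice values
    # before concatenation.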
84 |     def train_one_step(self, input_tensor_list, target):
85 |         loss, gradients = self._get_gradient(
86 |             input_tensor_list[0], target)
87 |         if self.gradient_accumulation_steps > 1:
88 |             div_ = tf.constant(
89 |                 self.gradient_accumulation_steps,
90 |                 dtype=tf.float32
91 |             )
92 |             gradients = [x / div_ for x in gradients]
93 |             # NOTE: a redundant second call to self._get_gradient() used to
94 |             # sit here; it overwrote the scaled gradients from the line above
95 |             for i in range(1, self.gradient_accumulation_steps):
96 |                 loss_, gradients_ = self._get_gradient(
97 |                     input_tensor_list[i], target)
98 |                 gradients = [
99 |                     grad_1 + grad_2 / div_ if not isinstance(grad_1, tf.IndexedSlices)
100 |                     else self._sum_indexed_slice(grad_1, grad_2, div_)
101 |                     for grad_1, grad_2 in zip(gradients, gradients_)
102 |                 ]
103 |                 loss = loss + loss_
104 |             loss = loss / tf.constant(
105 |                 self.gradient_accumulation_steps,
106 |                 dtype=tf.float32
107 |             )
108 |         self._step_optimizer(gradients)
109 |         return loss
110 | 
111 |     @staticmethod
112 |     def _extract_prediction(output):
113 |         """Can be overridden to act as a shortcut to transform model outputs.
114 | 
115 |         Useful when using a pretrained model whose outputs are not in the desired format.
116 |         """
117 |         return output
118 | 
119 |     def train(self, *, checkpoint_interval, n_steps=None, total_steps=None):
120 |         if total_steps:
121 |             self.total_steps = total_steps
122 |         if n_steps is None:
123 |             if self.total_steps is None:
124 |                 raise ValueError("n_steps and total_steps cannot both be None")
125 |             n_steps = self.total_steps - self.step
126 |         elif self.total_steps is None:
127 |             self.total_steps = n_steps
128 |         target_step = self.step + n_steps
129 |         input_tensor_list, cnt = [], 0
130 |         # Train starts
131 |         self.run_train_starts_callbacks()
132 |         try:
133 |             while self.step < target_step:
134 |                 for input_tensors, targets in self.train_dataset:
135 |                     self.step += 1
136 |                     input_tensors, targets = self.run_batch_inputs_callbacks(
137 |                         input_tensors, targets)
138 |                     input_tensor_list.append(input_tensors)
139 |                     cnt += self.get_batch_size(input_tensors)
140 |                     if len(input_tensor_list) == self.gradient_accumulation_steps:
141 |                         loss = self.train_one_step(
142 |                             input_tensor_list, targets
143 |                         )
144 |                         # Step ends
145 |                         self.run_step_ends_callbacks(loss.numpy(), cnt)
146 |                         input_tensor_list, cnt = [], 0
147 |                     if (
148 |                         (callable(checkpoint_interval) and checkpoint_interval(self.step)) or
149 |                         (
150 |                             not callable(checkpoint_interval) and
151 |                             self.step % checkpoint_interval == 0
152 |                         )
153 |                     ):
154 |                         # Eval starts
155 |                         metrics = self.eval(self.valid_dataset)
156 |                         # Eval ends
157 |                         self.run_eval_ends_callbacks(metrics)
158 |                     if self.step >= target_step:
159 |                         break
160 |                     # Epoch ends
161 |                     if self.step % self.steps_per_epoch == 0:
162 |                         self.run_epoch_ends_callbacks(
163 |                             self.step // self.steps_per_epoch)
164 |         except KeyboardInterrupt:
165 |             pass
166 |         finally:
167 |             # Train ends
168 |             self.run_train_ends_callbacks()
169 | 
170 |     def predict_batch(self, input_tensors):
171 |         """To be overridden in distributed modes"""
172 |         return self._extract_prediction(
173 |             self._predict_batch(input_tensors)
174 |         )
175 | 
176 |     def _extract_target_for_eval(self, target):
177 |         return target
178 | 
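    # [Editor's note] Typical inference usage (a sketch; names illustrative):
    #
    #     preds = bot.predict(valid_dataset)
    #     preds, ys = bot.predict(valid_dataset, return_y=True)
    #
    # Models whose call() returns a tuple (e.g. transformers-style
    # (logits, ...) outputs) can be adapted by overriding _extract_prediction
    # in a subclass:
    #
    #     class TupleOutputBot(BaseBot):
    #         @staticmethod
    #         def _extract_prediction(output):
    #             return output[0]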
179 |     def predict(self, dataset, *, return_y=False):
180 |         # NOTE: removed a stray `self.model.eval()` here (a PyTorch idiom); predict_batch already runs the model with training=False
181 |         outputs, y_global = [], []
182 |         for *input_tensors, y_local in tqdm(dataset, disable=not self.pbar):
183 |             outputs.append(self.predict_batch(input_tensors).numpy())
184 |             if return_y:
185 |                 y_global.append(
186 |                     self._extract_target_for_eval(y_local).numpy())
187 |         outputs = np.concatenate(outputs, axis=0)
188 |         if return_y:
189 |             y_global = np.concatenate(y_global, axis=0)
190 |             return outputs, y_global
191 |         return outputs
192 | 
193 |     def eval(self, dataset):
194 |         """Warning: Only support datasets whose predictions and labels together fit in memory."""
195 |         preds, ys = [], []
196 |         losses, weights = [], []
197 |         self.logger.debug("Evaluating...")
198 |         for input_tensors, y_local in tqdm(dataset, disable=not self.pbar, total=self.valid_steps, ncols=100):
199 |             output = self.predict_batch(input_tensors)
200 |             y_local = self._extract_target_for_eval(y_local)
201 |             batch_loss = self.criterion(y_local, output)
202 |             losses.append(batch_loss.numpy())
203 |             weights.append(y_local.shape[0])
204 |             # Save batch labels and predictions
205 |             preds.append(output.numpy())
206 |             ys.append(y_local.numpy())
207 |         loss = np.average(losses, weights=weights)
208 |         metrics = {"loss": (loss, self.loss_format % loss)}
209 |         global_ys, global_preds = np.concatenate(ys), np.concatenate(preds)
210 |         for metric in self.metrics:
211 |             metric_loss, metric_string = metric(global_ys, global_preds)
212 |             metrics[metric.name] = (metric_loss, metric_string)
213 |         return metrics
214 | 
215 |     def get_batch_size(self, input_tensors):
216 |         if isinstance(input_tensors, list):
217 |             return self.get_batch_size(input_tensors[0])
218 |         elif isinstance(input_tensors, dict):
219 |             return self.get_batch_size(list(input_tensors.values())[0])
220 |         return input_tensors.shape[0]
221 | 
222 |     def run_batch_inputs_callbacks(self, input_tensors, targets):
223 |         for callback in self.callbacks:
224 |             input_tensors, targets = callback.on_batch_inputs(
225 |                 self, input_tensors, targets)
226 |         return input_tensors, targets
227 | 
228 |     def run_step_ends_callbacks(self, train_loss, train_weight):
229 |         for callback in self.callbacks:
230 |             callback.on_step_ends(self, train_loss, train_weight)
231 | 
232 |     def run_train_starts_callbacks(self):
233 |         for callback in self.callbacks:
234 |             callback.on_train_starts(self)
235 | 
236 |     def run_train_ends_callbacks(self):
237 |         for callback in self.callbacks:
238 |             callback.on_train_ends(self)
239 | 
240 |     def run_epoch_ends_callbacks(self, epoch):
241 |         for callback in self.callbacks:
242 |             callback.on_epoch_ends(self, epoch)
243 | 
244 |     def run_eval_ends_callbacks(self, metrics):
245 |         for callback in self.callbacks:
246 |             callback.on_eval_ends(self, metrics)
247 | 
248 | 
249 | @dataclass
250 | class BaseDistributedBot(BaseBot):
251 |     """Adds tf.distribute (e.g. TPU) support on top of BaseBot"""
252 |     strategy: tf.distribute.Strategy = None
253 | 
254 |     def __post_init__(self):
255 |         assert self.strategy is not None
256 |         assert self.gradient_accumulation_steps == 1, (
257 |             "Distribution mode doesn't support gradient accumulation"
258 |         )
259 |         super().__post_init__()
260 |         @tf.function
261 |         def train_one_step(input_tensor_list, target):
262 |             loss, gradients = self._get_gradient(
263 |                 input_tensor_list[0], target)
264 |             self._step_optimizer(gradients)
265 |             return loss
266 | 
267 |         self._train_one_step = train_one_step
268 | 
269 |     def train_one_step(self, input_tensors, target):
270 |         loss = self.strategy.experimental_run_v2(
271 |             self._train_one_step,
272 |             args=(input_tensors, target)
273 |         )
274 |         return self.strategy.reduce(
275 |             tf.distribute.ReduceOp.MEAN, loss, axis=None
276 |         )
277 | 
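    # [Editor's note] `Strategy.experimental_run_v2` matches the TF 2.1-era
    # API targeted by requirements.txt; later TF 2.x releases renamed it to
    # `Strategy.run`, so these call sites need updating when upgrading.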
278 |     def get_batch_size(self, input_tensors):
279 |         # Just use a rough estimate for speed
280 |         return 1
281 |         # the following can be slow (and unnecessary in most cases)
282 |         # if isinstance(input_tensors, list):
283 |         #     x_per_gpu_as_list = self.strategy.experimental_local_results(
284 |         #         input_tensors[0])
285 |         # else:
286 |         #     x_per_gpu_as_list = self.strategy.experimental_local_results(
287 |         #         input_tensors)
288 |         # batch_sizes = [tf.shape(x_gpu)[0] for x_gpu in x_per_gpu_as_list]
289 |         # return tf.reduce_sum(tf.stack(batch_sizes)).numpy()
290 | 
291 |     def _extract_target_for_eval(self, target):
292 |         return tf.concat(
293 |             self.strategy.experimental_local_results(target),
294 |             axis=0
295 |         )
296 | 
297 |     def predict_batch(self, input_tensors):
298 |         preds = self.strategy.experimental_run_v2(
299 |             self._predict_batch,
300 |             args=(input_tensors,)
301 |         )
302 |         if isinstance(preds, tuple):
303 |             # WARNING: This might not be applicable in all situations
304 |             preds = preds[0]
305 |         preds_local = tf.concat(
306 |             preds.values, axis=0
307 |         )
308 |         return preds_local
309 | 
--------------------------------------------------------------------------------