├── LICENSE
├── README.md
├── mtbeam_human_eval.npz.npy
└── run.py

/LICENSE:
--------------------------------------------------------------------------------
BSD 3-Clause License

Copyright (c) 2019,
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# dl4dial-bayesian-calibration

This code runs Bayesian inference on human evaluation data using MCMC with the NUTS sampler. It returns the mean and standard deviation of the empirical score distribution for each model.

Check our paper for more details:

https://uralik.github.io/beamdream/

https://arxiv.org/abs/1811.00907

## Dependencies
* Python 3
* Pyro (0.3.2)
* PyTorch
* NumPy

## Example dataset

In `mtbeam_human_eval.npz.npy` you can find an example matrix of human scores assigned to conversations. Each row in the matrix corresponds to a tuple: `(model_id, annotator_id, score)`.

`model_id` and `annotator_id` are non-negative integers starting from 0. `score` is an integer from 0 to 3.
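For a quick sanity check of the input format, the array can be loaded and inspected with NumPy. A minimal sketch; the column layout follows the description above:

```python
import numpy

# each row is (model_id, annotator_id, score)
data = numpy.load('./mtbeam_human_eval.npz.npy')

print(data.shape)                          # (n_annotations, 3)
print(int(data[:, 0].max()) + 1)           # number of models
print(int(data[:, 1].max()) + 1)           # number of annotators
print(data[:, 2].min(), data[:, 2].max())  # score range, expected 0 to 3
```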
## Running the script

`python run.py --input-array ./mtbeam_human_eval.npz.npy --num-samples 50 --num-warmup-samples 100`

The number of samples required for a reliable estimate depends on the size of your data.
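The same inference can also be run programmatically. A minimal sketch, assuming `run.py` is importable from the working directory; it mirrors what `main()` in the script below does:

```python
import numpy
from run import infer

data = numpy.load('./mtbeam_human_eval.npz.npy')
config = {
    'num-samples': 50,
    'warmup-steps': 100,
    'n_models': int(data[:, 0].max()) + 1,
    'n_turkers': int(data[:, 1].max()) + 1,
}
mean, std = infer(data, config)  # per-model posterior mean and std
```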
## Citation

If you use this calibration method, please cite this work:
```
@article{kulikov2018importance,
  title={Importance of a search strategy in neural dialogue modelling},
  author={Kulikov, Ilya and Miller, Alexander H and Cho, Kyunghyun and Weston, Jason},
  journal={arXiv preprint arXiv:1811.00907},
  year={2018}
}
```
--------------------------------------------------------------------------------
/mtbeam_human_eval.npz.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nyu-dl/dl4dial-bayesian-calibration/0512d0a21f5c0fbd819f73e79ef849e93011c82b/mtbeam_human_eval.npz.npy
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
import logging
import argparse

import numpy
import torch
import pyro
import pyro.infer
import pyro.infer.mcmc
import pyro.distributions as dist


def get_arguments():
    parser = argparse.ArgumentParser(description='Bayesian Calibration arguments')
    parser.add_argument('--logging-level', default='WARNING', choices=['WARNING', 'DEBUG'])
    parser.add_argument('--input-array', type=str, required=True)
    parser.add_argument('--num-samples', type=int, default=150)
    parser.add_argument('--num-warmup-samples', type=int, default=100)
    return parser.parse_args()


def sigmoid(x):
    # logistic helper (not used by the inference below)
    return 1. / (1. + numpy.exp(-x))


def model_single_score(data, config):
    """
    m_a ~ U(0, 3): prior over each model's mean score
    s_a ~ N(m_a, 1^2): per-model score given the sampled mean
    """
    zm = []
    for mi in range(config['n_models']):
        mu_ = pyro.sample("model-mean-{}".format(mi), dist.Uniform(0., 3.))
        zm.append(pyro.sample("model-{}".format(mi), dist.Normal(mu_, 1.)))

    """
    s_t ~ N(0, 1^2): prior bias for each annotator, zero by default
    """
    tm = []
    for ti in range(config['n_turkers']):
        tm.append(pyro.sample("turker-mean-{}".format(ti), dist.Normal(0., 1.)))

    """
    s | a, t ~ N(s_a + s_t, 1^2): likelihood of each score given by
    annotator t for model a
    """
    mu = []
    for sc in data:
        # mean for this observation: model score plus annotator bias
        mu.append(zm[int(sc[0])] + tm[int(sc[1])])
    mu_ = torch.stack(mu)

    return pyro.sample("scores", dist.Normal(mu_, 1.))


def infer(data, config):
    # condition the model on the observed scores, then sample with NUTS
    observed_single_scores = torch.Tensor([tup[2] for tup in data])
    single_score_condition = pyro.condition(model_single_score, data={'scores': observed_single_scores})
    nuts_kernel = pyro.infer.mcmc.NUTS(single_score_condition, adapt_step_size=True, step_size=0.1)
    mcmc_run = pyro.infer.mcmc.MCMC(nuts_kernel, num_samples=config['num-samples'],
                                    warmup_steps=config['warmup-steps']).run(data, config)
    score_marginal = pyro.infer.EmpiricalMarginal(mcmc_run, sites=["model-{}".format(mi) for mi in range(config['n_models'])])

    return score_marginal.mean, score_marginal.stddev


def prepare_data(args):
    data = numpy.load(args.input_array)

    # model and annotator ids are assumed to be 0-indexed
    n_turkers = int(data[:, 1].max()) + 1
    n_models = int(data[:, 0].max()) + 1

    config = {
        'logging-level': args.logging_level,
        'num-samples': args.num_samples,
        'warmup-steps': args.num_warmup_samples,
        'n_models': n_models,
        'n_turkers': n_turkers,
    }
    return config, data


def main():
    args = get_arguments()
    # configure logging at the requested level
    logging.basicConfig(level=getattr(logging, args.logging_level))
    config, data = prepare_data(args)

    mean, std = infer(data, config)
    print('Empirical mean: {}\n\n'.format(mean))
    print('Empirical std: {}\n\n'.format(std))


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------