├── LICENSE
├── README.md
├── mtbeam_human_eval.npz.npy
└── run.py

/LICENSE:
--------------------------------------------------------------------------------
BSD 3-Clause License

Copyright (c) 2019,
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# dl4dial-bayesian-calibration

This code runs Bayesian inference on human evaluation data using MCMC with the NUTS sampler. It returns the mean and standard deviation of the empirical score distribution for each model.

Check our paper for more details:

https://uralik.github.io/beamdream/

https://arxiv.org/abs/1811.00907

## Dependencies
* Python 3
* Pyro (0.3.2)
* PyTorch
* NumPy

## Example dataset

In `mtbeam_human_eval.npz.npy` you can find an example matrix of human scores assigned to conversations. Each row in the matrix corresponds to a tuple: `(model_id, annotator_id, score)`.

`model_id` and `annotator_id` are non-negative integers starting from 0. `score` is an integer from 0 to 3.
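For a quick sanity check of the input format, the array can be loaded and inspected with NumPy. A minimal sketch; the column layout follows the description above:

```python
import numpy

# each row is (model_id, annotator_id, score)
data = numpy.load('./mtbeam_human_eval.npz.npy')

print(data.shape)                          # (n_annotations, 3)
print(int(data[:, 0].max()) + 1)           # number of models
print(int(data[:, 1].max()) + 1)           # number of annotators
print(data[:, 2].min(), data[:, 2].max())  # score range, expected 0 to 3
```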
## Running the script

`python run.py --input-array ./mtbeam_human_eval.npz.npy --num-samples 50 --num-warmup-samples 100`

The number of samples required for a reliable estimate depends on the size of your data.
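The same inference can also be run programmatically. A minimal sketch, assuming `run.py` is importable from the working directory; it mirrors what `main()` in the script below does:

```python
import numpy
from run import infer

data = numpy.load('./mtbeam_human_eval.npz.npy')
config = {
    'num-samples': 50,
    'warmup-steps': 100,
    'n_models': int(data[:, 0].max()) + 1,
    'n_turkers': int(data[:, 1].max()) + 1,
}
mean, std = infer(data, config)  # per-model posterior mean and std
```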
## Citation

If you use this calibration method, please cite this work:
```
@article{kulikov2018importance,
  title={Importance of a search strategy in neural dialogue modelling},
  author={Kulikov, Ilya and Miller, Alexander H and Cho, Kyunghyun and Weston, Jason},
  journal={arXiv preprint arXiv:1811.00907},
  year={2018}
}
```
--------------------------------------------------------------------------------
/mtbeam_human_eval.npz.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nyu-dl/dl4dial-bayesian-calibration/0512d0a21f5c0fbd819f73e79ef849e93011c82b/mtbeam_human_eval.npz.npy
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
import logging
import argparse

import numpy
import torch
import pyro
import pyro.infer
import pyro.infer.mcmc
import pyro.distributions as dist


def get_arguments():
    parser = argparse.ArgumentParser(description='Bayesian Calibration arguments')
    parser.add_argument('--logging-level', default='WARNING', choices=['WARNING', 'DEBUG'])
    parser.add_argument('--input-array', type=str, required=True)
    parser.add_argument('--num-samples', type=int, default=150)
    parser.add_argument('--num-warmup-samples', type=int, default=100)
    return parser.parse_args()


def sigmoid(x):
    # logistic helper (not used by the inference below)
    return 1. / (1. + numpy.exp(-x))


def model_single_score(data, config):
    """
    m_a ~ U(0, 3): prior over each model's mean score
    s_a ~ N(m_a, 1^2): per-model score given the sampled mean
    """
    zm = []
    for mi in range(config['n_models']):
        mu_ = pyro.sample("model-mean-{}".format(mi), dist.Uniform(0., 3.))
        zm.append(pyro.sample("model-{}".format(mi), dist.Normal(mu_, 1.)))

    """
    s_t ~ N(0, 1^2): prior bias for each annotator, zero by default
    """
    tm = []
    for ti in range(config['n_turkers']):
        tm.append(pyro.sample("turker-mean-{}".format(ti), dist.Normal(0., 1.)))

    """
    s | a, t ~ N(s_a + s_t, 1^2): likelihood of each score given by
    annotator t for model a
    """
    mu = []
    for sc in data:
        # mean for this observation: model score plus annotator bias
        mu.append(zm[int(sc[0])] + tm[int(sc[1])])
    mu_ = torch.stack(mu)

    return pyro.sample("scores", dist.Normal(mu_, 1.))


def infer(data, config):
    # condition the model on the observed scores, then sample with NUTS
    observed_single_scores = torch.Tensor([tup[2] for tup in data])
    single_score_condition = pyro.condition(model_single_score, data={'scores': observed_single_scores})
    nuts_kernel = pyro.infer.mcmc.NUTS(single_score_condition, adapt_step_size=True, step_size=0.1)
    mcmc_run = pyro.infer.mcmc.MCMC(nuts_kernel, num_samples=config['num-samples'],
                                    warmup_steps=config['warmup-steps']).run(data, config)
    score_marginal = pyro.infer.EmpiricalMarginal(mcmc_run, sites=["model-{}".format(mi) for mi in range(config['n_models'])])

    return score_marginal.mean, score_marginal.stddev


def prepare_data(args):
    data = numpy.load(args.input_array)

    # model and annotator ids are assumed to be 0-indexed
    n_turkers = int(data[:, 1].max()) + 1
    n_models = int(data[:, 0].max()) + 1

    config = {
        'logging-level': args.logging_level,
        'num-samples': args.num_samples,
        'warmup-steps': args.num_warmup_samples,
        'n_models': n_models,
        'n_turkers': n_turkers,
    }
    return config, data


def main():
    args = get_arguments()
    # configure logging at the requested level
    logging.basicConfig(level=getattr(logging, args.logging_level))
    config, data = prepare_data(args)

    mean, std = infer(data, config)
    print('Empirical mean: {}\n\n'.format(mean))
    print('Empirical std: {}\n\n'.format(std))


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------