├── .dockerignore ├── .gitignore ├── CODEOWNERS ├── Dockerfile ├── LICENSE ├── README.md ├── __init__.py ├── dataset.py ├── evaluate.py ├── models ├── __init__.py └── glad.py ├── preprocess_data.py ├── requirements.txt ├── train.py ├── utils.py └── version.txt /.dockerignore: -------------------------------------------------------------------------------- 1 | exp/ 2 | data/ 3 | Dockerfile 4 | .git/ 5 | *.py[cod] 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | Mar*/ 3 | .DS_Store 4 | *.py[cod] 5 | *.json 6 | *.json[~] 7 | *.save 8 | *.log 9 | *.model 10 | *.t7 11 | *.npy 12 | *.flist 13 | *.zip 14 | *.gzip 15 | *.tar 16 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Comment line immediately above ownership line is reserved for related gus information. Please be careful while editing. 2 | #ECCN:Open Source 3 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:9.0-base-ubuntu16.04 2 | 3 | # install Miniconda 4 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 5 | ENV PATH /opt/conda/bin:$PATH 6 | 7 | RUN apt-get update --fix-missing && apt-get install -y wget bzip2 ca-certificates \ 8 | libglib2.0-0 libxext6 libsm6 libxrender1 \ 9 | git mercurial subversion 10 | 11 | RUN wget --quiet https://repo.continuum.io/miniconda/Miniconda3-4.4.10-Linux-x86_64.sh -O ~/miniconda.sh && \ 12 | /bin/bash ~/miniconda.sh -b -p /opt/conda && \ 13 | rm ~/miniconda.sh && \ 14 | /opt/conda/bin/conda clean -tipsy && \ 15 | ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ 16 | echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ 17 | echo "conda activate base" >> ~/.bashrc 18 | 19 | # copy GLAD 20 | RUN mkdir -p /opt/glad 21 | WORKDIR /opt/glad 22 | 23 | # install dependencies 24 | COPY requirements.txt . 25 | RUN pip install -r requirements.txt 26 | 27 | # copy source 28 | COPY . . 29 | 30 | # volumes and environment variables 31 | ENV EMBEDDINGS_ROOT /opt/embeddings 32 | RUN mkdir -p /opt/embeddings 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018, Salesforce 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Global-Locally Self-Attentive Dialogue State Tracker 2 | 3 | This repository contains an implementation of the [Global-Locally Self-Attentive Dialogue State Tracker (GLAD)](https://arxiv.org/abs/1805.09655). 4 | If you use this in your work, please cite the following: 5 | 6 | ``` 7 | @inproceedings{ zhong2018global, 8 | title={ Global-Locally Self-Attentive Encoder for Dialogue State Tracking }, 9 | author={ Zhong, Victor and Xiong, Caiming and Socher, Richard }, 10 | booktitle={ ACL }, 11 | year={ 2018 } 12 | } 13 | ``` 14 | 15 | 16 | # Install dependencies 17 | 18 | Using Docker 19 | 20 | ``` 21 | docker build -t glad:0.4 . 22 | docker run --name embeddings -d vzhong/embeddings:0.0.5 # get the embeddings 23 | env NV_GPU=0 nvidia-docker run --name glad -d -t --net host --volumes-from embeddings glad:0.4 24 | ``` 25 | 26 | If you do not want to build the Docker image, then run the following (you still need to run the CoreNLP server). 27 | 28 | ``` 29 | pip install -r requirements.txt 30 | ``` 31 | 32 | # Download and annotate data 33 | 34 | This project uses Stanford CoreNLP to annotate the dataset. 35 | In particular, we use the [Stanford NLP Stanza python interface](https://github.com/stanfordnlp/stanza). 36 | To run the server, do 37 | 38 | ``` 39 | docker run --name corenlp -d -p 9000:9000 vzhong/corenlp-server 40 | ``` 41 | 42 | The first time you preprocess the data, the script will [download word embeddings and character embeddings and put them into a SQLite database](https://github.com/vzhong/embeddings), which will be slow. 43 | Subsequent runs will be much faster. 44 | 45 | ``` 46 | docker exec glad python preprocess_data.py 47 | ``` 48 | 49 | The raw data will be stored in `data/woz/raw` of the container. 50 | The annotation results will be stored in `data/woz/ann` of the container. 51 | 52 | If you do not want to build the Docker image, then run 53 | 54 | ``` 55 | python preprocess_data.py 56 | ``` 57 | 58 | 59 | # Train model 60 | 61 | You can check out the training options via `python train.py -h`. 62 | By default, `train.py` will save checkpoints to `exp/glad/default`. 63 | 64 | ``` 65 | docker exec glad python train.py --gpu 0 66 | ``` 67 | 68 | You can attach to the container via `docker exec -it glad /bin/bash` to look at what's inside or `docker cp glad:/opt/glad/exp exp` to copy out the experiment results.
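Training progress can also be followed through the log that `train.py` writes next to its checkpoints; each epoch it records a summary that includes the dev joint goal used for early stopping. As a rough sketch — assuming the default model and nickname, so the output directory inside the container is `exp/glad/default` — you can tail it with:

```
docker exec glad tail -f exp/glad/default/train.log
```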
69 | 70 | If you do not want to build the Docker image, then run 71 | 72 | ``` 73 | python train.py --gpu 0 74 | ``` 75 | 76 | 77 | # Evaluation 78 | 79 | You can evaluate the model using 80 | 81 | ``` 82 | docker exec glad python evaluate.py --gpu 0 --split test exp/glad/default 83 | ``` 84 | 85 | You can also dump a predictions file by specifying the `--fout` flag. 86 | In this case, the output will be a list of lists. 87 | Each `i`th sublist is the set of predicted slot-value pairs for the `i`th turn. 88 | Please see `evaluate.py` to see how to match up the turn predictions with the dialogues. 89 | 90 | If you do not want to build the Docker image, then run 91 | 92 | ``` 93 | python evaluate.py --gpu 0 --split test exp/glad/default 94 | ``` 95 | 96 | 97 | # Contribution 98 | 99 | Pull requests are welcome! 100 | If you have any questions, please create an issue or contact the corresponding author at `victor victorzhong com`. 101 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/glad/d8de6a22a3be0f2a63a12b799c3041b4a8b4081d/__init__.py -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | from collections import defaultdict 3 | import numpy as np 4 | from tqdm import tqdm 5 | from stanza.nlp.corenlp import CoreNLPClient 6 | 7 | 8 | client = None 9 | 10 | 11 | def annotate(sent): 12 | global client 13 | if client is None: 14 | client = CoreNLPClient(default_annotators='ssplit,tokenize'.split(',')) 15 | words = [] 16 | for sent in client.annotate(sent).sentences: 17 | for tok in sent: 18 | words.append(tok.word) 19 | return words 20 | 21 | 22 | class Turn: 23 | 24 | def __init__(self, turn_id, transcript, turn_label, belief_state, system_acts, system_transcript, num=None): 25 | self.id = turn_id 26 | self.transcript = transcript 27 | self.turn_label = turn_label 28 | self.belief_state = belief_state 29 | self.system_acts = system_acts 30 | self.system_transcript = system_transcript 31 | self.num = num or {} 32 | 33 | def to_dict(self): 34 | return {'turn_id': self.id, 'transcript': self.transcript, 'turn_label': self.turn_label, 'belief_state': self.belief_state, 'system_acts': self.system_acts, 'system_transcript': self.system_transcript, 'num': self.num} 35 | 36 | @classmethod 37 | def from_dict(cls, d): 38 | return cls(**d) 39 | 40 | @classmethod 41 | def annotate_raw(cls, raw): 42 | system_acts = [] 43 | for a in raw['system_acts']: 44 | if isinstance(a, list): 45 | s, v = a 46 | system_acts.append(['inform'] + s.split() + ['='] + v.split()) 47 | else: 48 | system_acts.append(['request'] + a.split()) 49 | # NOTE: fix inconsistencies in data label 50 | fix = {'centre': 'center', 'areas': 'area', 'phone number': 'number'} 51 | return cls( 52 | turn_id=raw['turn_idx'], 53 | transcript=annotate(raw['transcript']), 54 | system_acts=system_acts, 55 | turn_label=[[fix.get(s.strip(), s.strip()), fix.get(v.strip(), v.strip())] for s, v in raw['turn_label']], 56 | belief_state=raw['belief_state'], 57 | system_transcript=raw['system_transcript'], 58 | ) 59 | 60 | def numericalize_(self, vocab): 61 | self.num['transcript'] = vocab.word2index([''] + [w.lower() for w in self.transcript + ['']], train=True) 62 | self.num['system_acts'] = [vocab.word2index([''] + [w.lower() for w in a] + [''], 
train=True) for a in self.system_acts + [['']]] 63 | 64 | 65 | class Dialogue: 66 | 67 | def __init__(self, dialogue_id, turns): 68 | self.id = dialogue_id 69 | self.turns = turns 70 | 71 | def __len__(self): 72 | return len(self.turns) 73 | 74 | def to_dict(self): 75 | return {'dialogue_id': self.id, 'turns': [t.to_dict() for t in self.turns]} 76 | 77 | @classmethod 78 | def from_dict(cls, d): 79 | return cls(d['dialogue_id'], [Turn.from_dict(t) for t in d['turns']]) 80 | 81 | @classmethod 82 | def annotate_raw(cls, raw): 83 | return cls(raw['dialogue_idx'], [Turn.annotate_raw(t) for t in raw['dialogue']]) 84 | 85 | 86 | class Dataset: 87 | 88 | def __init__(self, dialogues): 89 | self.dialogues = dialogues 90 | 91 | def __len__(self): 92 | return len(self.dialogues) 93 | 94 | def iter_turns(self): 95 | for d in self.dialogues: 96 | for t in d.turns: 97 | yield t 98 | 99 | def to_dict(self): 100 | return {'dialogues': [d.to_dict() for d in self.dialogues]} 101 | 102 | @classmethod 103 | def from_dict(cls, d): 104 | return cls([Dialogue.from_dict(dd) for dd in d['dialogues']]) 105 | 106 | @classmethod 107 | def annotate_raw(cls, fname): 108 | with open(fname) as f: 109 | data = json.load(f) 110 | return cls([Dialogue.annotate_raw(d) for d in tqdm(data)]) 111 | 112 | def numericalize_(self, vocab): 113 | for t in self.iter_turns(): 114 | t.numericalize_(vocab) 115 | 116 | def extract_ontology(self): 117 | slots = set() 118 | values = defaultdict(set) 119 | for t in self.iter_turns(): 120 | for s, v in t.turn_label: 121 | slots.add(s.lower()) 122 | values[s].add(v.lower()) 123 | return Ontology(sorted(list(slots)), {k: sorted(list(v)) for k, v in values.items()}) 124 | 125 | def batch(self, batch_size, shuffle=False): 126 | turns = list(self.iter_turns()) 127 | if shuffle: 128 | np.random.shuffle(turns) 129 | for i in tqdm(range(0, len(turns), batch_size)): 130 | yield turns[i:i+batch_size] 131 | 132 | def evaluate_preds(self, preds): 133 | request = [] 134 | inform = [] 135 | joint_goal = [] 136 | fix = {'centre': 'center', 'areas': 'area', 'phone number': 'number'} 137 | i = 0 138 | for d in self.dialogues: 139 | pred_state = {} 140 | for t in d.turns: 141 | gold_request = set([(s, v) for s, v in t.turn_label if s == 'request']) 142 | gold_inform = set([(s, v) for s, v in t.turn_label if s != 'request']) 143 | pred_request = set([(s, v) for s, v in preds[i] if s == 'request']) 144 | pred_inform = set([(s, v) for s, v in preds[i] if s != 'request']) 145 | request.append(gold_request == pred_request) 146 | inform.append(gold_inform == pred_inform) 147 | 148 | gold_recovered = set() 149 | pred_recovered = set() 150 | for s, v in pred_inform: 151 | pred_state[s] = v 152 | for b in t.belief_state: 153 | for s, v in b['slots']: 154 | if b['act'] != 'request': 155 | gold_recovered.add((b['act'], fix.get(s.strip(), s.strip()), fix.get(v.strip(), v.strip()))) 156 | for s, v in pred_state.items(): 157 | pred_recovered.add(('inform', s, v)) 158 | joint_goal.append(gold_recovered == pred_recovered) 159 | i += 1 160 | return {'turn_inform': np.mean(inform), 'turn_request': np.mean(request), 'joint_goal': np.mean(joint_goal)} 161 | 162 | def record_preds(self, preds, to_file): 163 | data = self.to_dict() 164 | i = 0 165 | for d in data['dialogues']: 166 | for t in d['turns']: 167 | t['pred'] = sorted(list(preds[i])) 168 | i += 1 169 | with open(to_file, 'wt') as f: 170 | json.dump(data, f) 171 | 172 | 173 | class Ontology: 174 | 175 | def __init__(self, slots=None, values=None, num=None): 176 | 
self.slots = slots or [] 177 | self.values = values or {} 178 | self.num = num or {} 179 | 180 | def __add__(self, another): 181 | new_slots = sorted(list(set(self.slots + another.slots))) 182 | new_values = {s: sorted(list(set(self.values.get(s, []) + another.values.get(s, [])))) for s in new_slots} 183 | return Ontology(new_slots, new_values) 184 | 185 | def __radd__(self, another): 186 | return self if another == 0 else self.__add__(another) 187 | 188 | def to_dict(self): 189 | return {'slots': self.slots, 'values': self.values, 'num': self.num} 190 | 191 | def numericalize_(self, vocab): 192 | self.num = {} 193 | for s, vs in self.values.items(): 194 | self.num[s] = [vocab.word2index(annotate('{} = {}'.format(s, v)) + [''], train=True) for v in vs] 195 | 196 | @classmethod 197 | def from_dict(cls, d): 198 | return cls(**d) 199 | -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import logging 4 | from argparse import ArgumentParser, Namespace 5 | from pprint import pprint 6 | from utils import load_dataset, load_model 7 | 8 | 9 | if __name__ == '__main__': 10 | parser = ArgumentParser() 11 | parser.add_argument('dsave', help='save location of model') 12 | parser.add_argument('--split', help='split to evaluate on', default='dev') 13 | parser.add_argument('--gpu', type=int, help='gpu to use', default=None) 14 | parser.add_argument('--fout', help='optional save file to store the predictions') 15 | args = parser.parse_args() 16 | 17 | logging.basicConfig(level=logging.INFO) 18 | 19 | with open(os.path.join(args.dsave, 'config.json')) as f: 20 | args_save = Namespace(**json.load(f)) 21 | args_save.gpu = args.gpu 22 | pprint(args_save) 23 | 24 | dataset, ontology, vocab, Eword = load_dataset() 25 | 26 | model = load_model(args_save.model, args_save, ontology, vocab) 27 | model.load_best_save(directory=args.dsave) 28 | if args.gpu is not None: 29 | model.cuda(args.gpu) 30 | 31 | logging.info('Making predictions for {} dialogues and {} turns'.format(len(dataset[args.split]), len(list(dataset[args.split].iter_turns())))) 32 | preds = model.run_pred(dataset[args.split], args_save) 33 | pprint(dataset[args.split].evaluate_preds(preds)) 34 | 35 | if args.fout: 36 | with open(args.fout, 'wt') as f: 37 | # predictions is a list of sets, need to convert to list of lists to make it JSON serializable 38 | json.dump([list(p) for p in preds], f, indent=2) 39 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/glad/d8de6a22a3be0f2a63a12b799c3041b4a8b4081d/models/__init__.py -------------------------------------------------------------------------------- /models/glad.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch import optim 4 | from torch.nn import functional as F 5 | import numpy as np 6 | import logging 7 | import os 8 | import re 9 | import json 10 | from collections import defaultdict 11 | from pprint import pformat 12 | 13 | 14 | def pad(seqs, emb, device, pad=0): 15 | lens = [len(s) for s in seqs] 16 | max_len = max(lens) 17 | padded = torch.LongTensor([s + (max_len-l) * [pad] for s, l in zip(seqs, lens)]) 18 | return emb(padded.to(device)), lens 19 | 20 | 21 | def 
run_rnn(rnn, inputs, lens): 22 | # sort by lens 23 | order = np.argsort(lens)[::-1].tolist() 24 | reindexed = inputs.index_select(0, inputs.data.new(order).long()) 25 | reindexed_lens = [lens[i] for i in order] 26 | packed = nn.utils.rnn.pack_padded_sequence(reindexed, reindexed_lens, batch_first=True) 27 | outputs, _ = rnn(packed) 28 | padded, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True, padding_value=0.) 29 | reverse_order = np.argsort(order).tolist() 30 | recovered = padded.index_select(0, inputs.data.new(reverse_order).long()) 31 | # reindexed_lens = [lens[i] for i in order] 32 | # recovered_lens = [reindexed_lens[i] for i in reverse_order] 33 | # assert recovered_lens == lens 34 | return recovered 35 | 36 | 37 | def attend(seq, cond, lens): 38 | """ 39 | attend over the sequences `seq` using the condition `cond`. 40 | """ 41 | scores = cond.unsqueeze(1).expand_as(seq).mul(seq).sum(2) 42 | max_len = max(lens) 43 | for i, l in enumerate(lens): 44 | if l < max_len: 45 | scores.data[i, l:] = -np.inf 46 | scores = F.softmax(scores, dim=1) 47 | context = scores.unsqueeze(2).expand_as(seq).mul(seq).sum(1) 48 | return context, scores 49 | 50 | 51 | class FixedEmbedding(nn.Embedding): 52 | """ 53 | this is the same as `nn.Embedding` but detaches the result from the graph and has dropout after lookup. 54 | """ 55 | 56 | def __init__(self, *args, dropout=0, **kwargs): 57 | super().__init__(*args, **kwargs) 58 | self.dropout = dropout 59 | 60 | def forward(self, *args, **kwargs): 61 | out = super().forward(*args, **kwargs) 62 | out.detach_() 63 | return F.dropout(out, self.dropout, self.training) 64 | 65 | 66 | class SelfAttention(nn.Module): 67 | """ 68 | scores each element of the sequence with a linear layer and uses the normalized scores to compute a context over the sequence. 69 | """ 70 | 71 | def __init__(self, d_hid, dropout=0.): 72 | super().__init__() 73 | self.scorer = nn.Linear(d_hid, 1) 74 | self.dropout = nn.Dropout(dropout) 75 | 76 | def forward(self, inp, lens): 77 | batch_size, seq_len, d_feat = inp.size() 78 | inp = self.dropout(inp) 79 | scores = self.scorer(inp.contiguous().view(-1, d_feat)).view(batch_size, seq_len) 80 | max_len = max(lens) 81 | for i, l in enumerate(lens): 82 | if l < max_len: 83 | scores.data[i, l:] = -np.inf 84 | scores = F.softmax(scores, dim=1) 85 | context = scores.unsqueeze(2).expand_as(inp).mul(inp).sum(1) 86 | return context 87 | 88 | 89 | class GLADEncoder(nn.Module): 90 | """ 91 | the GLAD encoder described in https://arxiv.org/abs/1805.09655. 
92 | """ 93 | 94 | def __init__(self, din, dhid, slots, dropout=None): 95 | super().__init__() 96 | self.dropout = dropout or {} 97 | self.global_rnn = nn.LSTM(din, dhid, bidirectional=True, batch_first=True) 98 | self.global_selfattn = SelfAttention(2 * dhid, dropout=self.dropout.get('selfattn', 0.)) 99 | for s in slots: 100 | setattr(self, '{}_rnn'.format(s), nn.LSTM(din, dhid, bidirectional=True, batch_first=True, dropout=self.dropout.get('rnn', 0.))) 101 | setattr(self, '{}_selfattn'.format(s), SelfAttention(2*dhid, dropout=self.dropout.get('selfattn', 0.))) 102 | self.slots = slots 103 | self.beta_raw = nn.Parameter(torch.Tensor(len(slots))) 104 | nn.init.uniform_(self.beta_raw, -0.01, 0.01) 105 | 106 | def beta(self, slot): 107 | return F.sigmoid(self.beta_raw[self.slots.index(slot)]) 108 | 109 | def forward(self, x, x_len, slot, default_dropout=0.2): 110 | local_rnn = getattr(self, '{}_rnn'.format(slot)) 111 | local_selfattn = getattr(self, '{}_selfattn'.format(slot)) 112 | beta = self.beta(slot) 113 | local_h = run_rnn(local_rnn, x, x_len) 114 | global_h = run_rnn(self.global_rnn, x, x_len) 115 | h = F.dropout(local_h, self.dropout.get('local', default_dropout), self.training) * beta + F.dropout(global_h, self.dropout.get('global', default_dropout), self.training) * (1-beta) 116 | c = F.dropout(local_selfattn(h, x_len), self.dropout.get('local', default_dropout), self.training) * beta + F.dropout(self.global_selfattn(h, x_len), self.dropout.get('global', default_dropout), self.training) * (1-beta) 117 | return h, c 118 | 119 | 120 | class Model(nn.Module): 121 | """ 122 | the GLAD model described in https://arxiv.org/abs/1805.09655. 123 | """ 124 | 125 | def __init__(self, args, ontology, vocab): 126 | super().__init__() 127 | self.optimizer = None 128 | self.args = args 129 | self.vocab = vocab 130 | self.ontology = ontology 131 | self.emb_fixed = FixedEmbedding(len(vocab), args.demb, dropout=args.dropout.get('emb', 0.2)) 132 | 133 | self.utt_encoder = GLADEncoder(args.demb, args.dhid, self.ontology.slots, dropout=args.dropout) 134 | self.act_encoder = GLADEncoder(args.demb, args.dhid, self.ontology.slots, dropout=args.dropout) 135 | self.ont_encoder = GLADEncoder(args.demb, args.dhid, self.ontology.slots, dropout=args.dropout) 136 | self.utt_scorer = nn.Linear(2 * args.dhid, 1) 137 | self.score_weight = nn.Parameter(torch.Tensor([0.5])) 138 | 139 | @property 140 | def device(self): 141 | if self.args.gpu is not None and torch.cuda.is_available(): 142 | return torch.device('cuda') 143 | else: 144 | return torch.device('cpu') 145 | 146 | def set_optimizer(self): 147 | self.optimizer = optim.Adam(self.parameters(), lr=self.args.lr) 148 | 149 | def load_emb(self, Eword): 150 | new = self.emb_fixed.weight.data.new 151 | self.emb_fixed.weight.data.copy_(new(Eword)) 152 | 153 | def forward(self, batch): 154 | # convert to variables and look up embeddings 155 | eos = self.vocab.word2index('') 156 | utterance, utterance_len = pad([e.num['transcript'] for e in batch], self.emb_fixed, self.device, pad=eos) 157 | acts = [pad(e.num['system_acts'], self.emb_fixed, self.device, pad=eos) for e in batch] 158 | ontology = {s: pad(v, self.emb_fixed, self.device, pad=eos) for s, v in self.ontology.num.items()} 159 | 160 | ys = {} 161 | for s in self.ontology.slots: 162 | # for each slot, compute the scores for each value 163 | H_utt, c_utt = self.utt_encoder(utterance, utterance_len, slot=s) 164 | _, C_acts = list(zip(*[self.act_encoder(a, a_len, slot=s) for a, a_len in acts])) 165 | _, C_vals = 
self.ont_encoder(ontology[s][0], ontology[s][1], slot=s) 166 | 167 | # compute the utterance score 168 | y_utts = [] 169 | q_utts = [] 170 | for c_val in C_vals: 171 | q_utt, _ = attend(H_utt, c_val.unsqueeze(0).expand(len(batch), *c_val.size()), lens=utterance_len) 172 | q_utts.append(q_utt) 173 | y_utts = self.utt_scorer(torch.stack(q_utts, dim=1)).squeeze(2) 174 | 175 | # compute the previous action score 176 | q_acts = [] 177 | for i, C_act in enumerate(C_acts): 178 | q_act, _ = attend(C_act.unsqueeze(0), c_utt[i].unsqueeze(0), lens=[C_act.size(0)]) 179 | q_acts.append(q_act) 180 | y_acts = torch.cat(q_acts, dim=0).mm(C_vals.transpose(0, 1)) 181 | 182 | # combine the scores 183 | ys[s] = F.sigmoid(y_utts + self.score_weight * y_acts) 184 | 185 | if self.training: 186 | # create label variable and compute loss 187 | labels = {s: [len(self.ontology.values[s]) * [0] for i in range(len(batch))] for s in self.ontology.slots} 188 | for i, e in enumerate(batch): 189 | for s, v in e.turn_label: 190 | labels[s][i][self.ontology.values[s].index(v)] = 1 191 | labels = {s: torch.Tensor(m).to(self.device) for s, m in labels.items()} 192 | 193 | loss = 0 194 | for s in self.ontology.slots: 195 | loss += F.binary_cross_entropy(ys[s], labels[s]) 196 | else: 197 | loss = torch.Tensor([0]).to(self.device) 198 | return loss, {s: v.data.tolist() for s, v in ys.items()} 199 | 200 | def get_train_logger(self): 201 | logger = logging.getLogger('train-{}'.format(self.__class__.__name__)) 202 | formatter = logging.Formatter('%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s] %(message)s') 203 | file_handler = logging.FileHandler(os.path.join(self.args.dout, 'train.log')) 204 | file_handler.setFormatter(formatter) 205 | logger.addHandler(file_handler) 206 | return logger 207 | 208 | def run_train(self, train, dev, args): 209 | track = defaultdict(list) 210 | iteration = 0 211 | best = {} 212 | logger = self.get_train_logger() 213 | if self.optimizer is None: 214 | self.set_optimizer() 215 | 216 | for epoch in range(args.epoch): 217 | logger.info('starting epoch {}'.format(epoch)) 218 | 219 | # train and update parameters 220 | self.train() 221 | for batch in train.batch(batch_size=args.batch_size, shuffle=True): 222 | iteration += 1 223 | self.zero_grad() 224 | loss, scores = self.forward(batch) 225 | loss.backward() 226 | self.optimizer.step() 227 | track['loss'].append(loss.item()) 228 | 229 | # evalute on train and dev 230 | summary = {'iteration': iteration, 'epoch': epoch} 231 | for k, v in track.items(): 232 | summary[k] = sum(v) / len(v) 233 | summary.update({'eval_train_{}'.format(k): v for k, v in self.run_eval(train, args).items()}) 234 | summary.update({'eval_dev_{}'.format(k): v for k, v in self.run_eval(dev, args).items()}) 235 | 236 | # do early stopping saves 237 | stop_key = 'eval_dev_{}'.format(args.stop) 238 | train_key = 'eval_train_{}'.format(args.stop) 239 | if best.get(stop_key, 0) <= summary[stop_key]: 240 | best_dev = '{:f}'.format(summary[stop_key]) 241 | best_train = '{:f}'.format(summary[train_key]) 242 | best.update(summary) 243 | self.save( 244 | best, 245 | identifier='epoch={epoch},iter={iteration},train_{key}={train},dev_{key}={dev}'.format( 246 | epoch=epoch, iteration=iteration, train=best_train, dev=best_dev, key=args.stop, 247 | ) 248 | ) 249 | self.prune_saves() 250 | dev.record_preds( 251 | preds=self.run_pred(dev, self.args), 252 | to_file=os.path.join(self.args.dout, 'dev.pred.json'), 253 | ) 254 | summary.update({'best_{}'.format(k): v for k, v in best.items()}) 255 
| logger.info(pformat(summary)) 256 | track.clear() 257 | 258 | def extract_predictions(self, scores, threshold=0.5): 259 | batch_size = len(list(scores.values())[0]) 260 | predictions = [set() for i in range(batch_size)] 261 | for s in self.ontology.slots: 262 | for i, p in enumerate(scores[s]): 263 | triggered = [(s, v, p_v) for v, p_v in zip(self.ontology.values[s], p) if p_v > threshold] 264 | if s == 'request': 265 | # we can have multiple requests predictions 266 | predictions[i] |= set([(s, v) for s, v, p_v in triggered]) 267 | elif triggered: 268 | # only extract the top inform prediction 269 | sort = sorted(triggered, key=lambda tup: tup[-1], reverse=True) 270 | predictions[i].add((sort[0][0], sort[0][1])) 271 | return predictions 272 | 273 | def run_pred(self, dev, args): 274 | self.eval() 275 | predictions = [] 276 | for batch in dev.batch(batch_size=args.batch_size): 277 | loss, scores = self.forward(batch) 278 | predictions += self.extract_predictions(scores) 279 | return predictions 280 | 281 | def run_eval(self, dev, args): 282 | predictions = self.run_pred(dev, args) 283 | return dev.evaluate_preds(predictions) 284 | 285 | def save_config(self): 286 | fname = '{}/config.json'.format(self.args.dout) 287 | with open(fname, 'wt') as f: 288 | logging.info('saving config to {}'.format(fname)) 289 | json.dump(vars(self.args), f, indent=2) 290 | 291 | @classmethod 292 | def load_config(cls, fname, ontology, **kwargs): 293 | with open(fname) as f: 294 | logging.info('loading config from {}'.format(fname)) 295 | args = object() 296 | for k, v in json.load(f): 297 | setattr(args, k, kwargs.get(k, v)) 298 | return cls(args, ontology) 299 | 300 | def save(self, summary, identifier): 301 | fname = '{}/{}.t7'.format(self.args.dout, identifier) 302 | logging.info('saving model to {}'.format(fname)) 303 | state = { 304 | 'args': vars(self.args), 305 | 'model': self.state_dict(), 306 | 'summary': summary, 307 | 'optimizer': self.optimizer.state_dict(), 308 | } 309 | torch.save(state, fname) 310 | 311 | def load(self, fname): 312 | logging.info('loading model from {}'.format(fname)) 313 | state = torch.load(fname) 314 | self.load_state_dict(state['model']) 315 | self.set_optimizer() 316 | self.optimizer.load_state_dict(state['optimizer']) 317 | 318 | def get_saves(self, directory=None): 319 | if directory is None: 320 | directory = self.args.dout 321 | files = [f for f in os.listdir(directory) if f.endswith('.t7')] 322 | scores = [] 323 | for fname in files: 324 | re_str = r'dev_{}=([0-9\.]+)'.format(self.args.stop) 325 | dev_acc = re.findall(re_str, fname) 326 | if dev_acc: 327 | score = float(dev_acc[0].strip('.')) 328 | scores.append((score, os.path.join(directory, fname))) 329 | if not scores: 330 | raise Exception('No files found!') 331 | scores.sort(key=lambda tup: tup[0], reverse=True) 332 | return scores 333 | 334 | def prune_saves(self, n_keep=5): 335 | scores_and_files = self.get_saves() 336 | if len(scores_and_files) > n_keep: 337 | for score, fname in scores_and_files[n_keep:]: 338 | os.remove(fname) 339 | 340 | def load_best_save(self, directory): 341 | if directory is None: 342 | directory = self.args.dout 343 | 344 | scores_and_files = self.get_saves(directory=directory) 345 | if scores_and_files: 346 | assert scores_and_files, 'no saves exist at {}'.format(directory) 347 | score, fname = scores_and_files[0] 348 | self.load(fname) 349 | -------------------------------------------------------------------------------- /preprocess_data.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import json 4 | import logging 5 | import requests 6 | from tqdm import tqdm 7 | from vocab import Vocab 8 | from embeddings import GloveEmbedding, KazumaCharEmbedding 9 | from dataset import Dataset, Ontology 10 | 11 | 12 | root_dir = os.path.dirname(__file__) 13 | data_dir = os.path.join(root_dir, 'data', 'woz') 14 | 15 | 16 | draw = os.path.join(data_dir, 'raw') 17 | dann = os.path.join(data_dir, 'ann') 18 | 19 | splits = ['dev', 'train', 'test'] 20 | 21 | 22 | def download(url, to_file): 23 | r = requests.get(url, stream=True) 24 | with open(to_file, 'wb') as f: 25 | for chunk in r.iter_content(chunk_size=1024): 26 | if chunk: 27 | f.write(chunk) 28 | 29 | 30 | def missing_files(d, files): 31 | return not all([os.path.isfile(os.path.join(d, '{}.json'.format(s))) for s in files]) 32 | 33 | 34 | if __name__ == '__main__': 35 | if missing_files(draw, splits): 36 | if not os.path.isdir(draw): 37 | os.makedirs(draw) 38 | download('https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_train_en.json', os.path.join(draw, 'train.json')) 39 | download('https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_validate_en.json', os.path.join(draw, 'dev.json')) 40 | download('https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_test_en.json', os.path.join(draw, 'test.json')) 41 | 42 | if missing_files(dann, files=splits + ['ontology', 'vocab', 'emb']): 43 | if not os.path.isdir(dann): 44 | os.makedirs(dann) 45 | dataset = {} 46 | ontology = Ontology() 47 | vocab = Vocab() 48 | vocab.word2index(['', ''], train=True) 49 | for s in splits: 50 | fname = '{}.json'.format(s) 51 | logging.warn('Annotating {}'.format(s)) 52 | dataset[s] = Dataset.annotate_raw(os.path.join(draw, fname)) 53 | dataset[s].numericalize_(vocab) 54 | ontology = ontology + dataset[s].extract_ontology() 55 | with open(os.path.join(dann, fname), 'wt') as f: 56 | json.dump(dataset[s].to_dict(), f) 57 | ontology.numericalize_(vocab) 58 | with open(os.path.join(dann, 'ontology.json'), 'wt') as f: 59 | json.dump(ontology.to_dict(), f) 60 | with open(os.path.join(dann, 'vocab.json'), 'wt') as f: 61 | json.dump(vocab.to_dict(), f) 62 | 63 | logging.warn('Computing word embeddings') 64 | embeddings = [GloveEmbedding(), KazumaCharEmbedding()] 65 | E = [] 66 | for w in tqdm(vocab._index2word): 67 | e = [] 68 | for emb in embeddings: 69 | e += emb.emb(w, default='zero') 70 | E.append(e) 71 | with open(os.path.join(dann, 'emb.json'), 'wt') as f: 72 | json.dump(E, f) 73 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | protobuf==3.4.0 2 | requests==2.18.4 3 | stanza==0.3 4 | tqdm==4.19.1.post1 5 | vocab==0.0.3 6 | embeddings==0.0.4 7 | http://download.pytorch.org/whl/cu90/torch-0.4.0-cp36-cp36m-linux_x86_64.whl 8 | numpy==1.13.1 9 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser 3 | from utils import load_dataset, get_models, load_model 4 | import os 5 | import logging 6 | import numpy as np 7 | from pprint import pprint 8 | import torch 9 | from random import seed 10 | 11 | 12 | def run(args): 13 | pprint(args) 
14 | logging.basicConfig(level=logging.INFO) 15 | 16 | np.random.seed(args.seed) 17 | torch.manual_seed(args.seed) 18 | seed(args.seed) 19 | 20 | dataset, ontology, vocab, Eword = load_dataset() 21 | 22 | model = load_model(args.model, args, ontology, vocab) 23 | model.save_config() 24 | model.load_emb(Eword) 25 | 26 | model = model.to(model.device) 27 | if not args.test: 28 | logging.info('Starting train') 29 | model.run_train(dataset['train'], dataset['dev'], args) 30 | if args.resume: 31 | model.load_best_save(directory=args.resume) 32 | else: 33 | model.load_best_save(directory=args.dout) 34 | model = model.to(model.device) 35 | logging.info('Running dev evaluation') 36 | dev_out = model.run_eval(dataset['dev'], args) 37 | pprint(dev_out) 38 | 39 | 40 | def get_args(): 41 | parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter) 42 | parser.add_argument('--dexp', help='root experiment folder', default='exp') 43 | parser.add_argument('--model', help='which model to use', default='glad', choices=get_models()) 44 | parser.add_argument('--epoch', help='max epoch to run for', default=50, type=int) 45 | parser.add_argument('--demb', help='word embedding size', default=400, type=int) 46 | parser.add_argument('--dhid', help='hidden state size', default=200, type=int) 47 | parser.add_argument('--batch_size', help='batch size', default=50, type=int) 48 | parser.add_argument('--lr', help='learning rate', default=1e-3, type=float) 49 | parser.add_argument('--stop', help='slot to early stop on', default='joint_goal') 50 | parser.add_argument('--resume', help='save directory to resume from') 51 | parser.add_argument('-n', '--nick', help='nickname for model', default='default') 52 | parser.add_argument('--seed', default=42, help='random seed', type=int) 53 | parser.add_argument('--test', action='store_true', help='run in evaluation only mode') 54 | parser.add_argument('--gpu', type=int, help='which GPU to use') 55 | parser.add_argument('--dropout', nargs='*', help='dropout rates', default=['emb=0.2', 'local=0.2', 'global=0.2']) 56 | args = parser.parse_args() 57 | args.dout = os.path.join(args.dexp, args.model, args.nick) 58 | args.dropout = {d.split('=')[0]: float(d.split('=')[1]) for d in args.dropout} 59 | if not os.path.isdir(args.dout): 60 | os.makedirs(args.dout) 61 | return args 62 | 63 | 64 | if __name__ == '__main__': 65 | args = get_args() 66 | run(args) 67 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | from pprint import pformat 5 | from importlib import import_module 6 | from vocab import Vocab 7 | from dataset import Dataset, Ontology 8 | from preprocess_data import dann 9 | 10 | 11 | def load_dataset(splits=('train', 'dev', 'test')): 12 | with open(os.path.join(dann, 'ontology.json')) as f: 13 | ontology = Ontology.from_dict(json.load(f)) 14 | with open(os.path.join(dann, 'vocab.json')) as f: 15 | vocab = Vocab.from_dict(json.load(f)) 16 | with open(os.path.join(dann, 'emb.json')) as f: 17 | E = json.load(f) 18 | dataset = {} 19 | for split in splits: 20 | with open(os.path.join(dann, '{}.json'.format(split))) as f: 21 | logging.warn('loading split {}'.format(split)) 22 | dataset[split] = Dataset.from_dict(json.load(f)) 23 | 24 | logging.info('dataset sizes: {}'.format(pformat({k: len(v) for k, v in dataset.items()}))) 25 | return dataset, ontology, vocab, E 26 | 27 | 28 | def get_models(): 29 
| return [m.replace('.py', '') for m in os.listdir('models') if not m.startswith('_') and m != 'model'] 30 | 31 | 32 | def load_model(model, *args, **kwargs): 33 | Model = import_module('models.{}'.format(model)).Model 34 | model = Model(*args, **kwargs) 35 | logging.info('loaded model {}'.format(Model)) 36 | return model 37 | -------------------------------------------------------------------------------- /version.txt: -------------------------------------------------------------------------------- 1 | 0.1 2 | --------------------------------------------------------------------------------
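The README's evaluation section notes that turn-level predictions have to be matched back up with their dialogues. Below is a minimal sketch of that flow, mirroring `evaluate.py`; the paths and settings are assumptions (data preprocessed by `preprocess_data.py`, a model trained with the defaults so the best checkpoint sits under `exp/glad/default`). It relies on the fact that `Model.run_pred` returns one set of slot-value pairs per turn in the same order that `Dataset.iter_turns()` walks the dialogues, which is also the ordering `evaluate_preds` and `record_preds` assume.

```python
# Minimal sketch mirroring evaluate.py; paths are assumptions
# (preprocessed data from preprocess_data.py, checkpoints under exp/glad/default).
import json
from argparse import Namespace

from utils import load_dataset, load_model

# restore the training configuration saved by Model.save_config
with open('exp/glad/default/config.json') as f:
    args = Namespace(**json.load(f))
args.gpu = None  # or a GPU index, as in evaluate.py

dataset, ontology, vocab, Eword = load_dataset()
model = load_model(args.model, args, ontology, vocab)
model.load_best_save(directory='exp/glad/default')
if args.gpu is not None:
    model.cuda(args.gpu)

# run_pred yields one set of (slot, value) pairs per turn, in the same order
# that Dataset.iter_turns() yields turns, so the two can be zipped together.
preds = model.run_pred(dataset['dev'], args)
for turn, pred in zip(dataset['dev'].iter_turns(), preds):
    print(' '.join(turn.transcript), '->', sorted(pred))

print(dataset['dev'].evaluate_preds(preds))
```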