├── .dockerignore ├── .gitignore ├── CODEOWNERS ├── Dockerfile ├── LICENSE ├── README.md ├── __init__.py ├── dataset.py ├── evaluate.py ├── models ├── __init__.py └── glad.py ├── preprocess_data.py ├── requirements.txt ├── train.py ├── utils.py └── version.txt /.dockerignore: -------------------------------------------------------------------------------- 1 | exp/ 2 | data/ 3 | Dockerfile 4 | .git/ 5 | *.py[cod] 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | Mar*/ 3 | .DS_Store 4 | *.py[cod] 5 | *.json 6 | *.json[~] 7 | *.save 8 | *.log 9 | *.model 10 | *.t7 11 | *.npy 12 | *.flist 13 | *.zip 14 | *.gzip 15 | *.tar 16 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Comment line immediately above ownership line is reserved for related gus information. Please be careful while editing. 2 | #ECCN:Open Source 3 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:9.0-base-ubuntu16.04 2 | 3 | # install Miniconda 4 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 5 | ENV PATH /opt/conda/bin:$PATH 6 | 7 | RUN apt-get update --fix-missing && apt-get install -y wget bzip2 ca-certificates \ 8 | libglib2.0-0 libxext6 libsm6 libxrender1 \ 9 | git mercurial subversion 10 | 11 | RUN wget --quiet https://repo.continuum.io/miniconda/Miniconda3-4.4.10-Linux-x86_64.sh -O ~/miniconda.sh && \ 12 | /bin/bash ~/miniconda.sh -b -p /opt/conda && \ 13 | rm ~/miniconda.sh && \ 14 | /opt/conda/bin/conda clean -tipsy && \ 15 | ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ 16 | echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ 17 | echo "conda activate base" >> ~/.bashrc 18 | 19 | # copy GLAD 20 | RUN mkdir -p /opt/glad 21 | WORKDIR /opt/glad 22 | 23 | # install dependencies 24 | COPY requirements.txt . 25 | RUN pip install -r requirements.txt 26 | 27 | # copy source 28 | COPY . . 29 | 30 | # volumes and environment variables 31 | ENV EMBEDDINGS_ROOT /opt/embeddings 32 | RUN mkdir -p /opt/embeddings 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018, Salesforce 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Global-Locally Self-Attentive Dialogue State Tracker 2 | 3 | This repository contains an implementation of the [Global-Locally Self-Attentive Dialogue State Tracker (GLAD)](https://arxiv.org/abs/1805.09655). 4 | If you use this in your work, please cite the following: 5 | 6 | ``` 7 | @inproceedings{ zhong2018global, 8 | title={ Global-Locally Self-Attentive Encoder for Dialogue State Tracking }, 9 | author={ Zhong, Victor and Xiong, Caiming and Socher, Richard }, 10 | booktitle={ ACL }, 11 | year={ 2018 } 12 | } 13 | ``` 14 | 15 | 16 | # Install dependencies 17 | 18 | Using Docker 19 | 20 | ``` 21 | docker build -t glad:0.4 . 22 | docker run --name embeddings -d vzhong/embeddings:0.0.5 # get the embeddings 23 | env NV_GPU=0 nvidia-docker run --name glad -d -t --net host --volumes-from embeddings glad:0.4 24 | ``` 25 | 26 | If you do not want to build the Docker image, then run the following (you still need to run the CoreNLP server). 27 | 28 | ``` 29 | pip install -r requirements.txt 30 | ``` 31 | 32 | # Download and annotate data 33 | 34 | This project uses Stanford CoreNLP to annotate the dataset. 35 | In particular, we use the [Stanford NLP Stanza python interface](https://github.com/stanfordnlp/stanza). 36 | To run the server, do 37 | 38 | ``` 39 | docker run --name corenlp -d -p 9000:9000 vzhong/corenlp-server 40 | ``` 41 | 42 | The first time you preprocess the data, the script will [download word embeddings and character embeddings and put them into a SQLite database](https://github.com/vzhong/embeddings), which will be slow. 43 | Subsequent runs will be much faster. 44 | 45 | ``` 46 | docker exec glad python preprocess_data.py 47 | ``` 48 | 49 | The raw data will be stored in `data/woz/raw` of the container. 50 | The annotation results will be stored in `data/woz/ann` of the container. 51 | 52 | If you do not want to build the Docker image, then run 53 | 54 | ``` 55 | python preprocess_data.py 56 | ``` 57 | 58 | 59 | # Train model 60 | 61 | You can check out the training options via `python train.py -h`. 62 | By default, `train.py` will save checkpoints to `exp/glad/default`. 63 | 64 | ``` 65 | docker exec glad python train.py --gpu 0 66 | ``` 67 | 68 | You can attach to the container via `docker exec -it glad /bin/bash` to look at what's inside or `docker cp glad:/opt/glad/exp exp` to copy out the experiment results.
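Training progress can also be followed through the log that `train.py` writes next to its checkpoints; each epoch it records a summary that includes the dev joint goal used for early stopping. As a rough sketch — assuming the default model and nickname, so the output directory inside the container is `exp/glad/default` — you can tail it with:

```
docker exec glad tail -f exp/glad/default/train.log
```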
69 | 70 | If you do not want to build the Docker image, then run 71 | 72 | ``` 73 | python train.py --gpu 0 74 | ``` 75 | 76 | 77 | # Evaluation 78 | 79 | You can evaluate the model using 80 | 81 | ``` 82 | docker exec glad python evaluate.py --gpu 0 --split test exp/glad/default 83 | ``` 84 | 85 | You can also dump a predictions file by specifying the `--fout` flag. 86 | In this case, the output will be a list of lists. 87 | Each `i`th sublist is the set of predicted slot-value pairs for the `i`th turn. 88 | Please see `evaluate.py` to see how to match up the turn predictions with the dialogues. 89 | 90 | If you do not want to build the Docker image, then run 91 | 92 | ``` 93 | python evaluate.py --gpu 0 --split test exp/glad/default 94 | ``` 95 | 96 | 97 | # Contribution 98 | 99 | Pull requests are welcome! 100 | If you have any questions, please create an issue or contact the corresponding author at `victor victorzhong com`. 101 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/glad/d8de6a22a3be0f2a63a12b799c3041b4a8b4081d/__init__.py -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | from collections import defaultdict 3 | import numpy as np 4 | from tqdm import tqdm 5 | from stanza.nlp.corenlp import CoreNLPClient 6 | 7 | 8 | client = None 9 | 10 | 11 | def annotate(sent): 12 | global client 13 | if client is None: 14 | client = CoreNLPClient(default_annotators='ssplit,tokenize'.split(',')) 15 | words = [] 16 | for sent in client.annotate(sent).sentences: 17 | for tok in sent: 18 | words.append(tok.word) 19 | return words 20 | 21 | 22 | class Turn: 23 | 24 | def __init__(self, turn_id, transcript, turn_label, belief_state, system_acts, system_transcript, num=None): 25 | self.id = turn_id 26 | self.transcript = transcript 27 | self.turn_label = turn_label 28 | self.belief_state = belief_state 29 | self.system_acts = system_acts 30 | self.system_transcript = system_transcript 31 | self.num = num or {} 32 | 33 | def to_dict(self): 34 | return {'turn_id': self.id, 'transcript': self.transcript, 'turn_label': self.turn_label, 'belief_state': self.belief_state, 'system_acts': self.system_acts, 'system_transcript': self.system_transcript, 'num': self.num} 35 | 36 | @classmethod 37 | def from_dict(cls, d): 38 | return cls(**d) 39 | 40 | @classmethod 41 | def annotate_raw(cls, raw): 42 | system_acts = [] 43 | for a in raw['system_acts']: 44 | if isinstance(a, list): 45 | s, v = a 46 | system_acts.append(['inform'] + s.split() + ['='] + v.split()) 47 | else: 48 | system_acts.append(['request'] + a.split()) 49 | # NOTE: fix inconsistencies in data label 50 | fix = {'centre': 'center', 'areas': 'area', 'phone number': 'number'} 51 | return cls( 52 | turn_id=raw['turn_idx'], 53 | transcript=annotate(raw['transcript']), 54 | system_acts=system_acts, 55 | turn_label=[[fix.get(s.strip(), s.strip()), fix.get(v.strip(), v.strip())] for s, v in raw['turn_label']], 56 | belief_state=raw['belief_state'], 57 | system_transcript=raw['system_transcript'], 58 | ) 59 | 60 | def numericalize_(self, vocab): 61 | self.num['transcript'] = vocab.word2index([''] + [w.lower() for w in self.transcript + ['']], train=True) 62 | self.num['system_acts'] = [vocab.word2index([''] + [w.lower() for w in a] + [''], 
train=True) for a in self.system_acts + [['']]] 63 | 64 | 65 | class Dialogue: 66 | 67 | def __init__(self, dialogue_id, turns): 68 | self.id = dialogue_id 69 | self.turns = turns 70 | 71 | def __len__(self): 72 | return len(self.turns) 73 | 74 | def to_dict(self): 75 | return {'dialogue_id': self.id, 'turns': [t.to_dict() for t in self.turns]} 76 | 77 | @classmethod 78 | def from_dict(cls, d): 79 | return cls(d['dialogue_id'], [Turn.from_dict(t) for t in d['turns']]) 80 | 81 | @classmethod 82 | def annotate_raw(cls, raw): 83 | return cls(raw['dialogue_idx'], [Turn.annotate_raw(t) for t in raw['dialogue']]) 84 | 85 | 86 | class Dataset: 87 | 88 | def __init__(self, dialogues): 89 | self.dialogues = dialogues 90 | 91 | def __len__(self): 92 | return len(self.dialogues) 93 | 94 | def iter_turns(self): 95 | for d in self.dialogues: 96 | for t in d.turns: 97 | yield t 98 | 99 | def to_dict(self): 100 | return {'dialogues': [d.to_dict() for d in self.dialogues]} 101 | 102 | @classmethod 103 | def from_dict(cls, d): 104 | return cls([Dialogue.from_dict(dd) for dd in d['dialogues']]) 105 | 106 | @classmethod 107 | def annotate_raw(cls, fname): 108 | with open(fname) as f: 109 | data = json.load(f) 110 | return cls([Dialogue.annotate_raw(d) for d in tqdm(data)]) 111 | 112 | def numericalize_(self, vocab): 113 | for t in self.iter_turns(): 114 | t.numericalize_(vocab) 115 | 116 | def extract_ontology(self): 117 | slots = set() 118 | values = defaultdict(set) 119 | for t in self.iter_turns(): 120 | for s, v in t.turn_label: 121 | slots.add(s.lower()) 122 | values[s].add(v.lower()) 123 | return Ontology(sorted(list(slots)), {k: sorted(list(v)) for k, v in values.items()}) 124 | 125 | def batch(self, batch_size, shuffle=False): 126 | turns = list(self.iter_turns()) 127 | if shuffle: 128 | np.random.shuffle(turns) 129 | for i in tqdm(range(0, len(turns), batch_size)): 130 | yield turns[i:i+batch_size] 131 | 132 | def evaluate_preds(self, preds): 133 | request = [] 134 | inform = [] 135 | joint_goal = [] 136 | fix = {'centre': 'center', 'areas': 'area', 'phone number': 'number'} 137 | i = 0 138 | for d in self.dialogues: 139 | pred_state = {} 140 | for t in d.turns: 141 | gold_request = set([(s, v) for s, v in t.turn_label if s == 'request']) 142 | gold_inform = set([(s, v) for s, v in t.turn_label if s != 'request']) 143 | pred_request = set([(s, v) for s, v in preds[i] if s == 'request']) 144 | pred_inform = set([(s, v) for s, v in preds[i] if s != 'request']) 145 | request.append(gold_request == pred_request) 146 | inform.append(gold_inform == pred_inform) 147 | 148 | gold_recovered = set() 149 | pred_recovered = set() 150 | for s, v in pred_inform: 151 | pred_state[s] = v 152 | for b in t.belief_state: 153 | for s, v in b['slots']: 154 | if b['act'] != 'request': 155 | gold_recovered.add((b['act'], fix.get(s.strip(), s.strip()), fix.get(v.strip(), v.strip()))) 156 | for s, v in pred_state.items(): 157 | pred_recovered.add(('inform', s, v)) 158 | joint_goal.append(gold_recovered == pred_recovered) 159 | i += 1 160 | return {'turn_inform': np.mean(inform), 'turn_request': np.mean(request), 'joint_goal': np.mean(joint_goal)} 161 | 162 | def record_preds(self, preds, to_file): 163 | data = self.to_dict() 164 | i = 0 165 | for d in data['dialogues']: 166 | for t in d['turns']: 167 | t['pred'] = sorted(list(preds[i])) 168 | i += 1 169 | with open(to_file, 'wt') as f: 170 | json.dump(data, f) 171 | 172 | 173 | class Ontology: 174 | 175 | def __init__(self, slots=None, values=None, num=None): 176 | 
self.slots = slots or [] 177 | self.values = values or {} 178 | self.num = num or {} 179 | 180 | def __add__(self, another): 181 | new_slots = sorted(list(set(self.slots + another.slots))) 182 | new_values = {s: sorted(list(set(self.values.get(s, []) + another.values.get(s, [])))) for s in new_slots} 183 | return Ontology(new_slots, new_values) 184 | 185 | def __radd__(self, another): 186 | return self if another == 0 else self.__add__(another) 187 | 188 | def to_dict(self): 189 | return {'slots': self.slots, 'values': self.values, 'num': self.num} 190 | 191 | def numericalize_(self, vocab): 192 | self.num = {} 193 | for s, vs in self.values.items(): 194 | self.num[s] = [vocab.word2index(annotate('{} = {}'.format(s, v)) + [''], train=True) for v in vs] 195 | 196 | @classmethod 197 | def from_dict(cls, d): 198 | return cls(**d) 199 | -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import logging 4 | from argparse import ArgumentParser, Namespace 5 | from pprint import pprint 6 | from utils import load_dataset, load_model 7 | 8 | 9 | if __name__ == '__main__': 10 | parser = ArgumentParser() 11 | parser.add_argument('dsave', help='save location of model') 12 | parser.add_argument('--split', help='split to evaluate on', default='dev') 13 | parser.add_argument('--gpu', type=int, help='gpu to use', default=None) 14 | parser.add_argument('--fout', help='optional save file to store the predictions') 15 | args = parser.parse_args() 16 | 17 | logging.basicConfig(level=logging.INFO) 18 | 19 | with open(os.path.join(args.dsave, 'config.json')) as f: 20 | args_save = Namespace(**json.load(f)) 21 | args_save.gpu = args.gpu 22 | pprint(args_save) 23 | 24 | dataset, ontology, vocab, Eword = load_dataset() 25 | 26 | model = load_model(args_save.model, args_save, ontology, vocab) 27 | model.load_best_save(directory=args.dsave) 28 | if args.gpu is not None: 29 | model.cuda(args.gpu) 30 | 31 | logging.info('Making predictions for {} dialogues and {} turns'.format(len(dataset[args.split]), len(list(dataset[args.split].iter_turns())))) 32 | preds = model.run_pred(dataset[args.split], args_save) 33 | pprint(dataset[args.split].evaluate_preds(preds)) 34 | 35 | if args.fout: 36 | with open(args.fout, 'wt') as f: 37 | # predictions is a list of sets, need to convert to list of lists to make it JSON serializable 38 | json.dump([list(p) for p in preds], f, indent=2) 39 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/glad/d8de6a22a3be0f2a63a12b799c3041b4a8b4081d/models/__init__.py -------------------------------------------------------------------------------- /models/glad.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch import optim 4 | from torch.nn import functional as F 5 | import numpy as np 6 | import logging 7 | import os 8 | import re 9 | import json 10 | from collections import defaultdict 11 | from pprint import pformat 12 | 13 | 14 | def pad(seqs, emb, device, pad=0): 15 | lens = [len(s) for s in seqs] 16 | max_len = max(lens) 17 | padded = torch.LongTensor([s + (max_len-l) * [pad] for s, l in zip(seqs, lens)]) 18 | return emb(padded.to(device)), lens 19 | 20 | 21 | def 
run_rnn(rnn, inputs, lens): 22 | # sort by lens 23 | order = np.argsort(lens)[::-1].tolist() 24 | reindexed = inputs.index_select(0, inputs.data.new(order).long()) 25 | reindexed_lens = [lens[i] for i in order] 26 | packed = nn.utils.rnn.pack_padded_sequence(reindexed, reindexed_lens, batch_first=True) 27 | outputs, _ = rnn(packed) 28 | padded, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True, padding_value=0.) 29 | reverse_order = np.argsort(order).tolist() 30 | recovered = padded.index_select(0, inputs.data.new(reverse_order).long()) 31 | # reindexed_lens = [lens[i] for i in order] 32 | # recovered_lens = [reindexed_lens[i] for i in reverse_order] 33 | # assert recovered_lens == lens 34 | return recovered 35 | 36 | 37 | def attend(seq, cond, lens): 38 | """ 39 | attend over the sequences `seq` using the condition `cond`. 40 | """ 41 | scores = cond.unsqueeze(1).expand_as(seq).mul(seq).sum(2) 42 | max_len = max(lens) 43 | for i, l in enumerate(lens): 44 | if l < max_len: 45 | scores.data[i, l:] = -np.inf 46 | scores = F.softmax(scores, dim=1) 47 | context = scores.unsqueeze(2).expand_as(seq).mul(seq).sum(1) 48 | return context, scores 49 | 50 | 51 | class FixedEmbedding(nn.Embedding): 52 | """ 53 | this is the same as `nn.Embedding` but detaches the result from the graph and has dropout after lookup. 54 | """ 55 | 56 | def __init__(self, *args, dropout=0, **kwargs): 57 | super().__init__(*args, **kwargs) 58 | self.dropout = dropout 59 | 60 | def forward(self, *args, **kwargs): 61 | out = super().forward(*args, **kwargs) 62 | out.detach_() 63 | return F.dropout(out, self.dropout, self.training) 64 | 65 | 66 | class SelfAttention(nn.Module): 67 | """ 68 | scores each element of the sequence with a linear layer and uses the normalized scores to compute a context over the sequence. 69 | """ 70 | 71 | def __init__(self, d_hid, dropout=0.): 72 | super().__init__() 73 | self.scorer = nn.Linear(d_hid, 1) 74 | self.dropout = nn.Dropout(dropout) 75 | 76 | def forward(self, inp, lens): 77 | batch_size, seq_len, d_feat = inp.size() 78 | inp = self.dropout(inp) 79 | scores = self.scorer(inp.contiguous().view(-1, d_feat)).view(batch_size, seq_len) 80 | max_len = max(lens) 81 | for i, l in enumerate(lens): 82 | if l < max_len: 83 | scores.data[i, l:] = -np.inf 84 | scores = F.softmax(scores, dim=1) 85 | context = scores.unsqueeze(2).expand_as(inp).mul(inp).sum(1) 86 | return context 87 | 88 | 89 | class GLADEncoder(nn.Module): 90 | """ 91 | the GLAD encoder described in https://arxiv.org/abs/1805.09655. 
92 | """ 93 | 94 | def __init__(self, din, dhid, slots, dropout=None): 95 | super().__init__() 96 | self.dropout = dropout or {} 97 | self.global_rnn = nn.LSTM(din, dhid, bidirectional=True, batch_first=True) 98 | self.global_selfattn = SelfAttention(2 * dhid, dropout=self.dropout.get('selfattn', 0.)) 99 | for s in slots: 100 | setattr(self, '{}_rnn'.format(s), nn.LSTM(din, dhid, bidirectional=True, batch_first=True, dropout=self.dropout.get('rnn', 0.))) 101 | setattr(self, '{}_selfattn'.format(s), SelfAttention(2*dhid, dropout=self.dropout.get('selfattn', 0.))) 102 | self.slots = slots 103 | self.beta_raw = nn.Parameter(torch.Tensor(len(slots))) 104 | nn.init.uniform_(self.beta_raw, -0.01, 0.01) 105 | 106 | def beta(self, slot): 107 | return F.sigmoid(self.beta_raw[self.slots.index(slot)]) 108 | 109 | def forward(self, x, x_len, slot, default_dropout=0.2): 110 | local_rnn = getattr(self, '{}_rnn'.format(slot)) 111 | local_selfattn = getattr(self, '{}_selfattn'.format(slot)) 112 | beta = self.beta(slot) 113 | local_h = run_rnn(local_rnn, x, x_len) 114 | global_h = run_rnn(self.global_rnn, x, x_len) 115 | h = F.dropout(local_h, self.dropout.get('local', default_dropout), self.training) * beta + F.dropout(global_h, self.dropout.get('global', default_dropout), self.training) * (1-beta) 116 | c = F.dropout(local_selfattn(h, x_len), self.dropout.get('local', default_dropout), self.training) * beta + F.dropout(self.global_selfattn(h, x_len), self.dropout.get('global', default_dropout), self.training) * (1-beta) 117 | return h, c 118 | 119 | 120 | class Model(nn.Module): 121 | """ 122 | the GLAD model described in https://arxiv.org/abs/1805.09655. 123 | """ 124 | 125 | def __init__(self, args, ontology, vocab): 126 | super().__init__() 127 | self.optimizer = None 128 | self.args = args 129 | self.vocab = vocab 130 | self.ontology = ontology 131 | self.emb_fixed = FixedEmbedding(len(vocab), args.demb, dropout=args.dropout.get('emb', 0.2)) 132 | 133 | self.utt_encoder = GLADEncoder(args.demb, args.dhid, self.ontology.slots, dropout=args.dropout) 134 | self.act_encoder = GLADEncoder(args.demb, args.dhid, self.ontology.slots, dropout=args.dropout) 135 | self.ont_encoder = GLADEncoder(args.demb, args.dhid, self.ontology.slots, dropout=args.dropout) 136 | self.utt_scorer = nn.Linear(2 * args.dhid, 1) 137 | self.score_weight = nn.Parameter(torch.Tensor([0.5])) 138 | 139 | @property 140 | def device(self): 141 | if self.args.gpu is not None and torch.cuda.is_available(): 142 | return torch.device('cuda') 143 | else: 144 | return torch.device('cpu') 145 | 146 | def set_optimizer(self): 147 | self.optimizer = optim.Adam(self.parameters(), lr=self.args.lr) 148 | 149 | def load_emb(self, Eword): 150 | new = self.emb_fixed.weight.data.new 151 | self.emb_fixed.weight.data.copy_(new(Eword)) 152 | 153 | def forward(self, batch): 154 | # convert to variables and look up embeddings 155 | eos = self.vocab.word2index('') 156 | utterance, utterance_len = pad([e.num['transcript'] for e in batch], self.emb_fixed, self.device, pad=eos) 157 | acts = [pad(e.num['system_acts'], self.emb_fixed, self.device, pad=eos) for e in batch] 158 | ontology = {s: pad(v, self.emb_fixed, self.device, pad=eos) for s, v in self.ontology.num.items()} 159 | 160 | ys = {} 161 | for s in self.ontology.slots: 162 | # for each slot, compute the scores for each value 163 | H_utt, c_utt = self.utt_encoder(utterance, utterance_len, slot=s) 164 | _, C_acts = list(zip(*[self.act_encoder(a, a_len, slot=s) for a, a_len in acts])) 165 | _, C_vals = 
self.ont_encoder(ontology[s][0], ontology[s][1], slot=s) 166 | 167 | # compute the utterance score 168 | y_utts = [] 169 | q_utts = [] 170 | for c_val in C_vals: 171 | q_utt, _ = attend(H_utt, c_val.unsqueeze(0).expand(len(batch), *c_val.size()), lens=utterance_len) 172 | q_utts.append(q_utt) 173 | y_utts = self.utt_scorer(torch.stack(q_utts, dim=1)).squeeze(2) 174 | 175 | # compute the previous action score 176 | q_acts = [] 177 | for i, C_act in enumerate(C_acts): 178 | q_act, _ = attend(C_act.unsqueeze(0), c_utt[i].unsqueeze(0), lens=[C_act.size(0)]) 179 | q_acts.append(q_act) 180 | y_acts = torch.cat(q_acts, dim=0).mm(C_vals.transpose(0, 1)) 181 | 182 | # combine the scores 183 | ys[s] = F.sigmoid(y_utts + self.score_weight * y_acts) 184 | 185 | if self.training: 186 | # create label variable and compute loss 187 | labels = {s: [len(self.ontology.values[s]) * [0] for i in range(len(batch))] for s in self.ontology.slots} 188 | for i, e in enumerate(batch): 189 | for s, v in e.turn_label: 190 | labels[s][i][self.ontology.values[s].index(v)] = 1 191 | labels = {s: torch.Tensor(m).to(self.device) for s, m in labels.items()} 192 | 193 | loss = 0 194 | for s in self.ontology.slots: 195 | loss += F.binary_cross_entropy(ys[s], labels[s]) 196 | else: 197 | loss = torch.Tensor([0]).to(self.device) 198 | return loss, {s: v.data.tolist() for s, v in ys.items()} 199 | 200 | def get_train_logger(self): 201 | logger = logging.getLogger('train-{}'.format(self.__class__.__name__)) 202 | formatter = logging.Formatter('%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s] %(message)s') 203 | file_handler = logging.FileHandler(os.path.join(self.args.dout, 'train.log')) 204 | file_handler.setFormatter(formatter) 205 | logger.addHandler(file_handler) 206 | return logger 207 | 208 | def run_train(self, train, dev, args): 209 | track = defaultdict(list) 210 | iteration = 0 211 | best = {} 212 | logger = self.get_train_logger() 213 | if self.optimizer is None: 214 | self.set_optimizer() 215 | 216 | for epoch in range(args.epoch): 217 | logger.info('starting epoch {}'.format(epoch)) 218 | 219 | # train and update parameters 220 | self.train() 221 | for batch in train.batch(batch_size=args.batch_size, shuffle=True): 222 | iteration += 1 223 | self.zero_grad() 224 | loss, scores = self.forward(batch) 225 | loss.backward() 226 | self.optimizer.step() 227 | track['loss'].append(loss.item()) 228 | 229 | # evalute on train and dev 230 | summary = {'iteration': iteration, 'epoch': epoch} 231 | for k, v in track.items(): 232 | summary[k] = sum(v) / len(v) 233 | summary.update({'eval_train_{}'.format(k): v for k, v in self.run_eval(train, args).items()}) 234 | summary.update({'eval_dev_{}'.format(k): v for k, v in self.run_eval(dev, args).items()}) 235 | 236 | # do early stopping saves 237 | stop_key = 'eval_dev_{}'.format(args.stop) 238 | train_key = 'eval_train_{}'.format(args.stop) 239 | if best.get(stop_key, 0) <= summary[stop_key]: 240 | best_dev = '{:f}'.format(summary[stop_key]) 241 | best_train = '{:f}'.format(summary[train_key]) 242 | best.update(summary) 243 | self.save( 244 | best, 245 | identifier='epoch={epoch},iter={iteration},train_{key}={train},dev_{key}={dev}'.format( 246 | epoch=epoch, iteration=iteration, train=best_train, dev=best_dev, key=args.stop, 247 | ) 248 | ) 249 | self.prune_saves() 250 | dev.record_preds( 251 | preds=self.run_pred(dev, self.args), 252 | to_file=os.path.join(self.args.dout, 'dev.pred.json'), 253 | ) 254 | summary.update({'best_{}'.format(k): v for k, v in best.items()}) 255 
| logger.info(pformat(summary)) 256 | track.clear() 257 | 258 | def extract_predictions(self, scores, threshold=0.5): 259 | batch_size = len(list(scores.values())[0]) 260 | predictions = [set() for i in range(batch_size)] 261 | for s in self.ontology.slots: 262 | for i, p in enumerate(scores[s]): 263 | triggered = [(s, v, p_v) for v, p_v in zip(self.ontology.values[s], p) if p_v > threshold] 264 | if s == 'request': 265 | # we can have multiple requests predictions 266 | predictions[i] |= set([(s, v) for s, v, p_v in triggered]) 267 | elif triggered: 268 | # only extract the top inform prediction 269 | sort = sorted(triggered, key=lambda tup: tup[-1], reverse=True) 270 | predictions[i].add((sort[0][0], sort[0][1])) 271 | return predictions 272 | 273 | def run_pred(self, dev, args): 274 | self.eval() 275 | predictions = [] 276 | for batch in dev.batch(batch_size=args.batch_size): 277 | loss, scores = self.forward(batch) 278 | predictions += self.extract_predictions(scores) 279 | return predictions 280 | 281 | def run_eval(self, dev, args): 282 | predictions = self.run_pred(dev, args) 283 | return dev.evaluate_preds(predictions) 284 | 285 | def save_config(self): 286 | fname = '{}/config.json'.format(self.args.dout) 287 | with open(fname, 'wt') as f: 288 | logging.info('saving config to {}'.format(fname)) 289 | json.dump(vars(self.args), f, indent=2) 290 | 291 | @classmethod 292 | def load_config(cls, fname, ontology, **kwargs): 293 | with open(fname) as f: 294 | logging.info('loading config from {}'.format(fname)) 295 | args = object() 296 | for k, v in json.load(f): 297 | setattr(args, k, kwargs.get(k, v)) 298 | return cls(args, ontology) 299 | 300 | def save(self, summary, identifier): 301 | fname = '{}/{}.t7'.format(self.args.dout, identifier) 302 | logging.info('saving model to {}'.format(fname)) 303 | state = { 304 | 'args': vars(self.args), 305 | 'model': self.state_dict(), 306 | 'summary': summary, 307 | 'optimizer': self.optimizer.state_dict(), 308 | } 309 | torch.save(state, fname) 310 | 311 | def load(self, fname): 312 | logging.info('loading model from {}'.format(fname)) 313 | state = torch.load(fname) 314 | self.load_state_dict(state['model']) 315 | self.set_optimizer() 316 | self.optimizer.load_state_dict(state['optimizer']) 317 | 318 | def get_saves(self, directory=None): 319 | if directory is None: 320 | directory = self.args.dout 321 | files = [f for f in os.listdir(directory) if f.endswith('.t7')] 322 | scores = [] 323 | for fname in files: 324 | re_str = r'dev_{}=([0-9\.]+)'.format(self.args.stop) 325 | dev_acc = re.findall(re_str, fname) 326 | if dev_acc: 327 | score = float(dev_acc[0].strip('.')) 328 | scores.append((score, os.path.join(directory, fname))) 329 | if not scores: 330 | raise Exception('No files found!') 331 | scores.sort(key=lambda tup: tup[0], reverse=True) 332 | return scores 333 | 334 | def prune_saves(self, n_keep=5): 335 | scores_and_files = self.get_saves() 336 | if len(scores_and_files) > n_keep: 337 | for score, fname in scores_and_files[n_keep:]: 338 | os.remove(fname) 339 | 340 | def load_best_save(self, directory): 341 | if directory is None: 342 | directory = self.args.dout 343 | 344 | scores_and_files = self.get_saves(directory=directory) 345 | if scores_and_files: 346 | assert scores_and_files, 'no saves exist at {}'.format(directory) 347 | score, fname = scores_and_files[0] 348 | self.load(fname) 349 | -------------------------------------------------------------------------------- /preprocess_data.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import json 4 | import logging 5 | import requests 6 | from tqdm import tqdm 7 | from vocab import Vocab 8 | from embeddings import GloveEmbedding, KazumaCharEmbedding 9 | from dataset import Dataset, Ontology 10 | 11 | 12 | root_dir = os.path.dirname(__file__) 13 | data_dir = os.path.join(root_dir, 'data', 'woz') 14 | 15 | 16 | draw = os.path.join(data_dir, 'raw') 17 | dann = os.path.join(data_dir, 'ann') 18 | 19 | splits = ['dev', 'train', 'test'] 20 | 21 | 22 | def download(url, to_file): 23 | r = requests.get(url, stream=True) 24 | with open(to_file, 'wb') as f: 25 | for chunk in r.iter_content(chunk_size=1024): 26 | if chunk: 27 | f.write(chunk) 28 | 29 | 30 | def missing_files(d, files): 31 | return not all([os.path.isfile(os.path.join(d, '{}.json'.format(s))) for s in files]) 32 | 33 | 34 | if __name__ == '__main__': 35 | if missing_files(draw, splits): 36 | if not os.path.isdir(draw): 37 | os.makedirs(draw) 38 | download('https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_train_en.json', os.path.join(draw, 'train.json')) 39 | download('https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_validate_en.json', os.path.join(draw, 'dev.json')) 40 | download('https://github.com/nmrksic/neural-belief-tracker/raw/master/data/woz/woz_test_en.json', os.path.join(draw, 'test.json')) 41 | 42 | if missing_files(dann, files=splits + ['ontology', 'vocab', 'emb']): 43 | if not os.path.isdir(dann): 44 | os.makedirs(dann) 45 | dataset = {} 46 | ontology = Ontology() 47 | vocab = Vocab() 48 | vocab.word2index(['', ''], train=True) 49 | for s in splits: 50 | fname = '{}.json'.format(s) 51 | logging.warn('Annotating {}'.format(s)) 52 | dataset[s] = Dataset.annotate_raw(os.path.join(draw, fname)) 53 | dataset[s].numericalize_(vocab) 54 | ontology = ontology + dataset[s].extract_ontology() 55 | with open(os.path.join(dann, fname), 'wt') as f: 56 | json.dump(dataset[s].to_dict(), f) 57 | ontology.numericalize_(vocab) 58 | with open(os.path.join(dann, 'ontology.json'), 'wt') as f: 59 | json.dump(ontology.to_dict(), f) 60 | with open(os.path.join(dann, 'vocab.json'), 'wt') as f: 61 | json.dump(vocab.to_dict(), f) 62 | 63 | logging.warn('Computing word embeddings') 64 | embeddings = [GloveEmbedding(), KazumaCharEmbedding()] 65 | E = [] 66 | for w in tqdm(vocab._index2word): 67 | e = [] 68 | for emb in embeddings: 69 | e += emb.emb(w, default='zero') 70 | E.append(e) 71 | with open(os.path.join(dann, 'emb.json'), 'wt') as f: 72 | json.dump(E, f) 73 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | protobuf==3.4.0 2 | requests==2.18.4 3 | stanza==0.3 4 | tqdm==4.19.1.post1 5 | vocab==0.0.3 6 | embeddings==0.0.4 7 | http://download.pytorch.org/whl/cu90/torch-0.4.0-cp36-cp36m-linux_x86_64.whl 8 | numpy==1.13.1 9 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser 3 | from utils import load_dataset, get_models, load_model 4 | import os 5 | import logging 6 | import numpy as np 7 | from pprint import pprint 8 | import torch 9 | from random import seed 10 | 11 | 12 | def run(args): 13 | pprint(args) 
14 | logging.basicConfig(level=logging.INFO) 15 | 16 | np.random.seed(args.seed) 17 | torch.manual_seed(args.seed) 18 | seed(args.seed) 19 | 20 | dataset, ontology, vocab, Eword = load_dataset() 21 | 22 | model = load_model(args.model, args, ontology, vocab) 23 | model.save_config() 24 | model.load_emb(Eword) 25 | 26 | model = model.to(model.device) 27 | if not args.test: 28 | logging.info('Starting train') 29 | model.run_train(dataset['train'], dataset['dev'], args) 30 | if args.resume: 31 | model.load_best_save(directory=args.resume) 32 | else: 33 | model.load_best_save(directory=args.dout) 34 | model = model.to(model.device) 35 | logging.info('Running dev evaluation') 36 | dev_out = model.run_eval(dataset['dev'], args) 37 | pprint(dev_out) 38 | 39 | 40 | def get_args(): 41 | parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter) 42 | parser.add_argument('--dexp', help='root experiment folder', default='exp') 43 | parser.add_argument('--model', help='which model to use', default='glad', choices=get_models()) 44 | parser.add_argument('--epoch', help='max epoch to run for', default=50, type=int) 45 | parser.add_argument('--demb', help='word embedding size', default=400, type=int) 46 | parser.add_argument('--dhid', help='hidden state size', default=200, type=int) 47 | parser.add_argument('--batch_size', help='batch size', default=50, type=int) 48 | parser.add_argument('--lr', help='learning rate', default=1e-3, type=float) 49 | parser.add_argument('--stop', help='slot to early stop on', default='joint_goal') 50 | parser.add_argument('--resume', help='save directory to resume from') 51 | parser.add_argument('-n', '--nick', help='nickname for model', default='default') 52 | parser.add_argument('--seed', default=42, help='random seed', type=int) 53 | parser.add_argument('--test', action='store_true', help='run in evaluation only mode') 54 | parser.add_argument('--gpu', type=int, help='which GPU to use') 55 | parser.add_argument('--dropout', nargs='*', help='dropout rates', default=['emb=0.2', 'local=0.2', 'global=0.2']) 56 | args = parser.parse_args() 57 | args.dout = os.path.join(args.dexp, args.model, args.nick) 58 | args.dropout = {d.split('=')[0]: float(d.split('=')[1]) for d in args.dropout} 59 | if not os.path.isdir(args.dout): 60 | os.makedirs(args.dout) 61 | return args 62 | 63 | 64 | if __name__ == '__main__': 65 | args = get_args() 66 | run(args) 67 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | from pprint import pformat 5 | from importlib import import_module 6 | from vocab import Vocab 7 | from dataset import Dataset, Ontology 8 | from preprocess_data import dann 9 | 10 | 11 | def load_dataset(splits=('train', 'dev', 'test')): 12 | with open(os.path.join(dann, 'ontology.json')) as f: 13 | ontology = Ontology.from_dict(json.load(f)) 14 | with open(os.path.join(dann, 'vocab.json')) as f: 15 | vocab = Vocab.from_dict(json.load(f)) 16 | with open(os.path.join(dann, 'emb.json')) as f: 17 | E = json.load(f) 18 | dataset = {} 19 | for split in splits: 20 | with open(os.path.join(dann, '{}.json'.format(split))) as f: 21 | logging.warn('loading split {}'.format(split)) 22 | dataset[split] = Dataset.from_dict(json.load(f)) 23 | 24 | logging.info('dataset sizes: {}'.format(pformat({k: len(v) for k, v in dataset.items()}))) 25 | return dataset, ontology, vocab, E 26 | 27 | 28 | def get_models(): 29 
| return [m.replace('.py', '') for m in os.listdir('models') if not m.startswith('_') and m != 'model'] 30 | 31 | 32 | def load_model(model, *args, **kwargs): 33 | Model = import_module('models.{}'.format(model)).Model 34 | model = Model(*args, **kwargs) 35 | logging.info('loaded model {}'.format(Model)) 36 | return model 37 | -------------------------------------------------------------------------------- /version.txt: -------------------------------------------------------------------------------- 1 | 0.1 2 | --------------------------------------------------------------------------------
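The README's evaluation section notes that turn-level predictions have to be matched back up with their dialogues. Below is a minimal sketch of that flow, mirroring `evaluate.py`; the paths and settings are assumptions (data preprocessed by `preprocess_data.py`, a model trained with the defaults so the best checkpoint sits under `exp/glad/default`). It relies on the fact that `Model.run_pred` returns one set of slot-value pairs per turn in the same order that `Dataset.iter_turns()` walks the dialogues, which is also the ordering `evaluate_preds` and `record_preds` assume.

```python
# Minimal sketch mirroring evaluate.py; paths are assumptions
# (preprocessed data from preprocess_data.py, checkpoints under exp/glad/default).
import json
from argparse import Namespace

from utils import load_dataset, load_model

# restore the training configuration saved by Model.save_config
with open('exp/glad/default/config.json') as f:
    args = Namespace(**json.load(f))
args.gpu = None  # or a GPU index, as in evaluate.py

dataset, ontology, vocab, Eword = load_dataset()
model = load_model(args.model, args, ontology, vocab)
model.load_best_save(directory='exp/glad/default')
if args.gpu is not None:
    model.cuda(args.gpu)

# run_pred yields one set of (slot, value) pairs per turn, in the same order
# that Dataset.iter_turns() yields turns, so the two can be zipped together.
preds = model.run_pred(dataset['dev'], args)
for turn, pred in zip(dataset['dev'].iter_turns(), preds):
    print(' '.join(turn.transcript), '->', sorted(pred))

print(dataset['dev'].evaluate_preds(preds))
```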