├── README.md
├── fed.py
└── fed_demo.py


/README.md:
--------------------------------------------------------------------------------
 1 | # FED
 2 | 
 3 | ```
 4 | import fed
 5 |   
 6 | # Load model
 7 | model, tokenizer = fed.load_models("microsoft/DialoGPT-large")
 8 | 
 9 | # Evaluate
10 | conversation = "<|endoftext|> Hi! <|endoftext|> Hello, how is your day? <|endoftext|> It's good. It's raining a bit, but I am enjoying a good book. How about you? <|endoftext|> It's good, I just got back from walking my dog What book did you read?"
11 | scores = fed.evaluate(conversation,
12 |                       model,
13 |                       tokenizer)
14 | ```
15 | 


--------------------------------------------------------------------------------
/fed.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import json
  3 | 
  4 | import numpy as np
  5 | import torch
  6 | from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler
  7 | from torch.utils.data.distributed import DistributedSampler
  8 | 
  9 | 
 10 | import math
 11 | from transformers import AutoTokenizer, AutoModelWithLMHead
 12 | 
 13 | # Old loading code. Use for from-scratch models
 14 | #tokenizer = GPT2Tokenizer.from_pretrained('dialogpt')
 15 | #model = GPT2LMHeadModel.from_pretrained('gpt2')
 16 | #weights = torch.load("dialogpt/small_fs.pkl")
 17 | #weights = {k.replace("module.", ""): v for k,v in weights.items()}
 18 | #weights["lm_head.weight"] = weights["lm_head.decoder.weight"]
 19 | #weights.pop("lm_head.decoder.weight",None)
 20 | #model.load_state_dict(weights)
 21 | 
 22 | 
 23 | def load_models(name="microsoft/DialoGPT-large"):
 24 |   tokenizer = AutoTokenizer.from_pretrained(name)
 25 |   model = AutoModelWithLMHead.from_pretrained(name)
 26 |   model.to("cuda")
 27 |   return model, tokenizer
 28 | 
 29 | def score(text, tokenizer, model):
 30 |   if not text.startswith("<|endoftext|> "):
 31 |     text = "<|endoftext|> " + text
 32 |   input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)  # Batch size 1
 33 |   tokenize_input = tokenizer.tokenize(text)
 34 |   #50256 is the token_id for <|endoftext|>
 35 |   tensor_input = torch.tensor([ tokenizer.convert_tokens_to_ids(tokenize_input)]).cuda()
 36 |   with torch.no_grad():
 37 |       outputs = model(tensor_input, labels=tensor_input)
 38 |       loss, logits = outputs[:2]
 39 | 
 40 |   return loss.item() 
 41 | 
 42 | def evaluate(conversation, model, tokenizer):
 43 |   scores = {}
 44 |   turn_level_utts = {
 45 |     "interesting": {
 46 |       "positive": ["Wow that is really interesting.", "That's really interesting!", "Cool! That sounds super interesting."],
 47 |       "negative": ["That's not very interesting.", "That's really boring.", "That was a really boring response."]
 48 |     },
 49 |     "engaging": {
 50 |       "positive": ["Wow! That's really cool!", "Tell me more!", "I'm really interested in learning more about this."],
 51 |       "negative": ["Let's change the topic.", "I don't really care. That's pretty boring.", "I want to talk about something else."]
 52 |     },
 53 |     "specific": {
 54 |       "positive": ["That's good to know. Cool!", "I see, that's interesting.", "That's a good point."],
 55 |       "negative": ["That's a very generic response.", "Not really relevant here.", "That's not really relevant here."]
 56 |     },
 57 |     "relevant": {
 58 |       "positive": [],
 59 |       "negative": ["That's not even related to what I said.", "Don't change the topic!", "Why are you changing the topic?"]
 60 |     },
 61 |     "correct": {
 62 |       "positive": [],
 63 |       "negative": ["You're not understanding me!", "I am so confused right now!", "I don't understand what you're saying."]
 64 |     },
 65 |     "semantically appropriate": {
 66 |       "positive": ["That makes sense!", "You have a good point."],
 67 |       "negative": ["That makes no sense!"]
 68 |     },
 69 |     "understandable": {
 70 |       "positive": ["That makes sense!", "You have a good point."],
 71 |       "negative": ["I don't understand at all!", "I'm so confused!", "That makes no sense!", "What does that even mean?"]
 72 |     },
 73 |     "fluent": {
 74 |       "positive": ["That makes sense!", "You have a good point."],
 75 |       "negative": ["Is that real English?", "I'm so confused right now!", "That makes no sense!"]
 76 |     },
 77 |   }
 78 |   for metric,utts in turn_level_utts.items():
 79 |     pos = utts["positive"]
 80 |     neg = utts["negative"]
 81 | 
 82 |     # Positive score
 83 |     high_score = 0
 84 |     for m in pos:
 85 |       hs = score(conversation + " <|endoftext|> " + m, tokenizer, model) 
 86 |       high_score += hs 
 87 | 
 88 |     high_score = high_score/max(len(pos), 1)
 89 | 
 90 |     # Negative score
 91 |     low_score = 0
 92 |     for m in neg:
 93 |       ls = score(conversation + " <|endoftext|> " + m, tokenizer, model) 
 94 |       low_score += ls 
 95 |     low_score = low_score/max(len(neg), 1)
 96 | 
 97 |     scores[metric] = (low_score - high_score)
 98 | 
 99 |   dialog_level_utts = {
100 |     "coherent": {
101 |       "positive": [],
102 |       "negative": ["You're making no sense at all.", "You're changing the topic so much!", "You are so confusing."]
103 |     },
104 |     "error recovery": {
105 |       "positive": [],
106 |       "negative": ["I am so confused right now.", "You're really confusing.", "I don't understand what you're saying."]
107 |     },
108 |     "consistent": {
109 |       "positive": [],
110 |       "negative": ["That's not what you said earlier!", "Stop contradicting yourself!"],
111 |     },
112 |     "diverse": {
113 |       "positive": [],
114 |       "negative": ["Stop saying the same thing repeatedly.", "Why are you repeating yourself?", "Stop repeating yourself!"]
115 |     },
116 |     "depth": {
117 |       "positive": [],
118 |       "negative": ["Stop changing the topic so much.", "Don't change the topic!"],
119 |     },
120 |     "likeable": {
121 |       "positive": ["I like you!", "You're super polite and fun to talk to", "Great talking to you."],
122 |       "negative": ["You're not very nice.", "You're not very fun to talk to.", "I don't like you."]
123 |     },
124 |     "understand": {
125 |       "positive": [],
126 |       "negative": ["You're not understanding me!", "What are you trying to say?", "I don't understand what you're saying."]
127 |     },
128 |     "flexible": {
129 |       "positive": ["You're very easy to talk to!", "Wow you can talk about a lot of things!"],
130 |       "negative": ["I don't want to talk about that!", "Do you know how to talk about something else?"],
131 |     },
132 |     "informative": {
133 |       "positive": ["Thanks for all the information!", "Wow that's a lot of information.", "You know a lot of facts!"],
134 |       "negative": ["You're really boring.", "You don't really know much."],
135 |     },
136 |     "inquisitive": {
137 |       "positive": ["You ask a lot of questions!", "That's a lot of questions!"],
138 |       "negative": ["You don't ask many questions.", "You don't seem interested."],
139 |     },
140 |   }
141 |   for metric,utts in dialog_level_utts.items():
142 |     pos = utts["positive"]
143 |     neg = utts["negative"]
144 | 
145 |     # Positive
146 |     high_score = 0
147 |     for m in pos:
148 |       hs = score(conversation + " <|endoftext|> " + m, tokenizer, model) 
149 |       high_score += hs 
150 | 
151 |     high_score = high_score/max(len(pos), 1)
152 | 
153 |     # Negative
154 |     low_score = 0
155 |     for m in neg:
156 |       ls = score(conversation + " <|endoftext|> " + m, tokenizer, model) 
157 |       low_score += ls 
158 |     low_score = low_score/max(len(neg), 1)
159 | 
160 |     scores[metric] = (low_score - high_score)
161 | 
162 |   return scores
163 | 


--------------------------------------------------------------------------------
/fed_demo.py:
--------------------------------------------------------------------------------
 1 | import fed
 2 | 
 3 | # Load model
 4 | model, tokenizer = fed.load_models("microsoft/DialoGPT-large")
 5 | 
 6 | # Evaluate
 7 | conversation = "<|endoftext|> Hi! <|endoftext|> Hello, how is your day? <|endoftext|> It's good. It's raining a bit, but I am enjoying a good book. How about you? <|endoftext|> It's good, I just got back from walking my dog What book did you read?"
 8 | scores = fed.evaluate(conversation, 
 9 |                       model, 
10 |                       tokenizer)
11 | 
12 | 


--------------------------------------------------------------------------------