├── README.md ├── fed.py └── fed_demo.py /README.md: -------------------------------------------------------------------------------- 1 | # FED 2 | 3 | ``` 4 | import fed 5 | 6 | # Load model 7 | model, tokenizer = fed.load_models("microsoft/DialoGPT-large") 8 | 9 | # Evaluate 10 | conversation = "<|endoftext|> Hi! <|endoftext|> Hello, how is your day? <|endoftext|> It's good. It's raining a bit, but I am enjoying a good book. How about you? <|endoftext|> It's good, I just got back from walking my dog What book did you read?" 11 | scores = fed.evaluate(conversation, 12 | model, 13 | tokenizer) 14 | ``` 15 | -------------------------------------------------------------------------------- /fed.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | import numpy as np 5 | import torch 6 | from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler 7 | from torch.utils.data.distributed import DistributedSampler 8 | 9 | 10 | import math 11 | from transformers import AutoTokenizer, AutoModelWithLMHead 12 | 13 | # Old loading code. Use for from-scratch models 14 | #tokenizer = GPT2Tokenizer.from_pretrained('dialogpt') 15 | #model = GPT2LMHeadModel.from_pretrained('gpt2') 16 | #weights = torch.load("dialogpt/small_fs.pkl") 17 | #weights = {k.replace("module.", ""): v for k,v in weights.items()} 18 | #weights["lm_head.weight"] = weights["lm_head.decoder.weight"] 19 | #weights.pop("lm_head.decoder.weight",None) 20 | #model.load_state_dict(weights) 21 | 22 | 23 | def load_models(name="microsoft/DialoGPT-large"): 24 | tokenizer = AutoTokenizer.from_pretrained(name) 25 | model = AutoModelWithLMHead.from_pretrained(name) 26 | model.to("cuda") 27 | return model, tokenizer 28 | 29 | def score(text, tokenizer, model): 30 | if not text.startswith("<|endoftext|> "): 31 | text = "<|endoftext|> " + text 32 | input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0) # Batch size 1 33 | tokenize_input = tokenizer.tokenize(text) 34 | #50256 is the token_id for <|endoftext|> 35 | tensor_input = torch.tensor([ tokenizer.convert_tokens_to_ids(tokenize_input)]).cuda() 36 | with torch.no_grad(): 37 | outputs = model(tensor_input, labels=tensor_input) 38 | loss, logits = outputs[:2] 39 | 40 | return loss.item() 41 | 42 | def evaluate(conversation, model, tokenizer): 43 | scores = {} 44 | turn_level_utts = { 45 | "interesting": { 46 | "positive": ["Wow that is really interesting.", "That's really interesting!", "Cool! That sounds super interesting."], 47 | "negative": ["That's not very interesting.", "That's really boring.", "That was a really boring response."] 48 | }, 49 | "engaging": { 50 | "positive": ["Wow! That's really cool!", "Tell me more!", "I'm really interested in learning more about this."], 51 | "negative": ["Let's change the topic.", "I don't really care. That's pretty boring.", "I want to talk about something else."] 52 | }, 53 | "specific": { 54 | "positive": ["That's good to know. Cool!", "I see, that's interesting.", "That's a good point."], 55 | "negative": ["That's a very generic response.", "Not really relevant here.", "That's not really relevant here."] 56 | }, 57 | "relevant": { 58 | "positive": [], 59 | "negative": ["That's not even related to what I said.", "Don't change the topic!", "Why are you changing the topic?"] 60 | }, 61 | "correct": { 62 | "positive": [], 63 | "negative": ["You're not understanding me!", "I am so confused right now!", "I don't understand what you're saying."] 64 | }, 65 | "semantically appropriate": { 66 | "positive": ["That makes sense!", "You have a good point."], 67 | "negative": ["That makes no sense!"] 68 | }, 69 | "understandable": { 70 | "positive": ["That makes sense!", "You have a good point."], 71 | "negative": ["I don't understand at all!", "I'm so confused!", "That makes no sense!", "What does that even mean?"] 72 | }, 73 | "fluent": { 74 | "positive": ["That makes sense!", "You have a good point."], 75 | "negative": ["Is that real English?", "I'm so confused right now!", "That makes no sense!"] 76 | }, 77 | } 78 | for metric,utts in turn_level_utts.items(): 79 | pos = utts["positive"] 80 | neg = utts["negative"] 81 | 82 | # Positive score 83 | high_score = 0 84 | for m in pos: 85 | hs = score(conversation + " <|endoftext|> " + m, tokenizer, model) 86 | high_score += hs 87 | 88 | high_score = high_score/max(len(pos), 1) 89 | 90 | # Negative score 91 | low_score = 0 92 | for m in neg: 93 | ls = score(conversation + " <|endoftext|> " + m, tokenizer, model) 94 | low_score += ls 95 | low_score = low_score/max(len(neg), 1) 96 | 97 | scores[metric] = (low_score - high_score) 98 | 99 | dialog_level_utts = { 100 | "coherent": { 101 | "positive": [], 102 | "negative": ["You're making no sense at all.", "You're changing the topic so much!", "You are so confusing."] 103 | }, 104 | "error recovery": { 105 | "positive": [], 106 | "negative": ["I am so confused right now.", "You're really confusing.", "I don't understand what you're saying."] 107 | }, 108 | "consistent": { 109 | "positive": [], 110 | "negative": ["That's not what you said earlier!", "Stop contradicting yourself!"], 111 | }, 112 | "diverse": { 113 | "positive": [], 114 | "negative": ["Stop saying the same thing repeatedly.", "Why are you repeating yourself?", "Stop repeating yourself!"] 115 | }, 116 | "depth": { 117 | "positive": [], 118 | "negative": ["Stop changing the topic so much.", "Don't change the topic!"], 119 | }, 120 | "likeable": { 121 | "positive": ["I like you!", "You're super polite and fun to talk to", "Great talking to you."], 122 | "negative": ["You're not very nice.", "You're not very fun to talk to.", "I don't like you."] 123 | }, 124 | "understand": { 125 | "positive": [], 126 | "negative": ["You're not understanding me!", "What are you trying to say?", "I don't understand what you're saying."] 127 | }, 128 | "flexible": { 129 | "positive": ["You're very easy to talk to!", "Wow you can talk about a lot of things!"], 130 | "negative": ["I don't want to talk about that!", "Do you know how to talk about something else?"], 131 | }, 132 | "informative": { 133 | "positive": ["Thanks for all the information!", "Wow that's a lot of information.", "You know a lot of facts!"], 134 | "negative": ["You're really boring.", "You don't really know much."], 135 | }, 136 | "inquisitive": { 137 | "positive": ["You ask a lot of questions!", "That's a lot of questions!"], 138 | "negative": ["You don't ask many questions.", "You don't seem interested."], 139 | }, 140 | } 141 | for metric,utts in dialog_level_utts.items(): 142 | pos = utts["positive"] 143 | neg = utts["negative"] 144 | 145 | # Positive 146 | high_score = 0 147 | for m in pos: 148 | hs = score(conversation + " <|endoftext|> " + m, tokenizer, model) 149 | high_score += hs 150 | 151 | high_score = high_score/max(len(pos), 1) 152 | 153 | # Negative 154 | low_score = 0 155 | for m in neg: 156 | ls = score(conversation + " <|endoftext|> " + m, tokenizer, model) 157 | low_score += ls 158 | low_score = low_score/max(len(neg), 1) 159 | 160 | scores[metric] = (low_score - high_score) 161 | 162 | return scores 163 | -------------------------------------------------------------------------------- /fed_demo.py: -------------------------------------------------------------------------------- 1 | import fed 2 | 3 | # Load model 4 | model, tokenizer = fed.load_models("microsoft/DialoGPT-large") 5 | 6 | # Evaluate 7 | conversation = "<|endoftext|> Hi! <|endoftext|> Hello, how is your day? <|endoftext|> It's good. It's raining a bit, but I am enjoying a good book. How about you? <|endoftext|> It's good, I just got back from walking my dog What book did you read?" 8 | scores = fed.evaluate(conversation, 9 | model, 10 | tokenizer) 11 | 12 | --------------------------------------------------------------------------------