├── Demo
│   ├── Tanul.PNG
│   ├── sa.PNG
│   └── saving.PNG
├── Humorous Sentence Completion
│   ├── config.py
│   ├── inference.py
│   └── training.py
├── Joke Generation
│   ├── Inference.py
│   ├── config.py
│   └── training.py
└── README.md

/Demo/Tanul.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tanulsingh/Humour.ai-Language-model-that-can-crack-Jokes/cee5fddd0939705487b04ae9f96505d81c6e45d6/Demo/Tanul.PNG
--------------------------------------------------------------------------------
/Demo/sa.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tanulsingh/Humour.ai-Language-model-that-can-crack-Jokes/cee5fddd0939705487b04ae9f96505d81c6e45d6/Demo/sa.PNG
--------------------------------------------------------------------------------
/Demo/saving.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tanulsingh/Humour.ai-Language-model-that-can-crack-Jokes/cee5fddd0939705487b04ae9f96505d81c6e45d6/Demo/saving.PNG
--------------------------------------------------------------------------------
/Humorous Sentence Completion/config.py:
--------------------------------------------------------------------------------
from transformers import GPT2Tokenizer

BATCH_SIZE = 16
EPOCHS = 4
LEARNING_RATE = 3e-5
MAX_LEN = 64
TRAIN_PATH = "/content/gdrive/My Drive/shortjokes.csv"
MODEL_FOLDER = "/content/gdrive/My Drive/Colab Notebooks/trained_models"
Tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
--------------------------------------------------------------------------------
/Humorous Sentence Completion/inference.py:
--------------------------------------------------------------------------------
# Preliminaries
import os
import numpy as np
import pandas as pd

# Transformers
from transformers import GPT2LMHeadModel

# PyTorch
import torch
import torch.nn as nn

# Warnings
import warnings
warnings.filterwarnings('ignore')

# My Module
import config

# Helper Function
def choose_from_top(probs, n=5):
    ind = np.argpartition(probs, -n)[-n:]
    top_prob = probs[ind]
    top_prob = top_prob / np.sum(top_prob)  # Normalize
    choice = np.random.choice(n, 1, p=top_prob)
    token_id = ind[choice][0]
    return int(token_id)
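
# The block below rebuilds the fine-tuned model for inference: it loads pretrained
# GPT-2 medium, registers the same special tokens that were added during training,
# and resizes the embedding matrix so the new token ids are valid before the saved
# weights are loaded.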
# Model Loading
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
# Special-token strings follow the <|soq|>/<|sep|> format described in the README;
# the pad token just needs to be an otherwise unused string.
special_tokens_dict = {'pad_token': '<|pad|>', 'bos_token': '<|soq|>', 'sep_token': '<|sep|>'}
num_added_toks = config.Tokenizer.add_special_tokens(special_tokens_dict)
print('We have added', num_added_toks, 'tokens')
model.resize_token_embeddings(len(config.Tokenizer))

# Loading Model state
models_path = "/kaggle/input/fine-tuning-open-gp-2/trained_models/gpt2_medium_joker_0.pt"  # ADD PATH TO YOUR SAVED MODEL HERE
model.load_state_dict(torch.load(models_path))

device = 'cuda'
model.to(device)

def predict(start_of_joke, length_of_joke=96, number_of_jokes=2):
    joke_num = 0
    model.eval()
    with torch.no_grad():
        for joke_idx in range(number_of_jokes):

            joke_finished = False

            cur_ids = torch.tensor(config.Tokenizer.encode(start_of_joke)).unsqueeze(0).to(device)

            for i in range(length_of_joke):
                outputs = model(cur_ids, labels=cur_ids)
                loss, logits = outputs[:2]
                softmax_logits = torch.softmax(logits[0, -1], dim=0)  # Take the first (and only) batch and the last predicted position
                if i < 3:
                    n = 20
                else:
                    n = 3
                next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=n)  # Randomly select the next token from the top-n distribution
                cur_ids = torch.cat([cur_ids, torch.ones((1, 1)).long().to(device) * next_token_id], dim=1)  # Append the chosen token to the running sequence

                if next_token_id in config.Tokenizer.encode('<|endoftext|>'):
                    joke_finished = True
                    break

            if joke_finished:

                joke_num = joke_num + 1

                output_list = list(cur_ids.squeeze().to('cpu').numpy())
                output_text = config.Tokenizer.decode(output_list)

                print(output_text + '\n')

# Start Predicting
predict("How do you feel", 64, 1)
--------------------------------------------------------------------------------
/Humorous Sentence Completion/training.py:
--------------------------------------------------------------------------------
# Preliminaries
import os
import pandas as pd
import numpy as np

# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

# Transformers
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup

# Warnings
import warnings
warnings.filterwarnings('ignore')

# My Module
import config

# Processing Data
def process_jokes(raw_fp):
    df = pd.read_csv(raw_fp)

    # Keep only question-style jokes and wrap each one in the special tokens,
    # appending <|endoftext|> to mark the end of a joke

    what_jokes = df[df.Joke.str.lower().str.startswith("what")].Joke.str.split("?")
    how_jokes = df[df.Joke.str.lower().str.startswith("how")].Joke.str.split("?")
    why_jokes = df[df.Joke.str.lower().str.startswith("why")].Joke.str.split("?")
    when_jokes = df[df.Joke.str.lower().str.startswith("when")].Joke.str.split("?")
    where_jokes = df[df.Joke.str.lower().str.startswith("where")].Joke.str.split("?")

    jokes = []
    for joke_ in [what_jokes, how_jokes, why_jokes, when_jokes, where_jokes]:
        joke_df_ = pd.DataFrame(joke_.values.tolist()).iloc[:, :2].dropna()
        joke_df_.columns = ["questions", "answer"]
        jokes.append(joke_df_)

    jokes_df = pd.concat(jokes)
    jokes_df = (
        jokes_df[~(jokes_df.answer.isin([""]))].drop_duplicates().reset_index(drop=True)
    )

    riddle_jokes_list = (
        "<|soq|> " + jokes_df.questions + " <|sep|> " + jokes_df.answer + " <|endoftext|>"
    ).values.tolist()
    riddle_jokes = "\n".join(riddle_jokes_list)

    return riddle_jokes_list
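
# Each element of the returned list is one question-answer joke wrapped in the
# special tokens, e.g. (made-up joke, not necessarily from the dataset):
#   "<|soq|> Why did the chicken cross the road <|sep|> To get to the other side <|endoftext|>"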

# Creating Custom DataSet

class Jokesdataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        joke = self.data[idx]

        inputs = self.tokenizer.encode_plus(
            joke,
            None,
            add_special_tokens=True,
            max_length=config.MAX_LEN,
            pad_to_max_length=True
        )

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]

        return {'ids': torch.tensor(ids, dtype=torch.long),
                'mask': torch.tensor(mask, dtype=torch.long),
                'target': torch.tensor(ids, dtype=torch.long)}


# Initializing Model and adding our special Tokens to model vocab

model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
# Special-token strings follow the <|soq|>/<|sep|> format described in the README;
# the pad token just needs to be an otherwise unused string.
special_tokens_dict = {'pad_token': '<|pad|>', 'bos_token': '<|soq|>', 'sep_token': '<|sep|>'}
num_added_toks = config.Tokenizer.add_special_tokens(special_tokens_dict)
print('We have added', num_added_toks, 'tokens')
model.resize_token_embeddings(len(config.Tokenizer))

# Training Function

def train_fn(data_loader, model, optimizer, device, scheduler, epoch):
    model.train()

    for bi, d in enumerate(data_loader):
        ids = d["ids"]
        mask = d["mask"]
        labels = d['target']

        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        labels = labels.to(device, dtype=torch.long)

        optimizer.zero_grad()
        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            labels=labels
        )

        loss, logits = outputs[:2]
        loss.backward()

        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        if (bi + 1) % 100 == 0:
            print('Epoch [{}/{}], bi[{}/{}], Loss: {:.4f}'
                  .format(epoch + 1, config.EPOCHS, bi + 1, len(data_loader), loss.item()))

device = 'cuda'  # Selecting Device

# ENGINE

def run():
    joke_list = process_jokes(config.TRAIN_PATH)

    jokes_dataset = Jokesdataset(joke_list, config.Tokenizer)
    jokes_dataloader = DataLoader(jokes_dataset,
                                  batch_size=config.BATCH_SIZE,
                                  shuffle=True,
                                  num_workers=4)

    model.to(device)

    # One optimizer step per batch per epoch
    num_train_steps = len(jokes_dataloader) * config.EPOCHS

    optimizer = AdamW(model.parameters(), lr=config.LEARNING_RATE)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    for epoch in range(config.EPOCHS):
        print(f"EPOCH {epoch+1} started" + '=' * 30)
        train_fn(jokes_dataloader, model, optimizer, device, scheduler, epoch=epoch)

    models_folder = config.MODEL_FOLDER
    if not os.path.exists(models_folder):
        os.mkdir(models_folder)
    torch.save(model.state_dict(), os.path.join(models_folder, "gpt2_medium_joker_3.pt"))


# Begin Training
run()
--------------------------------------------------------------------------------
/Joke Generation/Inference.py:
--------------------------------------------------------------------------------
# Preliminaries
import os
import numpy as np
import pandas as pd

# Transformers
from transformers import GPT2LMHeadModel

# PyTorch
import torch
import torch.nn as nn

# Warnings
import warnings
warnings.filterwarnings('ignore')

# My Module
import config
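
# choose_from_top below implements simple top-n sampling: keep the n highest-probability
# tokens, renormalize their probabilities, and draw the next token from that distribution.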
# Helper Function
def choose_from_top(probs, n=5):
    ind = np.argpartition(probs, -n)[-n:]
    top_prob = probs[ind]
    top_prob = top_prob / np.sum(top_prob)  # Normalize
    choice = np.random.choice(n, 1, p=top_prob)
    token_id = ind[choice][0]
    return int(token_id)

# Model Loading
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
special_tokens_dict = {'pad_token': '<|pad|>'}  # the pad token just needs to be an otherwise unused string
num_added_toks = config.Tokenizer.add_special_tokens(special_tokens_dict)
print('We have added', num_added_toks, 'tokens')
model.resize_token_embeddings(len(config.Tokenizer))

# Loading Model State
models_path = "/kaggle/input/fine-tuning-open-gp-2/trained_models/gpt2_medium_joker_0.pt"  # ADD PATH TO YOUR SAVED MODEL HERE
model.load_state_dict(torch.load(models_path))

device = 'cuda'
model.to(device)

def predict(length_of_joke, number_of_jokes):
    joke_num = 0
    model.eval()
    with torch.no_grad():
        for joke_idx in range(number_of_jokes):

            joke_finished = False

            cur_ids = torch.tensor(config.Tokenizer.encode('JOKE')).unsqueeze(0).to(device)

            for i in range(length_of_joke):
                outputs = model(cur_ids, labels=cur_ids)
                loss, logits = outputs[:2]
                softmax_logits = torch.softmax(logits[0, -1], dim=0)  # Take the first (and only) batch and the last predicted position
                if i < 3:
                    n = 20
                else:
                    n = 3
                next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=n)  # Randomly select the next token from the top-n distribution
                cur_ids = torch.cat([cur_ids, torch.ones((1, 1)).long().to(device) * next_token_id], dim=1)  # Append the chosen token to the running sequence

                if next_token_id in config.Tokenizer.encode('<|endoftext|>'):
                    joke_finished = True
                    break

            if joke_finished:

                joke_num = joke_num + 1

                output_list = list(cur_ids.squeeze().to('cpu').numpy())
                output_text = config.Tokenizer.decode(output_list)

                print(output_text + '\n')

# Start Predicting
predict(64, 5)
--------------------------------------------------------------------------------
/Joke Generation/config.py:
--------------------------------------------------------------------------------
from transformers import GPT2Tokenizer


BATCH_SIZE = 16
EPOCHS = 4
LEARNING_RATE = 2e-5
MAX_LEN = 64
TRAIN_PATH = "/kaggle/input/short-jokes/shortjokes.csv"  # ADD PATH TO YOUR DATASET HERE
MODEL_FOLDER = "/kaggle/working/trained_models"  # ADD PATH TO WHERE YOU WANT TO SAVE YOUR MODEL
Tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
--------------------------------------------------------------------------------
/Joke Generation/training.py:
--------------------------------------------------------------------------------
'''
This file contains the training code for the Joke Generation model
'''
# Preliminaries
import os
import pandas as pd
import numpy as np

# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

# Transformers
from transformers import GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup

# Warnings
import warnings
warnings.filterwarnings('ignore')

# My Module
import config

# INITIALIZING MODEL AND ADDING THE PAD TOKEN
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
special_tokens_dict = {'pad_token': '<|pad|>'}  # the pad token just needs to be an otherwise unused string
num_added_toks = config.Tokenizer.add_special_tokens(special_tokens_dict)
print('We have added', num_added_toks, 'tokens')
model.resize_token_embeddings(len(config.Tokenizer))
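
# Each training example is the raw joke from the CSV wrapped as "JOKE:<joke text><|endoftext|>",
# e.g. "JOKE:Why did the chicken cross the road? To get to the other side.<|endoftext|>"
# (made-up example); the Jokesdataset class below applies this wrapping.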

# Dataset
class Jokesdataset(Dataset):
    '''
    This class builds the custom dataset for the DataLoader
    '''
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer
        self.eos_tok = "<|endoftext|>"
        # Adding JOKE: at the start and the EOS token at the end of every joke
        self.data['Joke'] = self.data['Joke'].apply(lambda x: "JOKE:" + str(x) + self.eos_tok)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        joke = self.data.iloc[idx, 1]

        inputs = self.tokenizer.encode_plus(
            joke,
            None,
            add_special_tokens=True,
            max_length=config.MAX_LEN,
            pad_to_max_length=True
        )

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]

        return {'ids': torch.tensor(ids, dtype=torch.long),
                'mask': torch.tensor(mask, dtype=torch.long),
                'target': torch.tensor(ids, dtype=torch.long)}


# Training Function

def train_fn(data_loader, model, optimizer, device, scheduler, epoch):
    model.train()
    for bi, d in enumerate(data_loader):
        ids = d["ids"]
        mask = d["mask"]
        labels = d['target']

        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        labels = labels.to(device, dtype=torch.long)

        optimizer.zero_grad()
        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            labels=labels
        )

        loss, logits = outputs[:2]
        loss.backward()

        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        if (bi + 1) % 500 == 0:
            print('Epoch [{}/{}], bi[{}/{}], Loss: {:.4f}'
                  .format(epoch + 1, config.EPOCHS, bi + 1, len(data_loader), loss.item()))

device = 'cuda'  # Selecting Device

# ENGINE

def run():
    jokes = pd.read_csv(config.TRAIN_PATH)  # add the path to your dataset in the config file

    jokes_dataset = Jokesdataset(jokes, config.Tokenizer)
    jokes_dataloader = DataLoader(jokes_dataset,
                                  batch_size=config.BATCH_SIZE,
                                  shuffle=True,
                                  num_workers=4)

    model.to(device)

    # One optimizer step per batch per epoch
    num_train_steps = len(jokes_dataloader) * config.EPOCHS

    optimizer = AdamW(model.parameters(), lr=config.LEARNING_RATE)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    for epoch in range(config.EPOCHS):
        print(f"EPOCH {epoch+1} started" + '=' * 30)
        train_fn(jokes_dataloader, model, optimizer, device, scheduler, epoch=epoch)

        models_folder = config.MODEL_FOLDER
        if not os.path.exists(models_folder):
            os.mkdir(models_folder)
        # Saving Model after each Epoch
        torch.save(model.state_dict(), os.path.join(models_folder, f"gpt2_joke_generator{epoch}.pt"))


# BEGINNING TRAINING
run()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Humour.ai

I have seen a lot of people do cool projects in Computer Vision (the more hyped field), but I have rarely seen something comparable in NLP. After learning about transformers, I thought I should build something in NLP myself, so I fine-tuned GPT-2 with a language-model head on short jokes scraped from Reddit.

**Humour.ai tries to complete a sentence in a humorous way, given some input words.**

I tested the model on unseen words and sentences and got some really cool and surprising results.

The first one is really hilarious, given that the model doesn't know my name 😂😂. A language model that can make you laugh!

![Image description](https://github.com/tanulsingh/Humour.ai/blob/master/Demo/sa.PNG)

![Image description](https://github.com/tanulsingh/Humour.ai/blob/master/Demo/Tanul.PNG)

![Image description](https://github.com/tanulsingh/Humour.ai/blob/master/Demo/saving.PNG)


# Data

The first challenge for any machine learning project is getting data that fits the task. Fortunately, I didn't have to do much here: I found this awesome dataset on [Kaggle](https://www.kaggle.com/abhinavmoudgil95/short-jokes). It consists of short jokes scraped from Reddit, laid out in a tidy DataFrame.

# Pre-Processing

GPT-2 is a transformer-type architecture that uses only the decoder part of the Transformer. It is well known for its language-modelling ability, which is why I used it to create Humour.ai.

**There are two ways the data can be presented to the model, depending on the objective you want to achieve:**

* Joke Generation
* Humorous Sentence Completion

Let's look at these two separately.

### Joke Generation

In this task the model simply tries to generate jokes, given the length and the number of jokes you want.
Here we prepend `JOKE:` to every joke in the DataFrame and append `<|endoftext|>` at the end of each joke, which tells the model that the joke has ended.
At inference time, we simply provide the number of jokes and the length of each joke, and the model prints out jokes based on what it has learned.

### Humorous Sentence Completion

This is something new: a simple tweak to the task above. Here the model tries to complete a sentence in a humorous way, given any input word or words it has never seen before.

For this task I kept only the question-answer style jokes in the dataset (the ones starting with Why, When, How, etc.) and processed the data into this format:

`<|soq|> question <|sep|> answer <|endoftext|>`

It looks like the input to a question-answering system, except that the whole string is treated as a single sequence instead of assigning different `token_type_ids` to the question and the answer.
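
To make this concrete, here is a minimal sketch of the formatting step on a couple of made-up jokes (the full preprocessing lives in `process_jokes` inside `Humorous Sentence Completion/training.py`, which also filters and de-duplicates the data):

```python
import pandas as pd

# Two illustrative question-answer jokes (the real data comes from shortjokes.csv)
df = pd.DataFrame({"Joke": [
    "Why did the chicken cross the road? To get to the other side.",
    "How do you make holy water? You boil the hell out of it.",
]})

# Split each joke into question and answer, then wrap it in the special tokens
qa = df["Joke"].str.split("?", n=1, expand=True)
formatted = "<|soq|> " + qa[0] + " <|sep|>" + qa[1] + " <|endoftext|>"

print(formatted.iloc[0])
# <|soq|> Why did the chicken cross the road <|sep|> To get to the other side. <|endoftext|>
```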

# Model

I have used the HuggingFace library for the GPT-2 model, and the whole codebase is written in PyTorch. I would be more than happy to share it here if someone takes this model and writes its equivalent in Keras/TF (that would be a good exercise). The modelling and inference code is easy to understand and largely self-explanatory if one reads the HuggingFace docs.

# HyperParameters

I have tested two batch sizes and two learning rates; the latter works better. Training the model for the second task (Humorous Sentence Completion) takes about 5 hours on GPUs; detailed timings for every configuration are in the table below.

| Task | Batch Size | Max Len | Epochs | Learning Rate | Train Time on GPUs | Train Time on TPUs |
|----------|-------------|-------------|-------------|-------------|----------|-----------|
| Humorous Sentence Completion | 32 | 64 | 4 | 3e-5 | 4.5 hours | 2.5 hours |
| Humorous Sentence Completion | 16 | 64 | 4 | 2e-5 | 5.5 hours | 3 hours |
| Joke Generation | 32 | 64 | 4 | 3e-5 | 6.5 hours | 2.5 hours |
| Joke Generation | 16 | 64 | 4 | 2e-5 | 7.5 hours | 3 hours |


# End Notes

* Feel free to fork, experiment, and play with the model. I have uploaded the code for the different tasks in different folders.
* **I will also be uploading trained weights so that anyone can load them and play with the model by just running the inference file.**
* I will be uploading the code for training on TPUs soon.

--------------------------------------------------------------------------------