├── README.md
├── my_dataloader.py
├── create_vocabulary
├── utils.py
├── Extract.py
├── Train.py
└── Model.py

/README.md:
--------------------------------------------------------------------------------
# Unsupervised-Deep-Keyphrase-Generation
Main code for "Unsupervised Deep Keyphrase Generation"

# Dataset
Put the training set of the KP20k dataset in the home directory.

## Extract.py

Extracts present and absent keyphrases from each document and generates the silver training data.

## Train.py

Trains the seq2seq generator model on the silver data.
--------------------------------------------------------------------------------
/my_dataloader.py:
--------------------------------------------------------------------------------
import torch
import torch.utils.data as data
import numpy as np
import nltk


class MyDataset(data.Dataset):

    def __init__(self, data_name='silver.npy', vocab_name='vocab_kp20k.npy', cls2=0):

        self.f = list(np.load(data_name, allow_pickle=True))
        self.vocab = np.load(vocab_name, allow_pickle=True).item()

    def __getitem__(self, index):

        x, trg = self.f[index]
        x = x.lower()
        x = nltk.tokenize.word_tokenize(x)

        for i in range(len(x)):
            x[i] = self.vocab(x[i])
        x.append(self.vocab('<end>'))
        x = [self.vocab('<start>')] + x

        # truncate/pad the source to a fixed length of 512
        if len(x) > 512:
            x = x[:512]
        while len(x) < 512:
            x.append(self.vocab('<pad>'))

        src = torch.Tensor(x)

        x = trg
        x = ','.join(x)
        x = nltk.tokenize.word_tokenize(x.lower())

        for i in range(len(x)):
            x[i] = self.vocab(x[i])
        x.append(self.vocab('<end>'))
        x = [self.vocab('<start>')] + x

        # truncate/pad the target to a fixed length of 30 so batches can be stacked
        if len(x) > 30:
            x = x[:30]
        while len(x) < 30:
            x.append(self.vocab('<pad>'))

        trg = torch.Tensor(x)

        return src, trg

    def __len__(self):
        return len(self.f)

--------------------------------------------------------------------------------
/create_vocabulary:
--------------------------------------------------------------------------------
import nltk
import numpy as np
from collections import Counter
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')
stoplist = stopwords.words('english')


class Vocabulary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __call__(self, word):
        if word not in self.word2idx:
            return self.word2idx['<unk>']
        return self.word2idx[word]

    def __len__(self):
        return len(self.word2idx)


def build_vocab(counter, threshold=3):

    # Ignore rare words and keep the 50,000 most frequent ones
    words = [[cnt, word] for word, cnt in counter.items() if cnt >= threshold]
    words.sort(reverse=True)
    words = [e[1] for e in words[:50000]]

    # Create a vocabulary and initialize it with the special tokens
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add all the remaining words
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab


if __name__ == '__main__':

    # Load the GloVe vectors (not used further in this script)
    emb = dict()
    with open('glove.6B.200d.txt', 'r', encoding='utf-8') as f:
        for e in f:
            line = e.split(' ')
            emb[line[0]] = line[1:]

    corpus = list(np.load('document.npy', allow_pickle=True))

    counter = Counter()
    for sent in corpus:
        #sent = sent[1]
        tokens = sent.split()
        #tokens = nltk.tokenize.word_tokenize(sent.lower())
        counter.update(tokens)

    vocab = build_vocab(counter)

    np.save('vocab_kp20k.npy', vocab)

--------------------------------------------------------------------------------
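The dataset and vocabulary above are consumed as a standard PyTorch dataset. A minimal usage sketch, assuming Extract.py and create_vocabulary have already produced silver.npy and vocab_kp20k.npy; the batch size here is illustrative:

# Usage sketch only; assumes silver.npy and vocab_kp20k.npy already exist.
from torch.utils.data import DataLoader
from create_vocabulary import *     # Vocabulary must be importable to unpickle the saved vocab
from my_dataloader import MyDataset

train_data = MyDataset(data_name='silver.npy', vocab_name='vocab_kp20k.npy')
train_loader = DataLoader(train_data, batch_size=4, shuffle=True)

src, trg = next(iter(train_loader))
print(src.shape, trg.shape)   # torch.Size([4, 512]) source ids, torch.Size([4, 30]) target ids

--------------------------------------------------------------------------------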
/utils.py:
--------------------------------------------------------------------------------
import nltk
from nltk.stem.porter import PorterStemmer

porter_stemmer = PorterStemmer()


def extract_candidates(text):

    GRAMMAR_EN = """  NP:
        {<NN.*|JJ>*<NN.*>}"""  # Adjective(s)(optional) + Noun(s)
    keyphrase_candidate = set()

    np_parser = nltk.RegexpParser(GRAMMAR_EN)  # Noun phrase parser

    tag = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))

    trees = np_parser.parse_sents(tag)  # Generator with one tree per sentence

    for tree in trees:
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):  # For each noun phrase
            # Concatenate the tokens with a space
            keyphrase_candidate.add(' '.join(word for word, tag in subtree.leaves()))

    # Keep candidates of at most four words
    keyphrase_candidate = {kp for kp in keyphrase_candidate if len(kp.split()) <= 4}

    return keyphrase_candidate


def get_ngram(text):

    can_list = []
    can_set = set()

    for e in text:

        tmp = extract_candidates(e.lower())
        t = set()
        for q in tmp:
            if '_of' in q:
                if q[-3:] == '_of':
                    continue
                q = q.replace('_of', ' of')
            t.add(q)
        can_list.append(t)

        # Collect the individual words of the candidates as well as the candidates themselves
        for q in can_list[-1]:
            for m in q.split():
                can_set.add(m)
        can_set = can_set | can_list[-1]

    return can_list, can_set


def reduce(rank, trash=()):
    # Keep the phrases in ranked order, dropping anything in `trash` and any
    # phrase that is a sub- or super-string of an already kept phrase
    new_rank = []
    for q in rank:
        if q[1] not in trash:
            new_rank.append(q[1])
    rank = new_rank
    ret = []
    for q in rank:
        flg = 0
        for p in ret:
            if (p in q) or (q in p):
                flg = 1
                break
        if flg:
            continue
        ret.append(q)
    return ret


def get_fscore(pred, label):

    # Stem the predictions and drop duplicates
    new_pred = []
    for e in pred:
        e = e.replace('-', ' ')
        c = ''
        e = e.split(' ')
        if len(e) > 4:
            continue
        for q in e:
            c = c + ' ' + porter_stemmer.stem(q)
        if c.strip() not in new_pred:
            new_pred.append(c.strip())

    # Drop predictions that contain an earlier prediction as a substring
    tmp = []
    for e in new_pred:
        flg = 0
        for w in tmp:
            if w in e:
                flg = 1
                break
        if flg == 0:
            tmp.append(e)
    new_pred = tmp

    new_pred = new_pred[:min(10, len(new_pred))]
    pred = new_pred

    precision = 0
    recall = 0
    for e in label:
        if e in pred:
            recall += 1
            precision += 1

    if precision == 0:
        return 0
    precision /= len(pred)
    recall /= len(label)

    return 2 * precision * recall / (precision + recall)

--------------------------------------------------------------------------------
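A toy illustration of the two main helpers above; the sample text and gold keyphrases are invented, and the nltk data downloaded in Extract.py below (punkt, averaged_perceptron_tagger) are assumed to be available:

# Toy example only; sample text and labels are invented.
from nltk.stem.porter import PorterStemmer
from utils import extract_candidates, get_fscore

text = "we propose an unsupervised method for deep keyphrase generation from scientific documents ."
print(extract_candidates(text))     # noun-phrase candidates of at most four words

stem = PorterStemmer().stem
pred = ['deep keyphrase generation', 'unsupervised method', 'scientific documents']
label = [' '.join(stem(w) for w in kp.split()) for kp in ['unsupervised method', 'keyphrase generation']]
print(get_fscore(pred, label))      # F1 over the stemmed, de-duplicated top-10 predictions

--------------------------------------------------------------------------------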
/Extract.py:
--------------------------------------------------------------------------------
import numpy as np
from utils import *
from nltk.stem.porter import PorterStemmer
import nltk
import math
from gensim.models.doc2vec import Doc2Vec


nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('punkt')
porter_stemmer = PorterStemmer()
stoplist = ['the', 'a', 'no', 'if', 'an', 'and', 'but', 'is', 'are', 'be', 'were', 'in', 'which', 'of', 'for', '.', '!', ',', '?', 'that', 'not', 'this']
model = Doc2Vec.load('doc2vec.bin')


def deal(p, t):
    # Heuristic candidate cleaning. Relies on module-level `pre_word` and `idf`
    # dictionaries that are not defined in this listing; it is only called from
    # the disabled block inside Extract() below.
    p = p.split()
    if p[0] in pre_word:
        p = p[1:]
    b = ' '.join(p)
    tag = nltk.pos_tag(p)
    if len(p) <= 2:
        return set([b])

    if p[0] in stoplist:
        p = p[1:]
        if len(p) == 1:
            return set([b])

    ret = [b]
    for e in p:
        if e not in idf:
            return set(ret)

    if tag[0][1] not in ['NN', 'NNS', 'NNP']:
        r0 = idf[p[0]] * t.count(p[0])
        r1 = idf[p[1]] * t.count(p[1])
        if r0 * 5 < r1:
            ret = []
            ret.append(' '.join(p[1:]))
            return set(ret)

    if idf[p[-1]] * t.count(p[-1]) * 5 < idf[p[-2]] * t.count(p[-2]):
        ret.append(' '.join(p[:-1]))
        return set(ret)

    return set(ret)


def Extract(input):

    can_list, can_set = get_ngram(input)
    idf = np.load('word_dic.npy', allow_pickle=True).item()

    new_bb = set()
    if False:  # disabled candidate cleaning with deal(); `all_can` and `a` are not defined in this listing
        for i in range(len(all_can)):
            t = all_can[i]
            tmp2 = set()
            for p in t:
                tmp2 = tmp2 | deal(p, a[i])

            all_can[i] = tmp2
            new_bb = new_bb | tmp2

    record = []
    idf_p = np.load('phrase_dic.npy', allow_pickle=True).item()

    for i, e in enumerate(input):

        pre = e.lower()

        pre_list = nltk.tokenize.word_tokenize(pre)
        stem_pre = [porter_stemmer.stem(q) for q in pre_list]
        stem_pre = ' '.join(stem_pre)

        doc_emb = model.infer_vector(pre_list)
        doc_emb = doc_emb / math.sqrt(sum([doc_emb[k] * doc_emb[k] for k in range(300)]))
        rank = []
        rank2 = []
        l = len(pre.split('.'))

        # Absent candidates: phrases whose words all occur in the document,
        # even though the phrase itself may not appear contiguously
        absent_can = set()
        for phrase in can_set:
            words = phrase.split()
            flg = 0
            for w in words:
                if w not in pre_list:
                    flg = 1
                    break
            if flg == 0:
                absent_can.add(phrase)

        for j, q in enumerate(list(can_list[i]) + list(absent_can)):

            if q not in idf_p:
                continue

            q_list = nltk.tokenize.word_tokenize(q)
            emb = model.infer_vector(q_list)
            emb = emb / math.sqrt(sum([emb[k] * emb[k] for k in range(300)]))

            emb = emb.reshape([1, 300])

            sim = float(np.dot(doc_emb.reshape([1, 300]), emb.reshape([300, 1])))
            if l > 10:
                sim = pre.count(q) * idf_p[q] * sim
            if j < len(can_list[i]):
                rank.append([sim, q])    # present candidates
            else:
                rank2.append([sim, q])   # absent candidates

        rank.sort(reverse=True)
        rank2.sort(reverse=True)

        rank = reduce(rank)
        rank2 = reduce(rank2)

        record.append([input[i], list(set(rank[:5] + rank2[:5]))])

    np.save('silver.npy', record)


if __name__ == '__main__':
    Extract(list(np.load('document.npy', allow_pickle=True)))

--------------------------------------------------------------------------------
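Extract.py loads doc2vec.bin, word_dic.npy and phrase_dic.npy, which are not produced by any script in this listing. The sketch below shows one plausible way to build them from document.npy: a 300-dimensional gensim Doc2Vec model (matching the normalisation over range(300) above) and idf dictionaries over words and candidate phrases. The hyperparameters and exact file contents are assumptions, not the authors' originals:

# Sketch of assumed helper files; hyperparameters are illustrative.
import math
import numpy as np
from collections import Counter
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from utils import extract_candidates

docs = list(np.load('document.npy', allow_pickle=True))

# Doc2Vec document encoder
tagged = [TaggedDocument(words=d.lower().split(), tags=[str(i)]) for i, d in enumerate(docs)]
d2v = Doc2Vec(tagged, vector_size=300, window=5, min_count=5, workers=4, epochs=20)
d2v.save('doc2vec.bin')

# idf over single words and over noun-phrase candidates
word_df, phrase_df = Counter(), Counter()
for d in docs:
    d = d.lower()
    word_df.update(set(d.split()))
    phrase_df.update(extract_candidates(d))

n = len(docs)
np.save('word_dic.npy', {w: math.log(n / df) for w, df in word_df.items()})
np.save('phrase_dic.npy', {p: math.log(n / df) for p, df in phrase_df.items()})

--------------------------------------------------------------------------------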
/Train.py:
--------------------------------------------------------------------------------
import torch
from torch.utils.data import DataLoader
from my_dataloader import *
from create_vocabulary import *
from Model import Encoder, Decoder, Seq2Seq
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


#encoder = Encoder(input_dim=2999, name='emb_inspec.npy')
#decoder = Decoder(output_dim=2999, name='emb_inspec.npy')
encoder = Encoder()
decoder = Decoder()
model = Seq2Seq(encoder, decoder, device).to(device)
#model.load_state_dict(torch.load('train.pt'))


def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean=0, std=0.01)


batch = 64

tot_epoch = 100

vocab = np.load('vocab_kp20k.npy', allow_pickle=True).item()
#vocab = np.load('vocab_inspec.npy', allow_pickle=True).item()
TRG_PAD_IDX = vocab('<pad>')
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
optimizer = optim.Adam(model.parameters(), lr=0.0005)
scheduler = StepLR(optimizer, step_size=6, gamma=0.8)
#train_data = MyDataset(data_name='inspec2.npy', vocab_name='vocab_inspec.npy')
train_data = MyDataset()
test_data = MyDataset(cls2=1)  # cls2 is currently unused in MyDataset

train_loader = DataLoader(train_data, batch_size=batch, num_workers=2, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch, num_workers=2, shuffle=False)
#val_loader = DataLoader(val_data, batch_size=batch, num_workers=8, shuffle=False)


def train(iterator):

    model.train()

    epoch_loss = 0
    for i, (src, trg) in enumerate(iterator):
        src = src.long().permute(1, 0).to(device)
        trg = trg.long().permute(1, 0).to(device)

        optimizer.zero_grad()

        output = model.forward(src, trg)

        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]

        output_dim = output.shape[-1]

        output = output[1:].view(-1, output_dim)
        trg = trg[1:].reshape(-1)

        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]

        loss = criterion(output, trg)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        torch.cuda.empty_cache()
    scheduler.step()
    return epoch_loss / len(iterator)


def evaluate(iterator):

    model.eval()

    epoch_loss = 0

    with torch.no_grad():

        for i, (src, trg) in enumerate(iterator):
            src = src.long().permute(1, 0).to(device)
            trg = trg.long().permute(1, 0).to(device)

            output = model.forward(src, trg)

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]

            output = output[1:].view(-1, output_dim)
            trg = trg[1:].reshape(-1)

            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)
            epoch_loss += loss.item()

    return epoch_loss / len(iterator)


best_valid_loss = float('inf')

for epoch in range(30):

    train_loss = train(train_loader)
    valid_loss = evaluate(test_loader)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        print('saved')
        torch.save(model.state_dict(), 'train.pt')

    print(epoch, ':')
    print(train_loss, valid_loss)

    print('****************************************\n')

--------------------------------------------------------------------------------
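Once train.pt has been saved, keyphrases can be generated greedily by calling the model with teacher_forcing_ratio=0, which unrolls a fixed six decoding steps (see Model.py below). A minimal inference sketch; the document string is invented and the file names follow the defaults above:

# Greedy decoding sketch; the input document is illustrative only.
import torch
import nltk
import numpy as np
from create_vocabulary import *            # Vocabulary needed to unpickle the saved vocab
from Model import Encoder, Decoder, Seq2Seq

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
vocab = np.load('vocab_kp20k.npy', allow_pickle=True).item()

model = Seq2Seq(Encoder(), Decoder(), device).to(device)
model.load_state_dict(torch.load('train.pt', map_location=device))
model.eval()

doc = "an illustrative abstract about unsupervised keyphrase generation ."
ids = [vocab('<start>')] + [vocab(w) for w in nltk.tokenize.word_tokenize(doc.lower())] + [vocab('<end>')]
ids = (ids + [vocab('<pad>')] * 512)[:512]
src = torch.tensor(ids).long().unsqueeze(1).to(device)        # [src len, 1]

trg = torch.tensor([[vocab('<start>')]]).long().to(device)    # only the <start> row is read
with torch.no_grad():
    outputs = model(src, trg, teacher_forcing_ratio=0)        # [6, 1, vocab size]

pred_ids = outputs[1:].argmax(-1).squeeze(1).tolist()
words = [vocab.idx2word[i] for i in pred_ids]
print(' '.join(w for w in words if w not in ('<pad>', '<end>')))

--------------------------------------------------------------------------------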
/Model.py:
--------------------------------------------------------------------------------
import torch.nn as nn
import torch.nn.functional as F
import torch
import numpy as np
import random


class Encoder(nn.Module):
    def __init__(self, input_dim=50004, emb_dim=200, hid_dim=256, dropout=0.5, name='emb_kp20k2.npy'):
        super().__init__()

        self.hid_dim = hid_dim

        self.embedding = nn.Embedding(input_dim, emb_dim)

        #emb = np.load(name)
        #self.embedding.weight.data.copy_(torch.from_numpy(emb))

        self.rnn = nn.LSTM(emb_dim, hid_dim, bidirectional=True)  # single layer, so no inter-layer dropout

        self.dropout = nn.Dropout(dropout)

    def forward(self, src):

        #src = [src len, batch size]

        embedded = self.dropout(self.embedding(src))

        #embedded = [src len, batch size, emb dim]

        outputs, hidden = self.rnn(embedded)  # hidden is an (h, c) tuple for the LSTM

        #outputs = [src len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim] (for both h and c)

        #outputs are always from the top hidden layer

        return hidden, outputs


class Decoder(nn.Module):
    def __init__(self, output_dim=50004, emb_dim=200, hid_dim=256, dropout=0.5, name='emb_kp20k2.npy'):
        super().__init__()

        self.hid_dim = hid_dim
        self.output_dim = output_dim

        self.embedding = nn.Embedding(output_dim, emb_dim)

        self.attention_layer = nn.Sequential(
            nn.Linear(self.hid_dim, self.hid_dim),
            nn.ReLU(inplace=True)
        )
        self.rnn = nn.LSTM(emb_dim, hid_dim)

        self.fc_out = nn.Linear(emb_dim + hid_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, context):

        #input = [batch size]
        #hidden = (h, c), each [1, batch size, hid dim]
        #context = [batch size, src len, hid dim] (encoder outputs, directions averaged)

        input = input.unsqueeze(0)

        #input = [1, batch size]

        embedded = self.dropout(self.embedding(input))

        #embedded = [1, batch size, emb dim]

        #emb_con = torch.cat((embedded, context), dim = 2)

        #emb_con = [1, batch size, emb dim + hid dim]

        output, hidden = self.rnn(embedded, hidden)

        #output = [seq len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]

        #seq len, n layers and n directions will always be 1 in the decoder, therefore:
        #output = [1, batch size, hid dim]
        #hidden = [1, batch size, hid dim]
        h, c = hidden
        # Attention over the encoder outputs, using the decoder hidden state as the query
        context = nn.Tanh()(context)
        h = self.attention_layer(h)
        w = torch.bmm(context, h.permute(1, 2, 0))
        #w = [batch size, src len, 1]
        w = w.squeeze(2)
        w = F.softmax(w, dim=-1)
        w = torch.bmm(w.unsqueeze(1), context)
        w = w.squeeze(1)
        #w = [batch size, hid dim]

        output = torch.cat((embedded.squeeze(0), w), dim=1)

        #output = [batch size, emb dim + hid dim]

        prediction = self.fc_out(output)

        #prediction = [batch size, output dim]

        return prediction, hidden


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):

        #src = [src len, batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is the probability of using teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use the ground-truth inputs 75% of the time

        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        if teacher_forcing_ratio == 0:
            # pure generation mode: unroll a fixed number of decoding steps
            trg_len = 6
        trg_vocab_size = self.decoder.output_dim

        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

        #the final hidden state of the encoder is the context
        context, aa = self.encoder(src)

        #context is also used as the initial hidden state of the decoder
        hidden = context

        # average the forward and backward states of the bidirectional encoder
        h, c = hidden
        h = 0.5 * (h[0] + h[1])
        c = 0.5 * (c[0] + c[1])
        hidden = (h.unsqueeze(0), c.unsqueeze(0))

        # average the two directions of the encoder outputs and reshape them
        # to [batch size, src len, hid dim] for the attention
        aa = 0.5 * (aa[:, :, :self.encoder.hid_dim] + aa[:, :, self.encoder.hid_dim:])
        aa = aa.permute(1, 0, 2)

        #first input to the decoder is the <start> tokens
        input = trg[0, :]

        for t in range(1, trg_len):

            #insert input token embedding, previous hidden state and the encoder outputs
            #receive output tensor (predictions) and new hidden state
            output, hidden = self.decoder(input, hidden, aa)

            #place predictions in a tensor holding predictions for each token
            outputs[t] = output

            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio

            #get the highest predicted token from our predictions
            top1 = output.argmax(1)

            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[t] if teacher_force else top1

        return outputs

--------------------------------------------------------------------------------
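A quick sanity check of the tensor shapes in the model above, with random token ids standing in for real data; the batch size and sequence lengths are illustrative:

# Shape check only; random ids stand in for real data.
import torch
from Model import Encoder, Decoder, Seq2Seq

device = torch.device("cpu")
model = Seq2Seq(Encoder(), Decoder(), device).to(device)

src = torch.randint(0, 50004, (512, 2))   # [src len, batch size]
trg = torch.randint(0, 50004, (30, 2))    # [trg len, batch size]

out = model(src, trg)                      # teacher forcing with probability 0.5
print(out.shape)                           # torch.Size([30, 2, 50004])

out = model(src, trg, teacher_forcing_ratio=0)
print(out.shape)                           # torch.Size([6, 2, 50004]) -- six decoding steps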