├── README.md
└── code
    ├── main_with_args.py
    ├── utils.py
    ├── generate_data.py
    ├── event_chain.py
    ├── evaluate.py
    └── gnn_with_args.py

/README.md:
--------------------------------------------------------------------------------
# ConstructingNEEG_IJCAI_2018

## Paper Data and Code
The data and code for our IJCAI-ECAI 2018 paper: [Constructing Narrative Event Evolutionary Graph for Script Event Prediction](https://arxiv.org/abs/1805.05081).

The data used in our paper can be found [here](https://drive.google.com/open?id=1WFBDL_zfNC1sSuz0dmaMux3w-OB_hUui). The code here includes PyTorch implementations of the PairLSTM baseline and of our SGNN model. Code for the EventComp model, and for extracting narrative event chains from the raw NYT news corpus, can be found [here](http://mark.granroth-wilding.co.uk/papers/what_happens_next/).

## How to run the code?

First download the data from [Google Drive](https://drive.google.com/open?id=1WFBDL_zfNC1sSuz0dmaMux3w-OB_hUui). You also need Python 3.5 or 3.6, PyTorch 0.3.0, and an Nvidia GPU (e.g. a Titan Xp or Tesla P100). Run `python3 evaluate.py` to reproduce the results reported in the paper, `python3 event_chain.py` to train a PairLSTM model, and `python3 main_with_args.py` to train an SGNN model. I have written annotations in the code; please read them before running.

Extracting the preprocessed data from the raw NYT/Gigaword corpus is a very time-consuming pipeline. The good news is that you do not need to download the Gigaword corpus yourself: all the data needed to run the code is provided.

**Original Data**:
`encoding_with_args.csv` and `data2.csv` are the constructed NEEG described in the paper. `corpus_index_train0.txt`, `corpus_index_dev.txt` and `corpus_index_test.txt` are the original training, development and test sets used to train the SGNN model. Use the `pickle` module to load them.


## Requirements
* Linux OS
* Python 3.5 or 3.6
* PyTorch 0.3.0
* GPU (Tesla P100 or others)

--------------------------------------------------------------------------------
/code/main_with_args.py:
--------------------------------------------------------------------------------
#coding:utf8
# Run this code to train our SGNN model.
# Generally a model can be trained in about 1400 seconds on one Tesla P100 GPU
# (training terminates automatically via early stopping).
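# Usage sketch (follows the sys.argv unpacking in main() below): hyperparameters either fall
# back to the defaults set in main(), or can be passed as eight positional command-line
# arguments when len(sys.argv)==9, in this order:
#   python3 main_with_args.py <L2_penalty> <MARGIN> <LR> <T> <BATCH_SIZE> <EPOCHES> <PATIENTS> <METRIC>
# e.g. the following reproduces the default configuration:
#   python3 main_with_args.py 0.00001 0.015 0.0001 2 1000 520 500 euclid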
from gnn_with_args import *

def main():
    dev_data=Data_data(pickle.load(open('../data/corpus_index_dev_with_args_all_chain.data','rb')))
    test_data=Data_data(pickle.load(open('../data/corpus_index_test_with_args_all_chain.data','rb')))
    train_data=Data_data(pickle.load(open('../data/corpus_index_train0_with_args_all_chain.data','rb')))
    ans=pickle.load(open('../data/dev.answer','rb'))
    dev_index=pickle.load(open('../data/dev_index.pickle','rb'))
    print('train data prepare done')
    word_id,id_vec,word_vec=get_hash_for_word('../data/deepwalk_128_unweighted_with_args.txt',verb_net3_mapping_with_args)
    print('word vector prepare done')

    HIDDEN_DIM=128*4  # set unconditionally so the command-line branch below also has a value
    if len(sys.argv)==9:
        L2_penalty,MARGIN,LR,T,BATCH_SIZE,EPOCHES,PATIENTS,METRIC=sys.argv[1:]
    else:
        L2_penalty=0.00001
        LR=0.0001
        T=2
        MARGIN=0.015
        BATCH_SIZE=1000
        EPOCHES=520
        PATIENTS=500
        METRIC='euclid'

    # Per-metric margins (and the tuned settings for the euclidean metric).
    if METRIC=='euclid':
        L2_penalty=0.00001
        LR=0.0001
        BATCH_SIZE=1000
        MARGIN=0.015
        PATIENTS=500
    if METRIC=='dot':
        # LR=0.004
        MARGIN=0.5
    if METRIC=='cosine':
        # LR=0.001
        MARGIN=0.05
    if METRIC=='norm_euclid':
        # LR=0.0011
        MARGIN=0.07
    if METRIC=='manhattan':
        # LR=0.0015
        MARGIN=4.5
    if METRIC=='multi':
        # LR=0.001
        MARGIN=0.015
    if METRIC=='nonlinear':
        # LR=0.001
        MARGIN=0.015

    start=time.time()
    best_acc,best_epoch=train(dev_index,word_vec,ans,train_data,dev_data,test_data,float(L2_penalty),float(MARGIN),float(LR),int(T),int(BATCH_SIZE),int(EPOCHES),int(PATIENTS),int(HIDDEN_DIM),METRIC)
    end=time.time()
    print("Run time: %f s" % (end-start))
    with open('best_result.txt','a') as f:
        f.write('Best Acc: %f, Epoch %d , L2_penalty=%s ,MARGIN=%s ,LR=%s ,T=%s ,BATCH_SIZE=%s ,EPOCHES=%s ,PATIENTS=%s, HIDDEN_DIM=%s, METRIC=%s\n' % (best_acc,best_epoch,L2_penalty,MARGIN,LR,T,BATCH_SIZE,EPOCHES,PATIENTS,HIDDEN_DIM,METRIC))


if __name__ == '__main__':
    main()

# Event representation: multi-dimensional distributed representations of event chains, with frequency and co-occurrence count information added
# Graph construction: count bigrams, filter low-frequency ones, remove self-loops, handle high-frequency events, build the graph, compute probabilities
# Context Extension By Ranking
# Highway Networks
# SRU
# Attention
# Subgraph Embedding
# Adam

--------------------------------------------------------------------------------
/code/utils.py:
--------------------------------------------------------------------------------
#coding:utf8
# This file defines some helper functions.
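# A quick map of the helpers defined below:
#   trans_to_cuda(variable)      - move a tensor/Variable to the GPU when CUDA is available
#   id_to_vec(emb_file)          - parse a text embedding file into an {id: vector} dict
#                                  (the two-token header line is skipped, id '0' is zeroed out)
#   word_to_id(voc_file)         - map each word to its id, read from the vocabulary file
#   get_word_vec(id_vec)         - stack the id->vector dict into an embedding matrix ordered by id
#   get_hash_for_word(emb, voc)  - convenience wrapper returning (word_id, id_vec, word_vec)
#   Data_data                    - batching wrapper over (A, input_data, targets) tensors, optionally
#                                  concatenating two datasets, with a next_batch(batch_size) method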

import torch
import pickle
import torch.nn as nn
from sklearn import preprocessing
from torch.nn import Parameter,Module
from torch.autograd import Variable
import torch.autograd as autograd
import torch.nn.functional as F
import torch.optim as optim
import pprint,copy,os,random,math,sys,pickle,time
import numpy as np
import networkx as nx
torch.manual_seed(1)
use_cuda = torch.cuda.is_available()
# from whim_common.utils.progress import get_progress_bar
# use_cuda = False
from multiprocessing import Process,Pool

verb_net3_mapping_with_args='../data/encoding_with_args.csv'


def trans_to_cuda(variable):
    if use_cuda:
        return variable.cuda()
    else:
        return variable

def id_to_vec(emb_file):
    dic={}
    for s in open(emb_file):
        s=s.strip().split()
        if len(s)==2:
            continue
        dic[s[0]]=np.array(s[1:],dtype=np.float32)
    dic['0']=np.zeros(len(dic['0']),dtype=np.float32)
    return dic

def word_to_id(voc_file):
    dic={}
    for s in open(voc_file):
        s=s.strip().split()
        dic[s[1]]=s[0]
    return dic

def get_word_vec(id_vec):
    word_vec=[]
    for i in range(len(id_vec)):
        word_vec.append(id_vec[str(i)])
    return np.array(word_vec,dtype=np.float32)

def get_hash_for_word(emb_file,voc_file):
    id_vec=id_to_vec(emb_file)
    return word_to_id(voc_file),id_vec,get_word_vec(id_vec)

class Data_data(object):
    def __init__(self, questions,questions2=None):
        super(Data_data, self).__init__()
        if questions2 is None:
            self.A,self.input_data,self.targets= questions[0],questions[1],questions[2]
        else:
            self.A = torch.cat((questions[0],questions2[0]))
            self.input_data = torch.cat((questions[1],questions2[1]))
            self.targets = torch.cat((questions[2],questions2[2]))
        self.corpus_length=len(self.targets)
        self.start=0
    def next_batch(self,batch_size):
        start=self.start
        end=(self.start+batch_size) if (self.start+batch_size)<=self.corpus_length else self.corpus_length
        self.start=(self.start+batch_size)
        if self.start