├── README.md
└── convertdata.py

/README.md:
--------------------------------------------------------------------------------
# TextSummarization
Generates the dataset for Google's text summarization code (TextSum) by Xin Pan and Peter Liu.

Repository link: https://github.com/tensorflow/models/tree/master/research/textsum

The dataset (CNN and DailyMail stories) can be obtained here: http://cs.nyu.edu/~kcho/DMQA/


How it works:

TextSum expects each record to carry an `article` and an `abstract` key for training and decoding.

Both articles and abstracts are tagged with sentence, paragraph and document start/end tags.

The abstract is extracted from the @highlight entries in the data.

A vocabulary of 200000 words, including the UNK and PAD tokens, is generated.


Usage:

The CNN and DailyMail stories should be present in ./cnn/stories and ./dailymail/stories (relative to the working directory).

Run `mkdir data` in the working directory; the .bin files and the vocabulary are written there.

You can generate both datasets or just one of them using the following arguments:

`python convertdata.py --both` (or `--CNN`, or `--DM`)
--------------------------------------------------------------------------------
/convertdata.py:
--------------------------------------------------------------------------------
'''CNN and DM data should be present in ./cnn/stories and ./dailymail/stories'''
import os
import re
import sys
import struct
import numpy as np
import collections
from nltk.tokenize import sent_tokenize
from tensorflow.core.example import example_pb2

counter = collections.Counter()
temp=0
# running totals so the final summary also works for --CNN or --DM alone
ntrain=0
nval=0
ntest=0
# intended train/validation split ratios (note: the slicing below uses hardcoded 0.8 / 0.12)
tr_r=0.85
v_r=0.08
if len(sys.argv)==1 or len(sys.argv)>2:
    print "Incorrect Usage"
    print "Usage: python convertdata.py --CNN or --DM or --both"
    exit()

if sys.argv[1]=="--CNN" or sys.argv[1]=="--both":
    print 'Generating CNN data....'
    print
    files=os.listdir("cnn/stories/")
    n_files=len(files)

    print "Total Files:", n_files
    print
    train=files[:int(n_files*0.8)]
    validation=files[len(train):len(train)+int(n_files*0.12)]
    test=files[len(train)+len(validation):]

    def convert_text2bin1(docs, writer):
        global counter
        for i, fi in enumerate(docs):
            with open("cnn/stories/"+fi,'r') as f:
                # lower-case, strip non-ASCII, and pad punctuation with spaces
                wholetext=f.read().decode('utf8').lower()
                wholetext=re.sub(r'[^\x00-\x7F]+','', wholetext)
                wholetext=re.sub(r"(\s?[\']\s+|\s+[\']\s?)"," ' ", wholetext)
                wholetext=re.sub(r'(\s?[\"]\s+|\s+[\"]\s?)',' " ', wholetext)
                wholetext=re.sub(r"(\'[s]\s+)"," 's ", wholetext)
                wholetext=wholetext.replace("."," . ")
                wholetext=wholetext.replace(","," , ")
                wholetext=wholetext.replace('-',' - ')
                wholetext=wholetext.replace('?',' ? ')
                wholetext=wholetext.replace('(','( ')
                wholetext=wholetext.replace(')',' )')
                # the article text comes before the first @highlight marker;
                # the text after it is used as the abstract
                data=wholetext.split("@highlight")
                news=data[0]
                highlights=data[1].replace('\n\n','')
                news=(" ".join(news.split('\n\n'))).strip()
                sentences = sent_tokenize(news)
                # wrap sentences, paragraph and document in the <s>/<p>/<d> tags expected by TextSum
                news = '<d> <p> ' + ' '.join(['<s> ' + sentence + ' </s>' for sentence in sentences]) + ' </p> </d>'
                highlights = '<d> <p> <s> ' + highlights + ' </s> </p> </d>'
                words = (news+" "+highlights).split()
                counter.update(words)
                # one length-prefixed serialized tf.Example per story
                tf_example = example_pb2.Example()
                tf_example.features.feature['article'].bytes_list.value.extend([(' '.join(news.split())).encode('utf-8')])
                tf_example.features.feature['abstract'].bytes_list.value.extend([(' '.join(highlights.split())).encode('utf-8')])
                tf_example_str = tf_example.SerializeToString()
                str_len = len(tf_example_str)
                writer.write(struct.pack('q', str_len))
                writer.write(struct.pack('%ds' % str_len, tf_example_str))
                if i%3000==0:
                    print int((float(i)/ len(docs))*100), "%"
        print (float(len(docs))/ len(docs))*100, "%...." "converted\n\n"

    print "Generating Training Data\n"
    with open('data/trainCNN.bin', 'wb') as writer:
        convert_text2bin1(train,writer)
    print "Generating Validation Data\n"
    with open('data/validationCNN.bin', 'wb') as writer:
        convert_text2bin1(validation,writer)
    print "Generating Testing Data\n"
    with open('data/testCNN.bin', 'wb') as writer:
        convert_text2bin1(test,writer)

    ntrain=len(train)
    nval=len(validation)
    ntest=len(test)
    print "CNN Data Generated"
    print "Train:\t\t",len(train)
    print "Validation:\t",len(validation)
    print "Test:\t\t",len(test)
    print
    print
    temp=n_files

if sys.argv[1]=="--DM" or sys.argv[1]=="--both":
    print "Generating DailyMail data...."
    print
    files=os.listdir("dailymail/stories/")
    n_files=len(files)

    print "Total Files:", n_files
    print
    train=files[:int(n_files*0.8)]
    validation=files[len(train):len(train)+int(n_files*0.12)]
    test=files[len(train)+len(validation):]

    def convert_text2bin2(docs, writer):
        global counter
        for i, fi in enumerate(docs):
            with open("dailymail/stories/"+fi,'r') as f:
                wholetext=f.read().decode('utf8').lower()
                wholetext=re.sub(r'[^\x00-\x7F]+','', wholetext)
                wholetext=re.sub(r"(\s?[\']\s+|\s+[\']\s?)"," ' ", wholetext)
                wholetext=re.sub(r'(\s?[\"]\s+|\s+[\"]\s?)',' " ', wholetext)
                wholetext=re.sub(r"(\'[s]\s+)"," 's ", wholetext)
                wholetext=wholetext.replace("."," . ")
                wholetext=wholetext.replace(","," , ")
                wholetext=wholetext.replace('-',' - ')
                wholetext=wholetext.replace('?',' ? ')
                wholetext=wholetext.replace('(','( ')
                wholetext=wholetext.replace(')',' )')
                data=wholetext.split("@highlight")
                news=data[0]
                try:
                    # DailyMail stories carry header metadata; drop everything up to the
                    # "updated:" date, skipping past the four-digit year (e.g. "20xx")
                    news=news.split("updated:")[1]
                    news=news[news.find('20')+4:]
                except:
                    pass
                news=(" ".join(news.split('\n'))).strip()
                highlights=data[1].replace('\n\n','')
                news=(" ".join(news.split('\n\n'))).strip()
                sentences = sent_tokenize(news)
                news = '<d> <p> ' + ' '.join(['<s> ' + sentence + ' </s>' for sentence in sentences]) + ' </p> </d>'
                highlights = '<d> <p> <s> ' + highlights + ' </s> </p> </d>'
                words = (news+" "+highlights).split()
                counter.update(words)
                tf_example = example_pb2.Example()
                tf_example.features.feature['article'].bytes_list.value.extend([(' '.join(news.split())).encode('utf-8')])
                tf_example.features.feature['abstract'].bytes_list.value.extend([(' '.join(highlights.split())).encode('utf-8')])
                tf_example_str = tf_example.SerializeToString()
                str_len = len(tf_example_str)
                writer.write(struct.pack('q', str_len))
                writer.write(struct.pack('%ds' % str_len, tf_example_str))
                if i%3000==0:
                    print int((float(i)/ len(docs))*100), "%"
        print (float(len(docs))/ len(docs))*100, "%...." "converted\n\n"

    print "Generating Training Data\n"
    with open('data/trainDM.bin', 'wb') as writer:
        convert_text2bin2(train,writer)
    print "Generating Validation Data\n"
    with open('data/validationDM.bin', 'wb') as writer:
        convert_text2bin2(validation,writer)
    print "Generating Testing Data\n"
    with open('data/testDM.bin', 'wb') as writer:
        convert_text2bin2(test,writer)

    print "DailyMail Data Generated"
    print "Train:\t\t",len(train)
    print "Validation:\t",len(validation)
    print "Test:\t\t",len(test)
    print


print "Generating Vocabulary"

# 200000-word vocabulary (including <UNK> and <PAD>) built from both articles and abstracts
mc=counter.most_common(200000-2)
with open("data/vocab", 'w') as writer:
    for word, count in mc:
        writer.write(word + ' ' + str(count) + '\n')
    writer.write('<UNK> 0\n')
    writer.write('<PAD> 0\n')


print "\n\nData Generation Finished...\n\n"
if sys.argv[1]=="--CNN":
    print "CNN Generated"
    # only CNN was processed, so do not double-count it in the totals below
    temp=0
    ntrain=nval=ntest=0
elif sys.argv[1]=="--DM":
    print "DM Generated"
else:
    print "CNN+DailyMail Data Generated"

print "Total Records",temp+n_files
print "Total Train",ntrain+len(train)
print "Total Validation",nval+len(validation)
print "Total Test",ntest+len(test)
print "Vocab Generated with total no. of words:",len(mc)+2
--------------------------------------------------------------------------------
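The generated .bin files are plain sequences of length-prefixed serialized tf.Example records, so they can be spot-checked without running TextSum. Below is a minimal reader sketch for that framing; it assumes a `data/trainCNN.bin` produced by the script above and the same tensorflow/protobuf installation, and is meant only for verification, not as part of the pipeline.

```python
'''Sketch: read back the first record written by convertdata.py (assumes data/trainCNN.bin exists).'''
import struct
from tensorflow.core.example import example_pb2

with open('data/trainCNN.bin', 'rb') as reader:
    # each record is an 8-byte length (struct 'q', matching the writer above)
    # followed by that many bytes of serialized tf.Example
    len_bytes = reader.read(8)
    if len_bytes:
        str_len = struct.unpack('q', len_bytes)[0]
        example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
        tf_example = example_pb2.Example.FromString(example_str)
        # print the start of the article and the full abstract
        print(tf_example.features.feature['article'].bytes_list.value[0][:200])
        print(tf_example.features.feature['abstract'].bytes_list.value[0])
```

If the record decodes cleanly and the article text shows the `<s>`/`<p>`/`<d>` tags, the file is in the format the TextSum data reader expects.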