├── README.md
└── convertdata.py

/README.md:
--------------------------------------------------------------------------------
# TextSummarization
Generates the dataset for Google's text summarization code (TextSum) by Xin Pan and Peter Liu.

Repository link: https://github.com/tensorflow/models/tree/master/research/textsum

The dataset (CNN and DailyMail stories) can be obtained here: http://cs.nyu.edu/~kcho/DMQA/


How it works:

TextSum expects each record to carry an `article` and an `abstract` key for training and decoding.

Both articles and abstracts are tagged with sentence, paragraph and document start/end tags.

The abstract is extracted from the @highlight entries in the data.

A vocabulary of 200000 words, including the UNK and PAD tokens, is generated.


Usage:

The CNN and DailyMail stories should be present in ./cnn/stories and ./dailymail/stories (relative to the working directory).

Run `mkdir data` in the working directory; the .bin files and the vocabulary are written there.

You can generate both datasets or just one of them using the following arguments:

`python convertdata.py --both` (or `--CNN`, or `--DM`)
--------------------------------------------------------------------------------
/convertdata.py:
--------------------------------------------------------------------------------
'''CNN and DM data should be present in ./cnn/stories and ./dailymail/stories'''
import os
import re
import sys
import struct
import numpy as np
import collections
from nltk.tokenize import sent_tokenize
from tensorflow.core.example import example_pb2

counter = collections.Counter()
temp=0
# running totals so the final summary also works for --CNN or --DM alone
ntrain=0
nval=0
ntest=0
# intended train/validation split ratios (note: the slicing below uses hardcoded 0.8 / 0.12)
tr_r=0.85
v_r=0.08
if len(sys.argv)==1 or len(sys.argv)>2:
    print "Incorrect Usage"
    print "Usage: python convertdata.py --CNN or --DM or --both"
    exit()

if sys.argv[1]=="--CNN" or sys.argv[1]=="--both":
    print 'Generating CNN data....'
    print
    files=os.listdir("cnn/stories/")
    n_files=len(files)

    print "Total Files:", n_files
    print
    train=files[:int(n_files*0.8)]
    validation=files[len(train):len(train)+int(n_files*0.12)]
    test=files[len(train)+len(validation):]

    def convert_text2bin1(docs, writer):
        global counter
        for i, fi in enumerate(docs):
            with open("cnn/stories/"+fi,'r') as f:
                # lower-case, strip non-ASCII, and pad punctuation with spaces
                wholetext=f.read().decode('utf8').lower()
                wholetext=re.sub(r'[^\x00-\x7F]+','', wholetext)
                wholetext=re.sub(r"(\s?[\']\s+|\s+[\']\s?)"," ' ", wholetext)
                wholetext=re.sub(r'(\s?[\"]\s+|\s+[\"]\s?)',' " ', wholetext)
                wholetext=re.sub(r"(\'[s]\s+)"," 's ", wholetext)
                wholetext=wholetext.replace("."," . ")
                wholetext=wholetext.replace(","," , ")
                wholetext=wholetext.replace('-',' - ')
                wholetext=wholetext.replace('?',' ? ')
                wholetext=wholetext.replace('(','( ')
                wholetext=wholetext.replace(')',' )')
                # the article text comes before the first @highlight marker;
                # the text after it is used as the abstract
                data=wholetext.split("@highlight")
                news=data[0]
                highlights=data[1].replace('\n\n','')
                news=(" ".join(news.split('\n\n'))).strip()
                sentences = sent_tokenize(news)
                # wrap sentences, paragraph and document in the <s>/<p>/<d> tags expected by TextSum
                news = '<d> <p> ' + ' '.join(['<s> ' + sentence + ' </s>' for sentence in sentences]) + ' </p> </d>'
                highlights = '<d> <p> <s> ' + highlights + ' </s> </p> </d>'
                words = (news+" "+highlights).split()
                counter.update(words)
                # one length-prefixed serialized tf.Example per story
                tf_example = example_pb2.Example()
                tf_example.features.feature['article'].bytes_list.value.extend([(' '.join(news.split())).encode('utf-8')])
                tf_example.features.feature['abstract'].bytes_list.value.extend([(' '.join(highlights.split())).encode('utf-8')])
                tf_example_str = tf_example.SerializeToString()
                str_len = len(tf_example_str)
                writer.write(struct.pack('q', str_len))
                writer.write(struct.pack('%ds' % str_len, tf_example_str))
                if i%3000==0:
                    print int((float(i)/ len(docs))*100), "%"
        print (float(len(docs))/ len(docs))*100, "%...." "converted\n\n"

    print "Generating Training Data\n"
    with open('data/trainCNN.bin', 'wb') as writer:
        convert_text2bin1(train,writer)
    print "Generating Validation Data\n"
    with open('data/validationCNN.bin', 'wb') as writer:
        convert_text2bin1(validation,writer)
    print "Generating Testing Data\n"
    with open('data/testCNN.bin', 'wb') as writer:
        convert_text2bin1(test,writer)

    ntrain=len(train)
    nval=len(validation)
    ntest=len(test)
    print "CNN Data Generated"
    print "Train:\t\t",len(train)
    print "Validation:\t",len(validation)
    print "Test:\t\t",len(test)
    print
    print
    temp=n_files

if sys.argv[1]=="--DM" or sys.argv[1]=="--both":
    print "Generating DailyMail data...."
    print
    files=os.listdir("dailymail/stories/")
    n_files=len(files)

    print "Total Files:", n_files
    print
    train=files[:int(n_files*0.8)]
    validation=files[len(train):len(train)+int(n_files*0.12)]
    test=files[len(train)+len(validation):]

    def convert_text2bin2(docs, writer):
        global counter
        for i, fi in enumerate(docs):
            with open("dailymail/stories/"+fi,'r') as f:
                wholetext=f.read().decode('utf8').lower()
                wholetext=re.sub(r'[^\x00-\x7F]+','', wholetext)
                wholetext=re.sub(r"(\s?[\']\s+|\s+[\']\s?)"," ' ", wholetext)
                wholetext=re.sub(r'(\s?[\"]\s+|\s+[\"]\s?)',' " ', wholetext)
                wholetext=re.sub(r"(\'[s]\s+)"," 's ", wholetext)
                wholetext=wholetext.replace("."," . ")
                wholetext=wholetext.replace(","," , ")
                wholetext=wholetext.replace('-',' - ')
                wholetext=wholetext.replace('?',' ? ')
                wholetext=wholetext.replace('(','( ')
                wholetext=wholetext.replace(')',' )')
                data=wholetext.split("@highlight")
                news=data[0]
                try:
                    # DailyMail stories carry header metadata; drop everything up to the
                    # "updated:" date, skipping past the four-digit year (e.g. "20xx")
                    news=news.split("updated:")[1]
                    news=news[news.find('20')+4:]
                except:
                    pass
                news=(" ".join(news.split('\n'))).strip()
                highlights=data[1].replace('\n\n','')
                news=(" ".join(news.split('\n\n'))).strip()
                sentences = sent_tokenize(news)
                news = '<d> <p> ' + ' '.join(['<s> ' + sentence + ' </s>' for sentence in sentences]) + ' </p> </d>'
                highlights = '<d> <p> <s> ' + highlights + ' </s> </p> </d>'
                words = (news+" "+highlights).split()
                counter.update(words)
                tf_example = example_pb2.Example()
                tf_example.features.feature['article'].bytes_list.value.extend([(' '.join(news.split())).encode('utf-8')])
                tf_example.features.feature['abstract'].bytes_list.value.extend([(' '.join(highlights.split())).encode('utf-8')])
                tf_example_str = tf_example.SerializeToString()
                str_len = len(tf_example_str)
                writer.write(struct.pack('q', str_len))
                writer.write(struct.pack('%ds' % str_len, tf_example_str))
                if i%3000==0:
                    print int((float(i)/ len(docs))*100), "%"
        print (float(len(docs))/ len(docs))*100, "%...." "converted\n\n"

    print "Generating Training Data\n"
    with open('data/trainDM.bin', 'wb') as writer:
        convert_text2bin2(train,writer)
    print "Generating Validation Data\n"
    with open('data/validationDM.bin', 'wb') as writer:
        convert_text2bin2(validation,writer)
    print "Generating Testing Data\n"
    with open('data/testDM.bin', 'wb') as writer:
        convert_text2bin2(test,writer)

    print "DailyMail Data Generated"
    print "Train:\t\t",len(train)
    print "Validation:\t",len(validation)
    print "Test:\t\t",len(test)
    print


print "Generating Vocabulary"

# 200000-word vocabulary (including <UNK> and <PAD>) built from both articles and abstracts
mc=counter.most_common(200000-2)
with open("data/vocab", 'w') as writer:
    for word, count in mc:
        writer.write(word + ' ' + str(count) + '\n')
    writer.write('<UNK> 0\n')
    writer.write('<PAD> 0\n')


print "\n\nData Generation Finished...\n\n"
if sys.argv[1]=="--CNN":
    print "CNN Generated"
    # only CNN was processed, so do not double-count it in the totals below
    temp=0
    ntrain=nval=ntest=0
elif sys.argv[1]=="--DM":
    print "DM Generated"
else:
    print "CNN+DailyMail Data Generated"

print "Total Records",temp+n_files
print "Total Train",ntrain+len(train)
print "Total Validation",nval+len(validation)
print "Total Test",ntest+len(test)
print "Vocab Generated with total no. of words:",len(mc)+2
--------------------------------------------------------------------------------
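The generated .bin files are plain sequences of length-prefixed serialized tf.Example records, so they can be spot-checked without running TextSum. Below is a minimal reader sketch for that framing; it assumes a `data/trainCNN.bin` produced by the script above and the same tensorflow/protobuf installation, and is meant only for verification, not as part of the pipeline.

```python
'''Sketch: read back the first record written by convertdata.py (assumes data/trainCNN.bin exists).'''
import struct
from tensorflow.core.example import example_pb2

with open('data/trainCNN.bin', 'rb') as reader:
    # each record is an 8-byte length (struct 'q', matching the writer above)
    # followed by that many bytes of serialized tf.Example
    len_bytes = reader.read(8)
    if len_bytes:
        str_len = struct.unpack('q', len_bytes)[0]
        example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
        tf_example = example_pb2.Example.FromString(example_str)
        # print the start of the article and the full abstract
        print(tf_example.features.feature['article'].bytes_list.value[0][:200])
        print(tf_example.features.feature['abstract'].bytes_list.value[0])
```

If the record decodes cleanly and the article text shows the `<s>`/`<p>`/`<d>` tags, the file is in the format the TextSum data reader expects.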