├── .gitignore
├── LICENSE
├── README.md
└── textsum_data_convert.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2016 surmenok

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# TextSum
Prepares a dataset for the TensorFlow text summarization (TextSum) model.

--------------------------------------------------------------------------------
/textsum_data_convert.py:
--------------------------------------------------------------------------------
"""Example of converting TextSum model data.
Usage:
python textsum_data_convert.py --command text_to_binary --in_directories dailymail/stories --out_files dailymail-train.bin,dailymail-validation.bin,dailymail-test.bin --split 0.8,0.15,0.05
python textsum_data_convert.py --command text_to_vocabulary --in_directories cnn/stories,dailymail/stories --out_files vocab
"""

import collections
import struct
import sys

from os import listdir
from os.path import isfile, join

from nltk.tokenize import sent_tokenize

import tensorflow as tf
from tensorflow.core.example import example_pb2

from numpy.random import seed as random_seed
from numpy.random import shuffle as random_shuffle

random_seed(123)  # Reproducibility

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('command', 'text_to_binary',
                           'Either text_to_vocabulary or text_to_binary. '
                           'Specify FLAGS.in_directories accordingly.')
tf.app.flags.DEFINE_string('in_directories', '', 'comma separated paths to input directories')
tf.app.flags.DEFINE_string('out_files', '', 'comma separated paths to output files')
tf.app.flags.DEFINE_string('split', '', 'comma separated fractions of data')

def _text_to_binary(input_directories, output_filenames, split_fractions):
  """Shuffles all input files and writes one binary output file per split fraction."""
  filenames = _get_filenames(input_directories)

  random_shuffle(filenames)

  start_from_index = 0
  for index, output_filename in enumerate(output_filenames):
    sample_count = int(len(filenames) * split_fractions[index])
    print(output_filename + ': ' + str(sample_count))

    end_index = min(start_from_index + sample_count, len(filenames))
    _convert_files_to_binary(filenames[start_from_index:end_index], output_filename)

    start_from_index = end_index

def _text_to_vocabulary(input_directories, vocabulary_filename, max_words=200000):
  """Counts whitespace-separated tokens and writes the most frequent ones."""
  filenames = _get_filenames(input_directories)

  counter = collections.Counter()

  for filename in filenames:
    with open(filename, 'r') as f:
      document = f.read()

    words = document.split()
    counter.update(words)

  with open(vocabulary_filename, 'w') as writer:
    # Reserve room for the four special tokens appended below.
    for word, count in counter.most_common(max_words - 4):
      writer.write(word + ' ' + str(count) + '\n')
    writer.write('<s> 0\n')
    writer.write('</s> 0\n')
    writer.write('<UNK> 0\n')
    writer.write('<PAD> 0\n')

def _get_filenames(input_directories):
  """Lists all regular files directly inside each input directory."""
  filenames = []
  for directory_name in input_directories:
    filenames.extend([join(directory_name, f) for f in listdir(directory_name)
                      if isfile(join(directory_name, f))])
  return filenames

def _convert_files_to_binary(input_filenames, output_filename):
  with open(output_filename, 'wb') as writer:
    for filename in input_filenames:
      with open(filename, 'r') as f:
        document = f.read()

      # The first line of a story file is the title; the remainder is the body.
      document_parts = document.split('\n', 1)
      assert len(document_parts) == 2

      title = '<d><p><s>' + document_parts[0] + '</s></p></d>'

      # Python 2: decode the raw str to unicode before sentence tokenization.
      body = document_parts[1].decode('utf8').replace('\n', ' ').replace('\t', ' ')
      sentences = sent_tokenize(body)
      body = '<d><p>' + ' '.join(['<s>' + sentence + '</s>' for sentence in sentences]) + '</p></d>'
      body = body.encode('utf8')

      tf_example = example_pb2.Example()
      tf_example.features.feature['article'].bytes_list.value.extend([body])
      tf_example.features.feature['abstract'].bytes_list.value.extend([title])
      tf_example_str = tf_example.SerializeToString()
      str_len = len(tf_example_str)
      # Each record: an 8-byte length followed by the serialized example.
      writer.write(struct.pack('q', str_len))
      writer.write(struct.pack('%ds' % str_len, tf_example_str))

def main(unused_argv):
  assert FLAGS.command and FLAGS.in_directories and FLAGS.out_files
  output_filenames = FLAGS.out_files.split(',')
  input_directories = FLAGS.in_directories.split(',')

  if FLAGS.command == 'text_to_binary':
    assert FLAGS.split

    split_fractions = [float(s) for s in FLAGS.split.split(',')]

    assert len(output_filenames) == len(split_fractions)

    _text_to_binary(input_directories, output_filenames, split_fractions)

  elif FLAGS.command == 'text_to_vocabulary':
    assert len(output_filenames) == 1

    _text_to_vocabulary(input_directories, output_filenames[0])

if __name__ == '__main__':
  tf.app.run()

--------------------------------------------------------------------------------
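The generated files can be sanity-checked by reversing the framing that `_convert_files_to_binary` uses: each record is an 8-byte length (`struct` format `'q'`) followed by a serialized `tf.Example` whose `article` and `abstract` features hold the tagged text. A minimal read-back sketch follows; the script name and the `dailymail-train.bin` path are illustrative, not part of the repository:

```python
# verify_bin.py -- hypothetical sanity-check helper, not part of this repo.
import struct

from tensorflow.core.example import example_pb2


def read_examples(path):
    """Yields tf.Example protos from a file written by _convert_files_to_binary."""
    with open(path, 'rb') as reader:
        while True:
            len_bytes = reader.read(8)
            if not len_bytes:
                break  # clean end of file
            str_len = struct.unpack('q', len_bytes)[0]
            yield example_pb2.Example.FromString(reader.read(str_len))


if __name__ == '__main__':
    # Spot-check the abstracts of the first few records.
    for i, example in enumerate(read_examples('dailymail-train.bin')):
        print(example.features.feature['abstract'].bytes_list.value[0])
        if i == 2:
            break
```

If the conversion worked, each printed abstract should be a single title wrapped in `<d><p><s> ... </s></p></d>` tags, matching the format TextSum expects.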