├── .gitignore
├── LICENSE
├── README.md
└── textsum_data_convert.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2016 surmenok

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# TextSum
Prepares a dataset for the TensorFlow text summarization (TextSum) model.

--------------------------------------------------------------------------------
/textsum_data_convert.py:
--------------------------------------------------------------------------------
"""Example of converting TextSum model data.
Usage:
python textsum_data_convert.py --command text_to_binary --in_directories dailymail/stories --out_files dailymail-train.bin,dailymail-validation.bin,dailymail-test.bin --split 0.8,0.15,0.05
python textsum_data_convert.py --command text_to_vocabulary --in_directories cnn/stories,dailymail/stories --out_files vocab
"""

import collections
import struct
import sys

from os import listdir
from os.path import isfile, join

from nltk.tokenize import sent_tokenize

import tensorflow as tf
from tensorflow.core.example import example_pb2

from numpy.random import seed as random_seed
from numpy.random import shuffle as random_shuffle

random_seed(123)  # Reproducibility

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('command', 'text_to_binary',
                           'Either text_to_vocabulary or text_to_binary. '
                           'Specify FLAGS.in_directories accordingly.')
tf.app.flags.DEFINE_string('in_directories', '', 'comma separated paths to input directories')
tf.app.flags.DEFINE_string('out_files', '', 'comma separated paths to output files')
tf.app.flags.DEFINE_string('split', '', 'comma separated fractions of data')

def _text_to_binary(input_directories, output_filenames, split_fractions):
  """Shuffles all input files and writes one binary output file per split fraction."""
  filenames = _get_filenames(input_directories)

  random_shuffle(filenames)

  start_from_index = 0
  for index, output_filename in enumerate(output_filenames):
    sample_count = int(len(filenames) * split_fractions[index])
    print(output_filename + ': ' + str(sample_count))

    end_index = min(start_from_index + sample_count, len(filenames))
    _convert_files_to_binary(filenames[start_from_index:end_index], output_filename)

    start_from_index = end_index

def _text_to_vocabulary(input_directories, vocabulary_filename, max_words=200000):
  """Counts whitespace-separated tokens and writes the most frequent ones."""
  filenames = _get_filenames(input_directories)

  counter = collections.Counter()

  for filename in filenames:
    with open(filename, 'r') as f:
      document = f.read()

    words = document.split()
    counter.update(words)

  with open(vocabulary_filename, 'w') as writer:
    # Reserve room for the four special tokens appended below.
    for word, count in counter.most_common(max_words - 4):
      writer.write(word + ' ' + str(count) + '\n')
    writer.write('<s> 0\n')
    writer.write('</s> 0\n')
    writer.write('<UNK> 0\n')
    writer.write('<PAD> 0\n')

def _get_filenames(input_directories):
  """Lists all regular files directly inside each input directory."""
  filenames = []
  for directory_name in input_directories:
    filenames.extend([join(directory_name, f) for f in listdir(directory_name)
                      if isfile(join(directory_name, f))])
  return filenames

def _convert_files_to_binary(input_filenames, output_filename):
  with open(output_filename, 'wb') as writer:
    for filename in input_filenames:
      with open(filename, 'r') as f:
        document = f.read()

      # The first line of a story file is the title; the remainder is the body.
      document_parts = document.split('\n', 1)
      assert len(document_parts) == 2

      title = '<d><p><s>' + document_parts[0] + '</s></p></d>'

      # Python 2: decode the raw str to unicode before sentence tokenization.
      body = document_parts[1].decode('utf8').replace('\n', ' ').replace('\t', ' ')
      sentences = sent_tokenize(body)
      body = '<d><p>' + ' '.join(['<s>' + sentence + '</s>' for sentence in sentences]) + '</p></d>'
      body = body.encode('utf8')

      tf_example = example_pb2.Example()
      tf_example.features.feature['article'].bytes_list.value.extend([body])
      tf_example.features.feature['abstract'].bytes_list.value.extend([title])
      tf_example_str = tf_example.SerializeToString()
      str_len = len(tf_example_str)
      # Each record: an 8-byte length followed by the serialized example.
      writer.write(struct.pack('q', str_len))
      writer.write(struct.pack('%ds' % str_len, tf_example_str))

def main(unused_argv):
  assert FLAGS.command and FLAGS.in_directories and FLAGS.out_files
  output_filenames = FLAGS.out_files.split(',')
  input_directories = FLAGS.in_directories.split(',')

  if FLAGS.command == 'text_to_binary':
    assert FLAGS.split

    split_fractions = [float(s) for s in FLAGS.split.split(',')]

    assert len(output_filenames) == len(split_fractions)

    _text_to_binary(input_directories, output_filenames, split_fractions)

  elif FLAGS.command == 'text_to_vocabulary':
    assert len(output_filenames) == 1

    _text_to_vocabulary(input_directories, output_filenames[0])

if __name__ == '__main__':
  tf.app.run()

--------------------------------------------------------------------------------
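The generated files can be sanity-checked by reversing the framing that `_convert_files_to_binary` uses: each record is an 8-byte length (`struct` format `'q'`) followed by a serialized `tf.Example` whose `article` and `abstract` features hold the tagged text. A minimal read-back sketch follows; the script name and the `dailymail-train.bin` path are illustrative, not part of the repository:

```python
# verify_bin.py -- hypothetical sanity-check helper, not part of this repo.
import struct

from tensorflow.core.example import example_pb2


def read_examples(path):
    """Yields tf.Example protos from a file written by _convert_files_to_binary."""
    with open(path, 'rb') as reader:
        while True:
            len_bytes = reader.read(8)
            if not len_bytes:
                break  # clean end of file
            str_len = struct.unpack('q', len_bytes)[0]
            yield example_pb2.Example.FromString(reader.read(str_len))


if __name__ == '__main__':
    # Spot-check the abstracts of the first few records.
    for i, example in enumerate(read_examples('dailymail-train.bin')):
        print(example.features.feature['abstract'].bytes_list.value[0])
        if i == 2:
            break
```

If the conversion worked, each printed abstract should be a single title wrapped in `<d><p><s> ... </s></p></d>` tags, matching the format TextSum expects.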