├── README.md ├── check_data.py ├── generate_data.py ├── sample ├── file1.txt └── file2.txt └── test.sh /README.md: -------------------------------------------------------------------------------- 1 | # textsum 2 | 3 | - description 4 | - test code for textsum 5 | - [textsum](https://github.com/tensorflow/models/tree/master/textsum) 6 | - [Text summarization with TensorFlow](https://research.googleblog.com/2016/08/text-summarization-with-tensorflow.html) 7 | - [English Gigaword](https://catalog.ldc.upenn.edu/LDC2012T21) 8 | - reference paper [A Neural Attention Model for Abstractive Sentence Summarization](https://arxiv.org/abs/1509.00685) 9 | - an implementation using tensorflow, [neural-summary-tensorflow](https://github.com/carpedm20/neural-summary-tensorflow) 10 | 11 | - pre-requisite and setting 12 | - follow instructions in https://github.com/tensorflow/models/tree/master/textsum 13 | 14 | - data format 15 | - what does the data file look like? 16 | 17 | ``` 18 | # you need to install google protobuf (http://dchua.com/2016/04/08/installing-grpc,-protobuf-and-its-dependencies-for-python-development/) 19 | $ python check_data.py --data_path=data/data 20 | 21 | features { 22 | feature { 23 | key: "abstract" 24 | value { 25 | bytes_list { 26 | value: "

sri lanka closes schools as war escalates .

" 27 | } 28 | } 29 | } 30 | feature { 31 | key: "article" 32 | value { 33 | bytes_list { 34 | value: "

the sri lankan government on wednesday announced the closure of government schools with immediate effect as a military campaign against tamil separatists escalated in the north of the country . t 35 | he cabinet wednesday decided to advance the december holidays by one month because of a threat from the liberation tigers of tamil eelam -lrb- ltte -rrb- against school children , a government official said . `` there are i 36 | ntelligence reports that the tigers may try to kill a lot of children to provoke a backlash against tamils in colombo . `` if that happens , troops will have to be withdrawn from the north to maintain law and order here , \ 37 | '\' a police official said . he said education minister richard pathirana visited several government schools wednesday before the closure decision was taken . the government will make alternate arrangements to hold 38 | end of term examinations , officials said . earlier wednesday , president chandrika kumaratunga said the ltte may step up their attacks in the capital to seek revenge for the ongoing military offensive which she described 39 | as the biggest ever drive to take the tiger town of jaffna . .

" 40 | } 41 | } 42 | } 43 | feature { 44 | key: "publisher" 45 | value { 46 | bytes_list { 47 | value: "AFP" 48 | } 49 | } 50 | } 51 | } 52 | ... 53 | ``` 54 | 55 | - how to create own training data? 56 | 57 | ``` 58 | $ python generate_data.py --input_dir=sample --data_path=sample-0 59 | $ python check_data.py --data_path=sample-0 --crc=4 60 | 61 | features { 62 | feature { 63 | key: "abstract" 64 | value { 65 | bytes_list { 66 | value: "

hello tensorflow

" 67 | } 68 | } 69 | } 70 | feature { 71 | key: "article" 72 | value { 73 | bytes_list { 74 | value: "

this is a sample file

" 75 | } 76 | } 77 | } 78 | } 79 | 80 | article \t

this is a sample file

81 | abstract \t

hello tensorflow

82 | ... 83 | ``` 84 | - recent version of textsum provides `data_convert_example.py` 85 | ``` 86 | $ python data_convert_example.py --command binary_to_text --in_file data/data --out_file data/text_data 87 | $ python data_convert_example.py --command text_to_binary --in_file data/text_data --out_file data/binary_data 88 | $ python data_convert_example.py --command binary_to_text --in_file data/binary_data --out_file data/text_data2 89 | $ diff data/text_data2 data/text_data 90 | 91 | # your text_data format looks like 92 | 93 | abstract=hello world article=this is a test file 94 | abstract=... article=.... 95 | ... 96 | ``` 97 | 98 | - test 99 | ```shell 100 | $ ./test.sh -v -v 101 | ... 102 | running_avg_loss: 1.002997 103 | running_avg_loss: 1.384698 104 | running_avg_loss: 0.865053 105 | ... 106 | ``` 107 | 108 | - gpu setting tips 109 | ``` 110 | # if you want to prevent full gpu-memory allocation, set allow_growth option to 111 | # `seq2seq_attention.py`, `seq2seq_attention_decode.py` 112 | 113 | device_config=tf.ConfigProto(allow_soft_placement=True) 114 | device_config.gpu_options.allow_growth = True 115 | sess = tf.Session(config=device_config) 116 | ... 117 | ``` 118 | -------------------------------------------------------------------------------- /check_data.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | 3 | import sys 4 | import os 5 | 6 | import glob 7 | import random 8 | import struct 9 | import tensorflow as tf 10 | from tensorflow.core.example import example_pb2 11 | 12 | from google.protobuf import json_format 13 | import json 14 | import base64 15 | 16 | FLAGS = tf.app.flags.FLAGS 17 | tf.app.flags.DEFINE_string('data_path', 'data/data', 'Path expression to tf.Example.') 18 | tf.app.flags.DEFINE_string('crc', '0', 'crc size') 19 | FLAGS.crc = int(FLAGS.crc) 20 | 21 | def ExampleGen(recordio_path, crc=0, num_epochs=None): 22 | """Generates tf.Examples from path of recordio files. 
23 | 24 | Args: 25 | recordio_path: CNS path to tf.Example recordio 26 | num_epochs: Number of times to go through the data. None means infinite. 27 | 28 | Yields: 29 | Deserialized tf.Example. 30 | 31 | If there are multiple files specified, they accessed in a random order. 32 | """ 33 | epoch = 0 34 | while True: 35 | if num_epochs is not None and epoch >= num_epochs: 36 | break 37 | filelist = glob.glob(recordio_path) 38 | assert filelist, 'Empty filelist.' 39 | random.shuffle(filelist) 40 | for f in filelist: 41 | ''' 42 | for example_str in tf.python_io.tf_record_iterator(f): 43 | yield example_pb2.Example.FromString(example_str) 44 | ''' 45 | reader = open(f, 'rb') 46 | while True: 47 | len_bytes = reader.read(8) 48 | skip_bytes = reader.read(crc) # skip crc bytes 49 | if not len_bytes: break 50 | str_len = struct.unpack('q', len_bytes)[0] 51 | example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0] 52 | skip_bytes = reader.read(crc) # skip crc bytes 53 | yield example_pb2.Example.FromString(example_str) 54 | epoch += 1 55 | 56 | for ret in ExampleGen(FLAGS.data_path, FLAGS.crc, num_epochs=1) : 57 | print type(ret) 58 | print ret 59 | json_string = json_format.MessageToJson(ret) 60 | json_obj = json.loads(json_string) 61 | feature = json_obj['features']['feature'] 62 | for key, val in feature.iteritems() : 63 | print key + '\t', 64 | bytesList = val['bytesList'] 65 | for v in bytesList['value'] : 66 | print base64.b64decode(v), 67 | print '\n', 68 | print '\n', 69 | -------------------------------------------------------------------------------- /generate_data.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | 3 | import sys 4 | import os 5 | 6 | import tensorflow as tf 7 | 8 | # Special tokens 9 | PARAGRAPH_START = '

' 10 | PARAGRAPH_END = '

' 11 | SENTENCE_START = '' 12 | SENTENCE_END = '' 13 | UNKNOWN_TOKEN = '' 14 | PAD_TOKEN = '' 15 | DOCUMENT_START = '' 16 | DOCUMENT_END = '' 17 | 18 | FLAGS = tf.app.flags.FLAGS 19 | tf.app.flags.DEFINE_string('input_dir', 'sample', 'input directory path') 20 | tf.app.flags.DEFINE_string('data_path', 'sample-0', 'output file path') 21 | 22 | 23 | ''' 24 | features { 25 | feature { 26 | key: "abstract" 27 | value { 28 | bytes_list { 29 | value: "...." 30 | } 31 | } 32 | } 33 | feature { 34 | key: "article" 35 | value { 36 | bytes_list { 37 | value: "...." 38 | } 39 | } 40 | } 41 | } 42 | ''' 43 | 44 | files = os.listdir(FLAGS.input_dir) 45 | # python_io : https://www.tensorflow.org/versions/r0.10/api_docs/python/python_io.html 46 | writer = tf.python_io.TFRecordWriter(FLAGS.data_path) 47 | for i, file in enumerate(files): 48 | fid = open(os.path.join(FLAGS.input_dir, file), 'r') 49 | for line in fid : 50 | line = line.strip() 51 | if line == "" : 52 | continue 53 | try : key, val = line.split('\t',1) 54 | except : continue 55 | key = DOCUMENT_START + ' ' + PARAGRAPH_START + ' ' + SENTENCE_START + ' ' + key + ' ' + SENTENCE_END + ' ' + PARAGRAPH_END + ' ' + DOCUMENT_END 56 | val = DOCUMENT_START + ' ' + PARAGRAPH_START + ' ' + SENTENCE_START + ' ' + val + ' ' + SENTENCE_END + ' ' + PARAGRAPH_END + ' ' + DOCUMENT_END 57 | example = tf.train.Example( 58 | features = tf.train.Features( 59 | feature = { 60 | 'abstract': tf.train.Feature(bytes_list=tf.train.BytesList(value=[key])), 61 | 'article': tf.train.Feature(bytes_list=tf.train.BytesList(value=[val])) 62 | }, 63 | ) 64 | ) 65 | serialized = example.SerializeToString() 66 | writer.write(serialized) 67 | fid.close() 68 | writer.close() 69 | -------------------------------------------------------------------------------- /sample/file1.txt: -------------------------------------------------------------------------------- 1 | hello world this is a test file 2 | a a 3 | 
#!/bin/bash
# Smoke test for the textsum pipeline: copies the bundled sample data into
# place, then runs training, evaluation and decoding through the
# bazel-built seq2seq_attention binary.
#
# Usage: ./test.sh [-v] [-v] [--dry-run] [-h|--help]

set -o nounset
set -o errexit

VERBOSE_MODE=0

# ERR trap target: with -v it reports the failing line via `caller`,
# otherwise it exits silently with the failing status.
function error_handler()
{
  local STATUS=${1:-1}
  [ ${VERBOSE_MODE} == 0 ] && exit ${STATUS}
  echo "Exits abnormally at line "`caller 0`
  exit ${STATUS}
}
trap "error_handler" ERR

PROGNAME=`basename ${BASH_SOURCE}`
DRY_RUN_MODE=0

# Prints usage and exits with the given status (default 0).
function print_usage_and_exit()
{
  set +x
  local STATUS=$1
  echo "Usage: ${PROGNAME} [-v] [-v] [--dry-run] [-h] [--help]"
  echo ""
  echo " Options -"
  echo " -v enables verbose mode 1"
  echo " -v -v enables verbose mode 2"
  echo " --dry-run show what would have been dumped"
  echo " -h, --help shows this help message"
  exit ${STATUS:-0}
}

# Echoes its arguments only when at least one -v was given.
function debug()
{
  if [ "$VERBOSE_MODE" != 0 ]; then
    echo $@
  fi
}

GETOPT=`getopt -o vh --long dry-run,help -n "${PROGNAME}" -- "$@"`
if [ $? != 0 ] ; then print_usage_and_exit 1; fi

eval set -- "${GETOPT}"

while true
do case "$1" in
     -v) let VERBOSE_MODE+=1; shift;;
     --dry-run) DRY_RUN_MODE=1; shift;;
     -h|--help) print_usage_and_exit 0;;
     --) shift; break;;
     *) echo "Internal error!"; exit 1;;
   esac
done

# A second -v additionally traces every command.
if (( VERBOSE_MODE > 1 )); then
  set -x
fi


# template area is ended.
# -----------------------------------------------------------------------------
# No positional arguments are accepted.
if [ ${#} != 0 ]; then print_usage_and_exit 1; fi

# current dir of this script
CDIR=$(readlink -f $(dirname $(readlink -f ${BASH_SOURCE[0]})))
PDIR=$(readlink -f $(dirname $(readlink -f ${BASH_SOURCE[0]}))/..)

# -----------------------------------------------------------------------------
# functions

# Silences stderr (saved on fd 3) until revert_calmness restores it.
function make_calmness()
{
  exec 3>&2 # save 2 to 3
  exec 2> /dev/null
}

function revert_calmness()
{
  exec 2>&3 # restore 2 from previous saved 3(originally 2)
}

# Closes the fd-3 backup opened by make_calmness.
function close_fd()
{
  exec 3>&-
}

# Goto emulation: runs everything after a "label:" marker in this script.
function jumpto
{
  label=$1
  cmd=$(sed -n "/$label:/{:a;n;p;ba};" $0 | grep -v ':$')
  eval "$cmd"
  exit
}


# end functions
# -----------------------------------------------------------------------------



# -----------------------------------------------------------------------------
# main

# Quiet by default; -v -v runs with stderr visible.
make_calmness
if (( VERBOSE_MODE > 1 )); then
  revert_calmness
fi

# Reuse the same sample data as training, validation and test sets.
cp -rf ${CDIR}/textsum/data ${CDIR}
cp -rf ${CDIR}/data/data ${CDIR}/data/training-0
cp -rf ${CDIR}/data/data ${CDIR}/data/validation-0
cp -rf ${CDIR}/data/data ${CDIR}/data/test-0

# training
function train {
  ${CDIR}/bazel-bin/textsum/seq2seq_attention \
    --mode=train \
    --article_key=article \
    --abstract_key=abstract \
    --data_path=data/training-* \
    --vocab_path=data/vocab \
    --log_root=textsum/log_root \
    --train_dir=textsum/log_root/train
}

# evaluation
function evaluate {
  ${CDIR}/bazel-bin/textsum/seq2seq_attention \
    --mode=eval \
    --article_key=article \
    --abstract_key=abstract \
    --data_path=data/validation-* \
    --vocab_path=data/vocab \
    --log_root=textsum/log_root \
    --eval_dir=textsum/log_root/eval
}

# decode
function decode {
  ${CDIR}/bazel-bin/textsum/seq2seq_attention \
    --mode=decode \
    --article_key=article \
    --abstract_key=abstract \
    --data_path=data/test-* \
    --vocab_path=data/vocab \
    --log_root=textsum/log_root \
    --decode_dir=textsum/log_root/decode \
    --beam_size=8
}

train
evaluate
decode

close_fd

# end main
# -----------------------------------------------------------------------------