├── README.md
├── check_data.py
├── generate_data.py
├── sample
    ├── file1.txt
    └── file2.txt
└── test.sh


/README.md:
--------------------------------------------------------------------------------
  1 | # textsum
  2 | 
  3 | - description
  4 |   - test code for textsum
  5 |   - [textsum](https://github.com/tensorflow/models/tree/master/textsum)
  6 |     - [Text summarization with TensorFlow](https://research.googleblog.com/2016/08/text-summarization-with-tensorflow.html)
  7 | 	- [English Gigaword](https://catalog.ldc.upenn.edu/LDC2012T21)
  8 |     - reference paper [A Neural Attention Model for Abstractive Sentence Summarization](https://arxiv.org/abs/1509.00685)
  9 |       - an implementation using tensorflow, [neural-summary-tensorflow](https://github.com/carpedm20/neural-summary-tensorflow)
 10 |   
 11 | - pre-requesite and setting
 12 |   - follow instructions in https://github.com/tensorflow/models/tree/master/textsum
 13 | 
 14 | - data format
 15 |   - how does data file look like?
 16 |   
 17 | ```
 18 |   # you need to install google protobuf(http://dchua.com/2016/04/08/installing-grpc,-protobuf-and-its-dependencies-for-python-development/)
 19 |   $ python check_data.py --data_path=data/data
 20 | 
 21 |   features {
 22 |     feature {
 23 |       key: "abstract"
 24 |       value {
 25 |         bytes_list {
 26 |           value: "<d> <p> <s> sri lanka closes schools as war escalates . </s> </p> </d>"
 27 |         }
 28 |       }
 29 |     }
 30 |     feature {
 31 |       key: "article"
 32 |       value {
 33 |         bytes_list {
 34 |           value: "<d> <p> <s> the sri lankan government on wednesday announced the closure of government schools with immediate effect as a military campaign against tamil separatists escalated in the north of the country . </s> <s> t
 35 | he cabinet wednesday decided to advance the december holidays by one month because of a threat from the liberation tigers of tamil eelam -lrb- ltte -rrb- against school children , a government official said . </s> <s> `` there are i
 36 | ntelligence reports that the tigers may try to kill a lot of children to provoke a backlash against tamils in colombo . </s> <s> `` if that happens , troops will have to be withdrawn from the north to maintain law and order here , \
 37 | '\' a police official said . </s> <s> he said education minister richard pathirana visited several government schools wednesday before the closure decision was taken . </s> <s> the government will make alternate arrangements to hold
 38 |  end of term examinations , officials said . </s> <s> earlier wednesday , president chandrika kumaratunga said the ltte may step up their attacks in the capital to seek revenge for the ongoing military offensive which she described
 39 | as the biggest ever drive to take the tiger town of jaffna . . </s> </p> </d>"
 40 |         }
 41 |       }
 42 |     }
 43 |     feature {
 44 |       key: "publisher"
 45 |       value {
 46 |         bytes_list {
 47 |           value: "AFP"
 48 |         }
 49 |       }
 50 |     }
 51 |   }
 52 |   ...
 53 | ```
 54 | 
 55 |   - how to create own training data?
 56 |   
 57 | ```
 58 |   $ python generate_data.py --input_dir=sample --data_path=sample-0
 59 |   $ python check_data.py --data_path=sample-0 --crc=4
 60 |   <class 'tensorflow.core.example.example_pb2.Example'>
 61 |   features {
 62 |     feature {
 63 |       key: "abstract"
 64 |       value {
 65 |         bytes_list {
 66 |           value: "<d> <p> <s> hello tensorflow </s> </p> </d>"
 67 |         }
 68 |       }
 69 |     }
 70 |     feature {
 71 |       key: "article"
 72 |       value {
 73 |         bytes_list {
 74 |           value: "<d> <p> <s> this is a sample file </s> </p> </d>"
 75 |         }
 76 |       }
 77 |     }
 78 |   }
 79 | 
 80 |   article   \t  <d> <p> <s> this is a sample file </s> </p> </d>
 81 |   abstract  \t  <d> <p> <s> hello tensorflow </s> </p> </d>
 82 |   ...
 83 |   ```
 84 |   - recent version of textsum provides `data_convert_example.py`
 85 |   ```
 86 |   $ python data_convert_example.py --command binary_to_text --in_file data/data --out_file data/text_data
 87 |   $ python data_convert_example.py --command text_to_binary --in_file data/text_data --out_file data/binary_data
 88 |   $ python data_convert_example.py --command binary_to_text --in_file data/binary_data --out_file data/text_data2
 89 |   $ diff data/text_data2 data/text_data
 90 |   
 91 |   # your text_data format looks like 
 92 |   
 93 |   abstract=hello world    article=this is a test file
 94 |   abstract=...            article=....
 95 |   ...
 96 | ```
 97 | 
 98 | - test
 99 | ```shell
100 | $ ./test.sh -v -v
101 | ...
102 | running_avg_loss: 1.002997
103 | running_avg_loss: 1.384698
104 | running_avg_loss: 0.865053
105 | ...
106 | ```
107 | 
108 | - gpu setting tips
109 | ```
110 | # if you want to prevent full gpu-memory allocation, set allow_growth option to 
111 | # `seq2seq_attention.py`, `seq2seq_attention_decode.py`
112 | 
113 | device_config=tf.ConfigProto(allow_soft_placement=True)
114 | device_config.gpu_options.allow_growth = True
115 | sess = tf.Session(config=device_config)
116 | ...
117 | ```
118 | 


--------------------------------------------------------------------------------
/check_data.py:
--------------------------------------------------------------------------------
 1 | #!/bin/env python
 2 | 
 3 | import sys
 4 | import os
 5 | 
 6 | import glob
 7 | import random
 8 | import struct
 9 | import tensorflow as tf
10 | from tensorflow.core.example import example_pb2
11 | 
12 | from google.protobuf import json_format
13 | import json
14 | import base64
15 | 
16 | FLAGS = tf.app.flags.FLAGS
17 | tf.app.flags.DEFINE_string('data_path', 'data/data', 'Path expression to tf.Example.')
18 | tf.app.flags.DEFINE_string('crc', '0', 'crc size')
19 | FLAGS.crc = int(FLAGS.crc)
20 | 
21 | def ExampleGen(recordio_path, crc=0, num_epochs=None):
22 |   """Generates tf.Examples from path of recordio files.
23 | 
24 |   Args:
25 |     recordio_path: CNS path to tf.Example recordio
26 |     num_epochs: Number of times to go through the data. None means infinite.
27 | 
28 |   Yields:
29 |     Deserialized tf.Example.
30 | 
31 |   If there are multiple files specified, they accessed in a random order.
32 |   """
33 |   epoch = 0
34 |   while True:
35 |     if num_epochs is not None and epoch >= num_epochs:
36 |       break
37 |     filelist = glob.glob(recordio_path)
38 |     assert filelist, 'Empty filelist.'
39 |     random.shuffle(filelist)
40 |     for f in filelist:
41 |       '''
42 |       for example_str in tf.python_io.tf_record_iterator(f):
43 |         yield example_pb2.Example.FromString(example_str)
44 |       '''
45 |       reader = open(f, 'rb')
46 |       while True:
47 |         len_bytes = reader.read(8)
48 |         skip_bytes = reader.read(crc) # skip crc bytes
49 |         if not len_bytes: break
50 |         str_len = struct.unpack('q', len_bytes)[0]
51 |         example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
52 |         skip_bytes = reader.read(crc) # skip crc bytes
53 |         yield example_pb2.Example.FromString(example_str)
54 |     epoch += 1
55 | 
56 | for ret in ExampleGen(FLAGS.data_path, FLAGS.crc, num_epochs=1) :
57 |   print type(ret)
58 |   print ret
59 |   json_string = json_format.MessageToJson(ret)
60 |   json_obj = json.loads(json_string)
61 |   feature = json_obj['features']['feature']
62 |   for key, val in feature.iteritems() :
63 |     print key + '\t',
64 |     bytesList = val['bytesList']
65 |     for v in bytesList['value'] :
66 |       print base64.b64decode(v),
67 |     print '\n',
68 |   print '\n',
69 | 


--------------------------------------------------------------------------------
/generate_data.py:
--------------------------------------------------------------------------------
 1 | #!/bin/env python
 2 | 
 3 | import sys
 4 | import os
 5 | 
 6 | import tensorflow as tf
 7 | 
 8 | # Special tokens
 9 | PARAGRAPH_START = '<p>'
10 | PARAGRAPH_END = '</p>'
11 | SENTENCE_START = '<s>'
12 | SENTENCE_END = '</s>'
13 | UNKNOWN_TOKEN = '<UNK>'
14 | PAD_TOKEN = '<PAD>'
15 | DOCUMENT_START = '<d>'
16 | DOCUMENT_END = '</d>'
17 | 
18 | FLAGS = tf.app.flags.FLAGS
19 | tf.app.flags.DEFINE_string('input_dir', 'sample', 'input directory path')
20 | tf.app.flags.DEFINE_string('data_path', 'sample-0', 'output file path')
21 | 
22 | 
23 | '''
24 | features {
25 |   feature {
26 |       key: "abstract"
27 |       value {
28 |         bytes_list {
29 |           value: "...."
30 |         }
31 |       }
32 |   }
33 |   feature {
34 |       key: "article"
35 |       value {
36 |         bytes_list {
37 |           value: "...."
38 |         }
39 |       }
40 |   }
41 | }
42 | '''
43 | 
44 | files = os.listdir(FLAGS.input_dir)
45 | # python_io : https://www.tensorflow.org/versions/r0.10/api_docs/python/python_io.html
46 | writer = tf.python_io.TFRecordWriter(FLAGS.data_path)
47 | for i, file in enumerate(files):
48 |     fid = open(os.path.join(FLAGS.input_dir, file), 'r')
49 |     for line in fid :
50 |         line = line.strip()
51 |         if line == "" :
52 |             continue
53 |         try : key, val = line.split('\t',1)
54 |         except : continue
55 |         key = DOCUMENT_START + ' ' + PARAGRAPH_START + ' ' + SENTENCE_START + ' ' + key + ' ' + SENTENCE_END + ' ' + PARAGRAPH_END + ' ' + DOCUMENT_END
56 |         val = DOCUMENT_START + ' ' + PARAGRAPH_START + ' ' + SENTENCE_START + ' ' + val + ' ' + SENTENCE_END + ' ' + PARAGRAPH_END + ' ' + DOCUMENT_END
57 |         example = tf.train.Example(
58 |             features = tf.train.Features(
59 |                 feature = {
60 |                     'abstract': tf.train.Feature(bytes_list=tf.train.BytesList(value=[key])),
61 |                     'article': tf.train.Feature(bytes_list=tf.train.BytesList(value=[val]))
62 |                 },
63 |             )
64 |         )
65 |         serialized = example.SerializeToString()
66 |         writer.write(serialized)
67 |     fid.close()
68 | writer.close()
69 | 


--------------------------------------------------------------------------------
/sample/file1.txt:
--------------------------------------------------------------------------------
1 | hello world	this is a test file
2 | a	a
3 | 


--------------------------------------------------------------------------------
/sample/file2.txt:
--------------------------------------------------------------------------------
1 | hello tensorflow	this is a sample file
2 | 


--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | set -o nounset
  4 | set -o errexit
  5 | 
  6 | VERBOSE_MODE=0
  7 | 
  8 | function error_handler()
  9 | {
 10 |   local STATUS=${1:-1}
 11 |   [ ${VERBOSE_MODE} == 0 ] && exit ${STATUS}
 12 |   echo "Exits abnormally at line "`caller 0`
 13 |   exit ${STATUS}
 14 | }
 15 | trap "error_handler" ERR
 16 | 
 17 | PROGNAME=`basename ${BASH_SOURCE}`
 18 | DRY_RUN_MODE=0
 19 | 
 20 | function print_usage_and_exit()
 21 | {
 22 |   set +x
 23 |   local STATUS=$1
 24 |   echo "Usage: ${PROGNAME} [-v] [-v] [--dry-run] [-h] [--help]"
 25 |   echo ""
 26 |   echo " Options -"
 27 |   echo "  -v                 enables verbose mode 1"
 28 |   echo "  -v -v              enables verbose mode 2"
 29 |   echo "      --dry-run      show what would have been dumped"
 30 |   echo "  -h, --help         shows this help message"
 31 |   exit ${STATUS:-0}
 32 | }
 33 | 
 34 | function debug()
 35 | {
 36 |   if [ "$VERBOSE_MODE" != 0 ]; then
 37 |     echo $@
 38 |   fi
 39 | }
 40 | 
 41 | GETOPT=`getopt -o vh --long dry-run,help -n "${PROGNAME}" -- "$@"`
 42 | if [ $? != 0 ] ; then print_usage_and_exit 1; fi
 43 | 
 44 | eval set -- "${GETOPT}"
 45 | 
 46 | while true
 47 | do case "$1" in
 48 |      -v)            let VERBOSE_MODE+=1; shift;;
 49 |      --dry-run)     DRY_RUN_MODE=1; shift;;
 50 |      -h|--help)     print_usage_and_exit 0;;
 51 |      --)            shift; break;;
 52 |      *) echo "Internal error!"; exit 1;;
 53 |    esac
 54 | done
 55 | 
 56 | if (( VERBOSE_MODE > 1 )); then
 57 |   set -x
 58 | fi
 59 | 
 60 | 
 61 | # template area is ended.
 62 | # -----------------------------------------------------------------------------
 63 | if [ ${#} != 0 ]; then print_usage_and_exit 1; fi
 64 | 
 65 | # current dir of this script
 66 | CDIR=$(readlink -f $(dirname $(readlink -f ${BASH_SOURCE[0]})))
 67 | PDIR=$(readlink -f $(dirname $(readlink -f ${BASH_SOURCE[0]}))/..)
 68 | 
 69 | # -----------------------------------------------------------------------------
 70 | # functions
 71 | 
 72 | function make_calmness()
 73 | {
 74 | 	exec 3>&2 # save 2 to 3
 75 | 	exec 2> /dev/null
 76 | }
 77 | 
 78 | function revert_calmness()
 79 | {
 80 | 	exec 2>&3 # restore 2 from previous saved 3(originally 2)
 81 | }
 82 | 
 83 | function close_fd()
 84 | {
 85 | 	exec 3>&-
 86 | }
 87 | 
 88 | function jumpto
 89 | {
 90 | 	label=$1
 91 | 	cmd=$(sed -n "/$label:/{:a;n;p;ba};" $0 | grep -v ':$')
 92 | 	eval "$cmd"
 93 | 	exit
 94 | }
 95 | 
 96 | 
 97 | # end functions
 98 | # -----------------------------------------------------------------------------
 99 | 
100 | 
101 | 
102 | # -----------------------------------------------------------------------------
103 | # main 
104 | 
105 | make_calmness
106 | if (( VERBOSE_MODE > 1 )); then
107 | 	revert_calmness
108 | fi
109 | 
110 | cp -rf ${CDIR}/textsum/data ${CDIR}
111 | cp -rf ${CDIR}/data/data ${CDIR}/data/training-0
112 | cp -rf ${CDIR}/data/data ${CDIR}/data/validation-0
113 | cp -rf ${CDIR}/data/data ${CDIR}/data/test-0
114 | 
115 | # training
116 | function train {
117 | 	${CDIR}/bazel-bin/textsum/seq2seq_attention \
118 | 	  --mode=train \
119 | 	  --article_key=article \
120 | 	  --abstract_key=abstract \
121 | 	  --data_path=data/training-* \
122 | 	  --vocab_path=data/vocab \
123 | 	  --log_root=textsum/log_root \
124 | 	  --train_dir=textsum/log_root/train
125 | }
126 | 
127 | # evaluation
128 | function evaluate {
129 | 	${CDIR}/bazel-bin/textsum/seq2seq_attention \
130 | 	  --mode=eval \
131 | 	  --article_key=article \
132 | 	  --abstract_key=abstract \
133 | 	  --data_path=data/validation-* \
134 | 	  --vocab_path=data/vocab \
135 | 	  --log_root=textsum/log_root \
136 | 	  --eval_dir=textsum/log_root/eval
137 | }
138 | 
139 | # decode
140 | function decode {
141 | 	${CDIR}/bazel-bin/textsum/seq2seq_attention \
142 | 	  --mode=decode \
143 | 	  --article_key=article \
144 | 	  --abstract_key=abstract \
145 | 	  --data_path=data/test-* \
146 | 	  --vocab_path=data/vocab \
147 | 	  --log_root=textsum/log_root \
148 | 	  --decode_dir=textsum/log_root/decode \
149 | 	  --beam_size=8
150 | }
151 | 
152 | train
153 | evaluate
154 | decode
155 | 
156 | close_fd
157 | 
158 | # end main
159 | # -----------------------------------------------------------------------------
160 | 


--------------------------------------------------------------------------------