├── .gitignore ├── README.rst ├── RESULTS.rst ├── autoencoder.py ├── prepare_easy_corpus.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[co] 2 | *.sw[po] 3 | *~ 4 | .*.swp 5 | *.egg-info 6 | 7 | *.txt 8 | models/ 9 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | RNN Character Autoencoder 2 | ========================= 3 | 4 | RNN character-level sequence autoencoder built with TensorFlow: learns by 5 | reconstructing sentences in order to build good sentence representations. 6 | 7 | Runs on CPU and GPU. An AWS image with TensorFlow preinstalled: 8 | ``ami-cf5028a5`` (from https://gist.github.com/erikbern/78ba519b97b440e10640).
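9 | 10 | A typical training run looks something like this (the corpus path and model name are placeholders; see RESULTS.rst for real commands and logs):: 11 | 12 |     ./autoencoder.py corpus.txt --state-size=256 --n-layers=2 --max-seq-length=60 --random-limit --save models/my_model 13 | 14 | ``--load models/my_model-<step>`` resumes from a saved checkpoint, ``--predict`` prints input/output pairs decoded by a trained model, and ``--evaluate`` reports the loss without updating the model. 15 |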
-------------------------------------------------------------------------------- /RESULTS.rst: -------------------------------------------------------------------------------- 1 | Results 2 | ======= 3 | 4 | .. contents:: 5 | 6 | Note that loss is NOT perplexity. 7 | 8 | words with reverse 9 | ------------------ 10 | 11 | 061e6b0ce3ed1129cd437af6b6b2653277a9cd2d 12 | 13 | :: 14 | 15 | $ ./autoencoder.py ~/programming/ling/corpora/opencorpora.txt \ 16 | --n-steps=10000 --reverse --words --max-seq-length=12 17 | Namespace(batch_size=32, filename='/Users/kostia/programming/ling/corpora/opencorpora.txt', max_seq_length=12, min_char_count=100, n_steps=10000, report_step=100, reverse=True, state_size=100, words=True) 18 | input_size 158 19 | 0: 4.5964474678 20 | 1: 2.42313051224 21 | 2: 1.8466616869 22 | 3: 1.60127961636 23 | ... 24 | 99: 0.0807262957096 25 | 26 | 27 | words, no reverse 28 | ----------------- 29 | 30 | 061e6b0ce3ed1129cd437af6b6b2653277a9cd2d 31 | 32 | :: 33 | 34 | $ ./autoencoder.py ~/programming/ling/corpora/opencorpora.txt \ 35 | --n-steps=10000 --words --max-seq-length=12 36 | Namespace(batch_size=32, filename='/Users/kostia/programming/ling/corpora/opencorpora.txt', max_seq_length=12, min_char_count=100, n_steps=10000, report_step=100, reverse=False, state_size=100, words=True) 37 | input_size 158 38 | 0: 4.61248397827 39 | 1: 2.28991627693 40 | 2: 1.62505555153 41 | 3: 1.50579571724 42 | ... 43 | 99: 0.136829450727 44 | 45 | phrases 46 | ------- 47 | 48 | 061e6b0ce3ed1129cd437af6b6b2653277a9cd2d 49 | 50 | :: 51 | 52 | $ ./autoencoder.py ~/programming/ling/corpora/opencorpora.txt \ 53 | --n-steps=10000 --reverse 54 | Namespace(batch_size=32, filename='/Users/kostia/programming/ling/corpora/opencorpora.txt', max_seq_length=100, min_char_count=100, n_steps=10000, report_step=100, reverse=True, state_size=100, words=False) 55 | input_size 158 56 | 0: 5.00039815903 57 | 1: 2.70567774773 58 | 2: 1.94724774361 59 | 3: 1.91385316849 60 | ... 61 | 53: 1.22337055206 62 | 63 | phrases 64 | ------- 65 | 66 | 27e9ce95f43aabacf2ca97c83d36805820ee77ba 67 | 68 | :: 69 | 70 | $ ./autoencoder.py ../opencorpora.txt \ 71 | --save models/oc_ph_nor --load models/oc_ph_nor-301 72 | (second run) 73 | 73: loss 1.1324 in 6739 s 74 | oc_ph_nor-31301: loss 1.2424 75 | 76 | $ ./autoencoder.py ../opencorpora.txt \ 77 | --save models/oc_ph_nor_50 --max-seq-length=50 78 | 73: loss 1.7140 in 3374 s 79 | oc_ph_nor_50-63101: loss 1.0997 80 | 81 | $ ./autoencoder.py ~/programming/ling/corpora/opencorpora.txt \ 82 | --save=oc_ph_nor_20 --max-seq-length=20 --load=oc_ph_nor_20-9901 83 | (second run) 84 | 99: loss 0.2436 in 2139 s 85 | 86 | Learning gone bad:: 87 | 88 | $ ./autoencoder.py ~/programming/ling/corpora/opencorpora.txt \ 89 | --max-seq-length=60 --n-layers=2 --state-size=256 --save=oc_ph_60_2_256 90 | 136: loss 0.8219 in 29395 s 91 | ... 92 | 775: loss 0.1490 in 133614 s 93 | ... 94 | 809: loss 0.1506 in 139551 s 95 | 810: loss 0.9073 in 139726 s 96 | 811: loss 3.2113 in 139898 s 97 | 98 | 99 | 100 | :: 101 | 102 | $ ./autoencoder.py --state-size=100 --n-layers=2 ../corpora/opencorpora.txt --save=models/oc_100_2 103 | Namespace(batch_size=64, evaluate=False, filename='../corpora/opencorpora.txt', load=None, max_seq_length=60, min_char_count=100, n_layers=2, n_steps=100000, predict=False, report_step=100, reverse=False, save='models/oc_100_2', state_size=100, words=False) 104 | 158 chars, train set size 92112, valid set size 2000 105 | 1: train loss 4.9637, valid loss 4.9021 in 12 s 106 | 100: train loss 3.1555, valid loss 2.7622 in 157 s 107 | 200: train loss 2.7090, valid loss 2.6445 in 307 s 108 | 300: train loss 2.5380, valid loss 2.4422 in 445 s 109 | 400: train loss 2.3865, valid loss 2.3276 in 582 s 110 | 500: train loss 2.3022, valid loss 2.2505 in 721 s 111 | 600: train loss 2.2390, valid loss 2.1890 in 857 s 112 | 113 | :: 114 | 115 | $ ./autoencoder.py --state-size=100 --n-layers=2 easy-5k-mean.txt --save=models/easy_100_2 116 | Namespace(batch_size=64, evaluate=False, filename='easy-5k-mean.txt', load=None, max_seq_length=60, min_char_count=100, n_layers=2, n_steps=100000, predict=False, report_step=100, reverse=False, save='models/easy_100_2', state_size=100, words=False) 117 | 87 chars, train set size 4500, valid set size 500 118 | 1: train loss 4.3770, valid loss 4.3390 in 5 s 119 | 100: train loss 2.8475, valid loss 2.5244 in 110 s 120 | 200: train loss 2.5042, valid loss 2.4364 in 216 s 121 | 300: train loss 2.3494, valid loss 2.2305 in 322 s 122 | 400: train loss 2.1734, valid loss 2.0840 in 427 s 123 | 500: train loss 2.0741, valid loss 2.0237 in 540 s 124 | 600: train loss 2.0151, valid loss 1.9727 in 650 s 125 | 700: train loss 1.9562, valid loss 1.9334 in 759 s 126 | 800: train loss 1.9228, valid loss 1.9041 in 871 s 127 | 900: train loss 1.9010, valid loss 1.8723 in 984 s 128 | 1000: train loss 1.8661, valid loss 1.8517 in 1096 s 129 | 1100: train loss 1.8402, valid loss 1.8177 in 1205 s 130 | 1200: train loss 1.8362, valid loss 1.8067 in 1314 s 131 | 132 | 133 | :: 134 | 135 | $ ./autoencoder.py ../easy-5k-mean.txt --state-size=256 --n-layers=2 --save models/easy_256_2 --max-seq-length 100 136 | Namespace(batch_size=64, evaluate=False, filename='../easy-5k-mean.txt', load=None, max_seq_length=100, min_char_count=100, n_layers=2, n_steps=100000, predict=False, report_step 137 | =100, reverse=False, save='models/easy_256_2', state_size=256, words=False) 138 | 87 chars, train set size 4500, valid set size 500 139 | 1:
train loss 4.4241, valid loss 4.2667 in 11 s 140 | 100: train loss 2.2933, valid loss 2.0396 in 174 s 141 | 200: train loss 1.9721, valid loss 1.8683 in 336 s 142 | 300: train loss 1.7892, valid loss 1.7046 in 498 s 143 | 400: train loss 1.6760, valid loss 1.6166 in 659 s 144 | 500: train loss 1.5971, valid loss 1.5617 in 819 s 145 | 600: train loss 1.5493, valid loss 1.5217 in 980 s 146 | 700: train loss 1.4865, valid loss 1.4815 in 1141 s 147 | 800: train loss 1.4489, valid loss 1.4511 in 1302 s 148 | 900: train loss 1.4336, valid loss 1.4294 in 1463 s 149 | 1000: train loss 1.3983, valid loss 1.4019 in 1625 s 150 | 1100: train loss 1.3877, valid loss 1.3807 in 1786 s 151 | 1200: train loss 1.3583, valid loss 1.3598 in 1947 s 152 | 153 | 154 | :: 155 | 156 | ./autoencoder.py ../opencorpora.txt --state-size=256 --n-layers=2 --save models/oc_256_2 --max-seq-length 100 --report-step=200 157 | Namespace(batch_size=64, evaluate=False, filename='../opencorpora.txt', load=None, max_seq_length=100, min_char_count=100, n_layers=2, n_steps=100000, predict=False, report_step=200, reverse=False, save='models/oc_256_2', state_size=256, words=False) 158 | 158 chars, train set size 93112, valid set size 1000 159 | 1: train loss 5.0125, valid loss 4.9021 in 12 s 160 | 200: train loss 2.6352, valid loss 2.3482 in 338 s 161 | 162 | 163 | random-limit 164 | ------------ 165 | 166 | random-limit (randomly shortening each training sequence) speeds up training a lot (028ff8422a0dd7fcd451d3dcf78d0b7c226eb4dc):: 167 | 168 | 169 | ./autoencoder.py --state-size=256 frank.txt --max-seq-length=40 --random-limit --min-char-count 10 --save models/frank_256_1_40_rand 170 | Namespace(batch_size=64, evaluate=False, filename='frank.txt', load=None, max_gradient_norm=5.0, max_seq_length=40, min_char_count=10, n_layers=1, n_steps=100000, predict=False, random_limit=True, report_step=100, reverse=False, save='models/frank_256_1_40_rand', state_size=256, words=False) 171 | 71 chars, train set size 1116, valid set size 125 172 | 1: train loss 4.2112, valid loss 4.1370 in 1 s 173 | 100: train loss 1.8699, valid loss 1.6533 in 50 s 174 | 200: train loss 1.3919, valid loss 1.5078 in 100 s 175 | 300: train loss 1.3081, valid loss 1.4206 in 149 s 176 | 400: train loss 1.2193, valid loss 1.3295 in 197 s 177 | 500: train loss 1.1664, valid loss 1.2877 in 247 s 178 | 600: train loss 1.0852, valid loss 1.2488 in 298 s 179 | 700: train loss 1.0703, valid loss 1.2186 in 347 s 180 | 800: train loss 1.0545, valid loss 1.2320 in 395 s 181 | 900: train loss 1.0142, valid loss 1.1727 in 443 s 182 | 1000: train loss 1.0199, valid loss 1.1898 in 492 s 183 | 1100: train loss 0.9755, valid loss 1.1397 in 541 s 184 | 1200: train loss 0.9711, valid loss 1.1185 in 592 s 185 | 1300: train loss 0.9422, valid loss 1.1158 in 657 s 186 | 1400: train loss 0.9245, valid loss 1.0925 in 724 s 187 | 1500: train loss 0.9045, valid loss 1.0827 in 774 s 188 | 1600: train loss 0.8787, valid loss 1.0723 in 825 s 189 | 1700: train loss 0.8769, valid loss 1.0493 in 877 s 190 | 1800: train loss 0.8309, valid loss 1.0453 in 926 s 191 | 1900: train loss 0.8317, valid loss 1.0446 in 975 s 192 | 2000: train loss 0.8111, valid loss 1.0243 in 1028 s 193 | 2100: train loss 0.7998, valid loss 1.0261 in 1080 s 194 | 2200: train loss 0.7740, valid loss 1.0078 in 1133 s 195 | 2300: train loss 0.7568, valid loss 1.0014 in 1184 s 196 | 2400: train loss 0.7449, valid loss 0.9908 in 1233 s 197 | 198 | The same test on opencorpora: the "harder" corpus does not make much difference:: 199 | 200 | $ ./autoencoder.py --state-size=256 
../corpora/opencorpora.txt --max-seq-length=40 --random-limit --save models/oc_256_1_40_rand 201 | Namespace(batch_size=64, evaluate=False, filename='../corpora/opencorpora.txt', load=None, max_gradient_norm=5.0, max_seq_length=40, min_char_count=100, n_layers=1, n_steps=100000, predict=False, random_limit=True, report_step=100, reverse=False, save='models/oc_256_1_40_rand', state_size=256, words=False) 202 | 158 chars, train set size 92112, valid set size 2000 203 | 1: train loss 4.9348, valid loss 4.8035 in 7 s 204 | 100: train loss 1.9603, valid loss 1.5191 in 84 s 205 | 200: train loss 1.4415, valid loss 1.3879 in 162 s 206 | 300: train loss 1.3460, valid loss 1.2915 in 241 s 207 | 400: train loss 1.2405, valid loss 1.2089 in 319 s 208 | 500: train loss 1.1727, valid loss 1.1614 in 426 s 209 | 600: train loss 1.1233, valid loss 1.0886 in 501 s 210 | 700: train loss 1.0895, valid loss 1.0604 in 575 s 211 | 800: train loss 1.0549, valid loss 1.0389 in 648 s 212 | 900: train loss 1.0435, valid loss 1.0111 in 721 s 213 | 1000: train loss 1.0261, valid loss 1.0093 in 794 s 214 | 1100: train loss 0.9910, valid loss 0.9871 in 870 s 215 | 216 | 2 layers:: 217 | 218 | $ ./autoencoder.py --state-size=256 --n-layers=2 ../corpora/opencorpora.txt --max-seq-length=60 --random-limit --save models/oc_256_2_60_rand 219 | 1: train loss 4.9641, valid loss 4.6453 in 21 s 220 | 100: train loss 1.7707, valid loss 1.4325 in 270 s 221 | 200: train loss 1.3694, valid loss 1.3340 in 523 s 222 | ... 223 | 3600: train loss 0.5975, valid loss 0.5989 in 8845 s 224 | 225 | 2 larger layers on AWS GPU:: 226 | 227 | $ ./autoencoder.py --state-size=512 --n-layers=2 opencorpora.txt --max-seq-length=60 --random-limit --save models/oc_512_2_60_rand 228 | 1: train loss 4.9794, valid loss 4.5997 in 15 s 229 | 100: train loss 1.6813, valid loss 1.3991 in 110 s 230 | 200: train loss 1.3115, valid loss 1.2470 in 204 s 231 | ... 232 | 5300: train loss 0.2590, valid loss 0.2558 in 4908 s 233 | 5400: train loss 0.2307, valid loss 0.2460 in 5001 s 234 | ... 235 | 8200: train loss 0.1362, valid loss 0.1441 in 2032 s 236 | 237 | **TODO - go to convergence** 238 | 239 | 2 larger layers on AWS GPU:: 240 | 241 | $ ./autoencoder.py --state-size=1024 --n-layers=2 opencorpora.txt --max-seq-length=60 --random-limit --save models/oc_1024_2_60_ 242 | rand --report-step 200 243 | Namespace(batch_size=64, evaluate=False, filename='opencorpora.txt', load=None, max_gradient_norm=5.0, max_seq_length=60, min_char_count=100, n_layers=2, n_steps=100000, predict= 244 | False, random_limit=True, report_step=200, reverse=False, save='models/oc_1024_2_60_rand', state_size=1024, words=False) 245 | 158 chars, train set size 92112, valid set size 2000 246 | 1: train loss 5.0210, valid loss 4.5363 in 9 s 247 | 200: train loss 1.4709, valid loss 1.2170 in 211 s 248 | 400: train loss 1.1323, valid loss 1.0838 in 413 s 249 | ... 
250 | 32000: train loss 0.0128, valid loss 0.0192 in 32624 s 251 | 252 | GRU & LSTM-basic 253 | ---------------- 254 | 255 | f2921e50065173636d5e64fa52866be5e43a114d, GRU:: 256 | 257 | $ ./autoencoder.py --state-size=256 --n-layers=2 ../corpora/opencorpora.txt --max-seq-length=60 --random-limit --cell=gru --save models/oc_256_2_60_rand_gru 258 | Namespace(batch_size=64, cell='gru', evaluate=False, filename='../corpora/opencorpora.txt', load=None, max_gradient_norm=5.0, max_seq_length=60, min_char_count=100, n_layers=2, n_steps=100000, predict=False, random_limit=True, report_step=100, reverse=False, save='models/oc_256_2_60_rand_gru', state_size=256, words=False) 259 | 158 chars, train set size 92112, valid set size 2000 260 | 1: train loss 4.9506, valid loss 4.4924 in 23 s 261 | 100: train loss 1.4869, valid loss 1.2472 in 319 s 262 | 200: train loss 1.1386, valid loss 1.1019 in 615 s 263 | 300: train loss 1.0402, valid loss 1.0187 in 897 s 264 | ... 265 | 2000: train loss 0.4305, valid loss 0.4274 in 5600 s 266 | 267 | LSTM-basic is slower, but looks better than default LSTM:: 268 | 269 | $ ./autoencoder.py --state-size=256 --n-layers=2 ../corpora/opencorpora.txt --max-seq-length=60 --random-limit --cell=lstm-basic --save models/oc_256_2_60_rand_lstmbasic 270 | Namespace(batch_size=64, cell='lstm-basic', evaluate=False, filename='../corpora/opencorpora.txt', load=None, max_gradient_norm=5.0, max_seq_length=60, min_char_count=100, n_layers=2, n_steps=100000, predict=False, random_limit=True, report_step=100, reverse=False, save='models/oc_256_2_60_rand_lstmbasic', state_size=256, words=False) 271 | 158 chars, train set size 92112, valid set size 2000 272 | 1: train loss 4.9580, valid loss 4.5178 in 37 s 273 | 100: train loss 1.5467, valid loss 1.2869 in 384 s 274 | 200: train loss 1.1840, valid loss 1.1549 in 719 s 275 | 300: train loss 1.1181, valid loss 1.1116 in 1102 s 276 | 277 | -------------------------------------------------------------------------------- /autoencoder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | import re 5 | import codecs 6 | import argparse 7 | import random 8 | import time 9 | from itertools import chain, repeat, izip 10 | from collections import Counter 11 | import cPickle as pickle 12 | 13 | import numpy as np 14 | import tensorflow as tf 15 | import tensorflow.contrib.rnn as rnn_cell 16 | import tensorflow.contrib.legacy_seq2seq as seq2seq 17 | 18 | from utils import chunks, split 19 | 20 | 21 | PAD_ID, GO_ID, UNK_D = _RESERVED = range(3) 22 | 23 | 24 | def main(): 25 | parser = argparse.ArgumentParser() 26 | arg = parser.add_argument 27 | arg('filename') 28 | arg('--state-size', type=int, default=100) 29 | arg('--batch-size', type=int, default=64) 30 | arg('--max-seq-length', type=int, default=60) 31 | arg('--n-steps', type=int, default=100000) 32 | arg('--report-step', type=int, default=100) 33 | arg('--min-char-count', type=int, default=100) 34 | arg('--n-layers', type=int, default=1) 35 | arg('--cell', default='lstm', help='Cell type: lstm, lstm-basic, gru') 36 | arg('--max-gradient-norm', type=float, default=5.0) 37 | arg('--reverse', action='store_true', help='reverse input') 38 | arg('--words', action='store_true', help='encode only single words') 39 | arg('--random-limit', action='store_true', 40 | help='randomly reduce input length') 41 | arg('--load', help='restore model from given file') 42 | arg('--save', help='save model to given file (plus step 
number)') 43 | arg('--predict', action='store_true') 44 | arg('--evaluate', action='store_true') 45 | args = parser.parse_args() 46 | print args 47 | seed = 1 48 | random.seed(seed) 49 | np.random.seed(seed) 50 | tf.set_random_seed(seed) 51 | inputs, char_to_id = _read_inputs(args) 52 | random.shuffle(inputs) 53 | train_inputs, valid_inputs = split(inputs, 0.9, max_valid=2000) 54 | input_size = len(char_to_id) + len(_RESERVED) 55 | print '{} chars, train set size {}, valid set size {}'.format( 56 | input_size, len(train_inputs), len(valid_inputs)) 57 | 58 | model = Model(input_size, args) 59 | saver = tf.train.Saver(tf.global_variables()) 60 | 61 | with tf.Session() as sess: 62 | if args.load: 63 | saver.restore(sess, args.load) 64 | else: 65 | sess.run(tf.global_variables_initializer()) 66 | if args.predict: 67 | id_to_char = {id_: ch for ch, id_ in char_to_id.iteritems()} 68 | for id_ in _RESERVED: 69 | id_to_char[id_] = '' 70 | for batch in chunks(inputs, args.batch_size): 71 | feed_dict = model.prepare_batch(batch) 72 | outputs = sess.run(model.decoder_outputs, feed_dict) 73 | input_lines = [[id_to_char[id_] for id_ in line] 74 | for line in batch] 75 | output_lines = [[] for _ in xrange(args.batch_size)] 76 | for char_block in outputs: 77 | for i, id_ in enumerate(np.argmax(char_block, axis=1)): 78 | output_lines[i].append(id_to_char[id_]) 79 | for inp, out in zip(input_lines, output_lines): 80 | print 81 | print ''.join(inp) 82 | print ''.join(out) 83 | import pdb; pdb.set_trace() 84 | else: 85 | model.train(sess, saver, train_inputs, valid_inputs) 86 | 87 | 88 | class Model(object): 89 | def __init__(self, input_size, args): 90 | self.input_size = input_size 91 | self.args = args 92 | self.batch_size = args.batch_size 93 | 94 | if args.cell == 'lstm': 95 | cell = rnn_cell.LSTMCell( 96 | args.state_size, input_size, num_proj=input_size) 97 | else: 98 | if args.cell == 'gru': 99 | cell_class = rnn_cell.GRUCell 100 | elif args.cell == 'lstm-basic': 101 | cell_class = rnn_cell.BasicLSTMCell 102 | basic_cell = cell_class(args.state_size) 103 | # TODO - do bulk input/output projections 104 | cell = rnn_cell.InputProjectionWrapper( 105 | rnn_cell.OutputProjectionWrapper(basic_cell, input_size), 106 | input_size) 107 | if args.n_layers > 1: 108 | cell = rnn_cell.MultiRNNCell([cell] * args.n_layers) 109 | 110 | self.encoder_inputs, self.decoder_inputs = [[ 111 | tf.placeholder(tf.float32, shape=[None, input_size], 112 | name='{}{}'.format(name, i)) 113 | for i in xrange(length)] for name, length in [ 114 | ('encoder', self.args.max_seq_length), 115 | ('decoder', self.args.max_seq_length)]] 116 | # TODO - maybe also use during training, 117 | # to avoid building one-hot representation (just an optimization). 118 | # Another (maybe better) way to do this is described here 119 | # https://www.tensorflow.org/versions/master/tutorials/mnist/tf/index.html#loss 120 | embeddings = tf.constant(np.eye(input_size), dtype=tf.float32) 121 | loop_function = None 122 | if args.predict: 123 | def loop_function(prev, _): 124 | prev_symbol = tf.stop_gradient(tf.argmax(prev, 1)) 125 | return tf.nn.embedding_lookup(embeddings, prev_symbol) 126 | self.decoder_outputs, _ = seq2seq.tied_rnn_seq2seq( 127 | self.encoder_inputs, self.decoder_inputs, cell, 128 | loop_function=loop_function) 129 | # TODO - add weights 130 | targets = self.decoder_inputs[1:] 131 | # FIXME - this scaling by max_seq_length does not take 132 | # padding into account (see also weights) 133 | self.decoder_loss = (1.
/ self.args.max_seq_length) * \ 134 | tf.reduce_mean(tf.add_n([ 135 | tf.nn.softmax_cross_entropy_with_logits( 136 | logits=logits, labels=target, name='seq_loss_{}'.format(i)) 137 | for i, (logits, target) in enumerate( 138 | zip(self.decoder_outputs, targets))])) 139 | tf.summary.scalar('train loss', self.decoder_loss) 140 | self.valid_loss = 1.0 * self.decoder_loss # FIXME 141 | tf.summary.scalar('valid loss', self.valid_loss) 142 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 143 | optimizer = tf.train.AdamOptimizer() 144 | params = tf.trainable_variables() 145 | gradients = tf.gradients(self.decoder_loss, params) 146 | clipped_gradients, _norm = tf.clip_by_global_norm( 147 | gradients, self.args.max_gradient_norm) 148 | # TODO - monitor norm 149 | self.train_op = optimizer.apply_gradients( 150 | zip(clipped_gradients, params), global_step=self.global_step) 151 | self.summary_op = tf.summary.merge_all() 152 | 153 | def prepare_batch(self, inputs): 154 | ''' Prepare batch for training: return batch_inputs and batch_outputs, 155 | where each is a list of float32 arrays of shape (batch_size, input_size), 156 | adding padding and "GO" symbol. 157 | ''' 158 | batch_size = len(inputs) 159 | batch_inputs, batch_outputs = [ 160 | [np.zeros([batch_size, self.input_size], dtype=np.float32) 161 | for _ in xrange(self.args.max_seq_length)] for _ in xrange(2)] 162 | for n_batch, input_ in enumerate(inputs): 163 | n_pad = (self.args.max_seq_length - len(input_)) 164 | padded_input = [PAD_ID] * n_pad + list( 165 | reversed(input_) if self.args.reverse else input_) 166 | for values, seq in [ 167 | (batch_inputs, [padded_input]), 168 | (batch_outputs, [[GO_ID], input_ , repeat(PAD_ID, n_pad - 1)]) 169 | ]: 170 | for i, id_ in enumerate(chain(*seq)): 171 | values[i][n_batch][id_] = 1.0 172 | feed_dict = { 173 | var.name: val for var, val in 174 | chain(izip(self.encoder_inputs, batch_inputs), 175 | izip(self.decoder_inputs, batch_outputs))} 176 | return feed_dict 177 | 178 | def train(self, sess, saver, train_inputs, valid_inputs): 179 | losses = [] 180 | summary_writer = None 181 | if self.args.save: 182 | summary_writer = tf.summary.FileWriter( 183 | self.args.save, flush_secs=10) 184 | t0 = time.time() 185 | for i in xrange(self.args.n_steps): 186 | loss = self._train_step(sess, train_inputs, summary_writer) 187 | losses.append(loss) 188 | step = self.global_step.eval() 189 | if i == 0 or step % self.args.report_step == 0: 190 | print '{:>5}: train loss {:.4f}, valid loss {:.4f} in {} s'\ 191 | .format( 192 | step, 193 | np.mean(losses), 194 | self._valid_loss(sess, valid_inputs, summary_writer), 195 | int(time.time() - t0)) 196 | losses = [] 197 | if self.args.save: 198 | saver.save(sess, self.args.save, global_step=step) 199 | if self.args.evaluate: 200 | break 201 | 202 | def _train_step(self, sess, inputs, summary_writer): 203 | b_inputs = [random.choice(inputs) for _ in xrange(self.args.batch_size)] 204 | feed_dict = self.prepare_batch(b_inputs) 205 | ops = [self.decoder_loss, self.summary_op] 206 | if not self.args.evaluate: 207 | ops.append(self.train_op) 208 | loss, summary_str = sess.run(ops, feed_dict)[:2] 209 | step = self.global_step.eval() 210 | if summary_writer and step % 10 == 0: 211 | summary_writer.add_summary(summary_str, step) 212 | return loss 213 | 214 | def _valid_loss(self, sess, valid_inputs, summary_writer): 215 | loss, summary_str = sess.run( 216 | [self.valid_loss, self.summary_op], 217 | feed_dict=self.prepare_batch(valid_inputs)) 218 | if 
summary_writer: 219 | summary_writer.add_summary(summary_str, self.global_step.eval()) 220 | return loss 221 | 222 | 223 | def _read_inputs(args): 224 | ''' Return a list of inputs (int lists), and an encoding dict. 225 | ''' 226 | word_re = re.compile(r'\w+', re.U) 227 | with codecs.open(args.filename, 'rb', 'utf-8') as textfile: 228 | if args.load: 229 | with open(args.load.rsplit('-', 1)[0] + '.mapping', 'rb') as f: 230 | char_to_id = pickle.load(f) 231 | else: 232 | char_counts = Counter(ch for line in textfile for ch in line) 233 | char_to_id = { 234 | ch: id_ for id_, ch in enumerate( 235 | (ch for ch, count in char_counts.iteritems() 236 | if count >= args.min_char_count), 237 | len(_RESERVED))} 238 | textfile.seek(0) 239 | if args.save: 240 | with open(args.save + '.mapping', 'wb') as f: 241 | pickle.dump(char_to_id, f, protocol=-1) 242 | def inputs_iter(): 243 | for line in textfile: 244 | line = line.strip() 245 | if args.words: 246 | for word in word_re.findall(line): 247 | yield word + ' ' 248 | else: 249 | yield line 250 | inputs = [] 251 | for string in inputs_iter(): 252 | limit = args.max_seq_length - 1 # one more for "GO" 253 | if args.random_limit: 254 | limit = random.randint(2, limit) 255 | if len(string) > limit: 256 | string = string[:limit - 1].rsplit(None, 1)[0] + ' '  # truncate, dropping the last (possibly cut) word 257 | if len(string) <= limit: 258 | inputs.append([char_to_id.get(ch, UNK_D) for ch in string]) 259 | return inputs, char_to_id 260 | 261 | 262 | if __name__ == '__main__': 263 | main() 264 | -------------------------------------------------------------------------------- /prepare_easy_corpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | import re 5 | import codecs 6 | import argparse 7 | from collections import Counter 8 | from operator import itemgetter 9 | 10 | import numpy as np 11 | 12 | 13 | WORDS_RE = re.compile(r'\w+', re.U) 14 | 15 | 16 | def main(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('filename') 19 | parser.add_argument('--n', type=int, default=1000) 20 | parser.add_argument('--fn', default='min', help='min or mean') 21 | args = parser.parse_args() 22 | with codecs.open(args.filename, 'rb', 'utf-8') as f: 23 | freqs = Counter(word for line in f 24 | for word in WORDS_RE.findall(line)) 25 | f.seek(0) 26 | fn = getattr(np, args.fn) 27 | weights = [(i, line_weight(line, freqs, fn)) 28 | for i, line in enumerate(f)] 29 | weights.sort(key=itemgetter(1), reverse=True) 30 | f.seek(0) 31 | easy = set(map(itemgetter(0), weights[:args.n])) 32 | for i, line in enumerate(f): 33 | if i in easy: 34 | print line.encode('utf-8'), 35 | 36 | 37 | def line_weight(line, freqs, fn): 38 | return fn([freqs[w] for w in WORDS_RE.findall(line)] or [0]) 39 | 40 | 41 | if __name__ == '__main__': 42 | main() 43 | 
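44 | 45 | # Example invocation (hypothetical paths; presumably how the easy-5k-mean.txt corpus used in RESULTS.rst was built): 46 | #     ./prepare_easy_corpus.py opencorpora.txt --n 5000 --fn mean > easy-5k-mean.txt 47 | # Each line is scored with np.min or np.mean of its word frequencies, and the --n highest-scoring ("easiest") lines are printed.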
-------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | import time 4 | import traceback 5 | from functools import wraps 6 | 7 | 8 | def debug_exec(*deco_args, **deco_kwargs): 9 | ''' Prints the execution time of the function to logger.debug. 10 | Additional options: 11 | profile = True - profiling with cProfile, 12 | stat_profile = True - profiling with statprof, 13 | traceback = True - prints a traceback before each call 14 | queries = True - prints the queries made while the function runs 15 | queries_limit (50 by default) - limit on the number of queries printed 16 | log_fn - logging function (logger.debug by default), 17 | ''' 18 | def deco(fn): 19 | @wraps(fn) 20 | def inner(*args, **kwargs): 21 | if deco_kwargs.get('traceback'): 22 | traceback.print_stack() 23 | print 'starting %s' % fn.__name__ 24 | start = time.time() 25 | stat_profile = deco_kwargs.get('stat_profile') 26 | if stat_profile: 27 | import statprof 28 | statprof.reset(frequency=1000) 29 | statprof.start() 30 | try: 31 | return fn(*args, **kwargs) 32 | finally: 33 | fn_name = fn.__name__ 34 | print 'finished %s in %.3f s' % (fn_name, time.time() - start) 35 | if stat_profile: 36 | statprof.stop() 37 | statprof.display() 38 | if deco_kwargs.get('profile'): 39 | import profilehooks 40 | inner = profilehooks.profile(immediate=True)(inner) 41 | return inner 42 | if deco_args: 43 | return deco(deco_args[0]) 44 | else: 45 | return deco 46 | 47 | 48 | def chunks(lst, n):  # yield successive chunks of n items from lst 49 | for i in xrange(0, len(lst), n): 50 | yield lst[i:i+n] 51 | 52 | 53 | def split(lst, proportion, max_valid=None):  # split into two parts, capping the second at max_valid 54 | split_at = int(len(lst) * proportion) 55 | if max_valid is not None: 56 | split_at = max(split_at, len(lst) - max_valid) 57 | return lst[:split_at], lst[split_at:] 58 | --------------------------------------------------------------------------------