├── neuralgpu
    ├── __init__.py
    ├── config.py
    ├── data_utils.py
    ├── curriculum.py
    ├── records.py
    ├── mytf.py
    ├── model.py
    ├── generators.py
    └── trainer.py
├── .gitignore
├── neural_gpu_trainer.py
├── examples
    └── loading_and_using_model.py
├── plots
    ├── plot_carry_thresholds.py
    ├── carries.py
    ├── paper_carries3.py
    ├── large_carries.py
    ├── construct_table.py
    └── get_pretty_score.py
└── README.md


/neuralgpu/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *~
3 | *.pdf
4 | *.aux
5 | *.log
6 | *.pickle
7 | *.csv
8 | 


--------------------------------------------------------------------------------
/neural_gpu_trainer.py:
--------------------------------------------------------------------------------
 1 | """Start and train the NeuralGPU.
 2 | 
 3 | See neuralgpu/trainer.py for flags and more information.
 4 | """
 5 | 
 6 | import tensorflow as tf
 7 | from neuralgpu import trainer
 8 | 
 9 | 
def main(_):
  """Entry point handed to tf.app.run(); the positional argument is ignored."""
  trainer.start_and_train()

# tf.app.run() dispatches to main() above (presumably after parsing
# command-line flags -- confirm against the TensorFlow version in use).
if __name__ == "__main__":
  tf.app.run()
15 | 


--------------------------------------------------------------------------------
/examples/loading_and_using_model.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import tensorflow as tf
 3 | 
 4 | import os
 5 | import sys
 6 | sys.path.append(os.path.dirname(os.path.dirname(__file__)))
 7 | from neuralgpu import trainer, generators
 8 | 
 9 | DIR = '/tmp/moo/cow3'
10 | 
11 | sess = tf.Session()
12 | model = trainer.load_model(sess, DIR)
13 | 
14 | example = generators.generators['baddet'].get_batch(8,32)
15 | 
16 | result = model.step(sess, example, False)
17 | print(result.to_string())
18 | 


--------------------------------------------------------------------------------
/plots/plot_carry_thresholds.py:
--------------------------------------------------------------------------------
 1 | import glob
 2 | import pylab
 3 | import numpy as np
 4 | from matplotlib import rc
 5 | 
# Global matplotlib styling shared by both figures below.
rc('font',  size='9')
rc('axes', labelsize='large')
rc('lines', linewidth=3)

#pylab.ion()

# One integer per run, read from each run directory's 'threshold2' file,
# then sorted ascending in place.
data = np.array([int(open(fname).read().strip()) for fname in glob.glob('cachedlogs/September-0*/*/threshold2')])
data.sort()

pylab.figure(figsize=(4,4))

# Figure 1: x is the fraction of runs whose threshold is at least the
# plotted value (1 - rank/len), drawn on log-log axes.
pylab.clf()
pylab.plot(1-np.arange(len(data)) * 1./ len(data), data, marker='o')
pylab.loglog()
pylab.xlabel('Fraction of training runs')
pylab.ylabel('Decimal addition carry length with 50% failure')
pylab.title('A small fraction of trials can carry\nover longer intervals (log log plot)')
pylab.tight_layout()
pylab.savefig('../neuralgpu_paper/carry_runs_loglog.pdf')

# Figure 2: the same thresholds in descending order on linear axes.
pylab.clf()
pylab.plot(data[::-1], marker='o')
pylab.xlabel('Run')
pylab.ylabel('# carries before failure')
pylab.title('5% of runs carry much better, but still not perfectly')
pylab.savefig('carry_runs.pdf')
32 | 
33 | 


--------------------------------------------------------------------------------
/neuralgpu/config.py:
--------------------------------------------------------------------------------
 1 | """Config object.
 2 | 
 3 | Ideally, all used FLAGS would be passed through here.  However, the
 4 | code is lazy in parts and uses FLAGS directly.
 5 | """
 6 | 
 7 | from . import data_utils
 8 | 
 9 | class NeuralConfig(object):
10 |   """Initial configuration settings for model"""
11 | 
12 |   config_keys = '''nmaps niclass noclass dropout rx_step max_grad_norm
13 |   cutoff nconvs kw kh height mode lr pull pull_incr
14 |   min_length batch_size task init_weight curriculum_bound layer_scale
15 |   '''.split()
16 | 
17 |   def __init__(self, FLAGS, **kws):
18 |     for key in self.config_keys:
19 |       val = kws.get(key, getattr(FLAGS, key, None))
20 |       setattr(self, key, val)
21 | 
22 |     min_length = 5
23 |     max_length = min(FLAGS.max_length, data_utils.bins[-1])
24 |     assert max_length + 1 > min_length
25 |     self.max_length = max_length
26 |     self.min_length = min_length
27 | 
28 |   def __str__(self):
29 |     msg1 = ("layers %d kw %d h %d kh %d relax %d batch %d task %s"
30 |             % (self.nconvs, self.kw, self.height, self.kh, self.rx_step,
31 |                self.batch_size, self.task))
32 |     msg2 = ("cut %.2f pull %.3f lr %.2f iw %.2f cr %.2f nm %d d%.4f gn %.2f %s" %
33 |             (self.cutoff, self.pull_incr, self.lr, self.init_weight,
34 |              self.curriculum_bound, self.nmaps, self.dropout, self.max_grad_norm, msg1))
35 |     return msg2
36 | 
37 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Code for the Neural GPU model originally described in
 2 | [Neural GPUs Learn Algorithms](http://arxiv.org/abs/1511.08228).
 3 | 
 4 | 
 5 | Running experiments
 6 | ===================
 7 | 
 8 | Running one instance
 9 | --------------------
10 | 
11 | The following would use 256 filters to train on binary multiplication,
12 | then 4-ary, then decimal:
13 | ```
14 | python neural_gpu_trainer.py --nmaps=256 --task=bmul,qmul,mul --progressive_curriculum=5
15 | ```
16 | 
17 | My typical invocation is something like
18 | 
19 | ```
20 |   CUDA_VISIBLE_DEVICES=0 python neural_gpu_trainer.py --random_seed=0 --max_steps=200000 --forward_max=201 --nmaps=256 --task=bmul,qmul,mul --time_till_eval=4 --progressive_curriculum=5 --train_dir=../logs/August-12-curriculum/forward_max=201-nmaps=256-task=bmul,qmul,mul-progressive_curriculum=5-random_seed=0
21 | ```
22 | 
23 | The tests on decimal carry were done using invocations like the following:
24 | ```
25 |   CUDA_VISIBLE_DEVICES=0 python neural_gpu_trainer.py --train_dir=../logs/run1 --random_seed=1 --max_steps=100000 --forward_max=201 --nmaps=128 --task=add --time_till_eval=4 --time_till_ckpt=1
26 | ```
27 | 
28 | You can find a list of options, and their default values, in `neuralgpu/trainer.py`.
29 | 
30 | Examining results
31 | =================
32 | 
33 | Loading and examining a model
34 | -----------------------------
35 | 
36 | `examples/loading_and_using_model.py` gives a simple example of loading a
37 | model and running it on one batch of inputs.
38 | 
39 | Plotting results
40 | ----------------
41 | 
42 | Something like `python plots/get_pretty_score.py cachedlogs/*/*task=bmul,qmul,mul-*` works.  There are a lot of options to make it prettier (renaming stuff, removing some runs, changing titles, reordering, etc.).  For example, one of my plots was made with
43 | 
44 | ```
45 | python get_pretty_score.py cachedlogs/A*/*256*[=,]mul-* --titles '256 filters|' --title 'Decimal multiplication is easier with curriculum' --task mul --remove_strings='|-progressive_curriculum=5' --exclude='layer|progressive' --order '4,2,1,3' --global-legend=1
46 | ```
47 | 
48 | Requirements
49 | ============
50 | 
51 | * TensorFlow (see tensorflow.org for how to install)
52 | * Matplotlib for Python (sudo apt-get install python-matplotlib)
53 | * joblib
54 | 
55 | Credits
56 | =======
57 | 
58 | Original code by Lukasz Kaiser (lukaszkaiser).  Modified by Eric Price
59 | (ecprice)
60 | 


--------------------------------------------------------------------------------
/plots/carries.py:
--------------------------------------------------------------------------------
 1 | """Class for constructing problem inputs featuring lots of carries."""
 2 | from __future__ import print_function
 3 | 
 4 | import tensorflow as tf, numpy as np
 5 | 
 6 | import operator
 7 | import pandas
 8 | import random
 9 | import time
10 | import glob
11 | import sys
12 | import os
13 | 
14 | sys.path.append(os.path.dirname(os.path.dirname(__file__)))
15 | from neuralgpu import generators
16 | 
17 | 
18 | 
def get_generator(base, sep, aligned=False, randloc=False):
    """Build a generator class for additions with a carry chain of chosen length.

    Args:
        base: numeric base, forwarded to the parent generator constructor.
        sep: separator argument, forwarded to the parent generator constructor.
        aligned: subclass generators.AlignedOpGenerator instead of
            generators.OpGenerator.
        randloc: default for whether the carry block is shifted to a random
            digit position instead of starting at the lowest digit.

    Returns:
        The CarryGenerator class (not an instance).
    """
    base_class = generators.AlignedOpGenerator if aligned else generators.OpGenerator

    class CarryGenerator(base_class):
        def __init__(self, carry, overflow, randloc=randloc, base=base, sep=sep, zero_pad=True):
            # zero_pad is accepted for interface compatibility but is unused.
            super(CarryGenerator, self).__init__(base, operator.add, sep)
            self.carry = carry
            self.overflow = overflow
            self.randloc = randloc

        def _rand_inputs(self, k):
            """Return two addends, in random order, for a problem of length k.

            Before shifting, the addends sum to base**carry when
            self.overflow is set and to base**carry - 1 otherwise.
            self.base is presumably set by the parent constructor -- confirm.
            """
            n1 = random.randint(1 if self.overflow else 0, self.base**self.carry-1)
            n2 = self.base**self.carry - n1 - (0 if self.overflow else 1)
            loc = random.randint(0, k - self.carry) if self.randloc else 0
            vals = [n1*self.base**loc, n2*self.base**loc]
            if random.random() > .5:
                return vals
            else:
                return vals[::-1]

        @classmethod
        def get_error_rate(cls, sess, model, carry_length, do_overflow, max_length, num):
            """Run one batch of `num` problems and return result.accuracy()[2].

            When max_length is None it defaults to 2*carry_length + 3.
            NOTE(review): find_threshold in plots/large_carries.py divides
            the returned value by the number of examples, so it looks like a
            count of wrong examples -- confirm against the result type.
            """
            if max_length is None:
                max_length = 2*carry_length + 3
            example = cls(carry_length, do_overflow).get_batch(max_length, num)
            result = model.step(sess, example, False)
            return result.accuracy()[2]

        @classmethod
        def get_rates(cls, sess, model, carries, max_length=201, numblocks=1, blocksize=32, verbose=True):
            """Sum get_error_rate over `numblocks` batches for each carry length.

            Returns:
                A DataFrame indexed by carry length with columns False and
                True (the overflow setting).
            """
            carries = list(carries)
            # Collect into plain lists and build the frame once at the end.
            # The previous `df[col][carry] = ans` was chained assignment,
            # which pandas does not guarantee to write back into `df`.
            rates = {False: [], True: []}
            for carry in carries:
                for overflow in (False, True):
                    total = 0
                    for _ in range(numblocks):
                        total += cls.get_error_rate(sess, model, carry, overflow, max_length, blocksize)
                    rates[overflow].append(total)
                if verbose:
                    print(carry, ':', rates[False][-1], rates[True][-1])
            # dtype=object matches the frame the previous code filled in.
            return pandas.DataFrame(rates, index=carries, columns=[False, True], dtype=object)

    return CarryGenerator
60 | 


--------------------------------------------------------------------------------
/plots/paper_carries3.py:
--------------------------------------------------------------------------------
 1 | import pandas
 2 | import pylab
 3 | from matplotlib import rc
 4 | import matplotlib.ticker as mtick
 5 | 
 6 | rc('font',  size='9')
 7 | rc('axes', labelsize='large')
 8 | rc('lines', linewidth=3)
 9 | 
10 | supsize=12
11 | titlesize=9
12 | legsize=8
13 | 
14 | if __name__ != '__main__':
15 |     pylab.ion()
16 | 
17 | vertical = True
18 | #vertical = False
19 | #scp 6.cirrascale.sci.openai.org:models/neural_gpu/*.csv .
20 | 
21 | 
def make_plot(csv_name, kws1=None):
    """Plot the mean of the 'True' and 'False' columns of a carry-error CSV.

    The first CSV column becomes the index, values are scaled by the largest
    value in the table, and only the first 60 rows are drawn on the current
    pylab axes.

    Args:
        csv_name: path of the CSV file to read.
        kws1: optional dict of extra keyword arguments for pylab.plot; these
            override the default marker settings.
    """
    data = pandas.read_csv(csv_name)
    data = data.set_index(data.columns[0])
    data.index.name = 'Carries'
    data = data * 1. / data.values.max()
    data = data[:60]

    plot_kws = dict(marker='o', ms=5)
    # None instead of a shared mutable {} default.
    plot_kws.update(kws1 or {})
    pylab.plot((data['True'] + data['False'])/2, **plot_kws)
38 | 
39 | 
# Two subplots, stacked (vertical) or side by side; `orientation` is the
# (rows, cols) prefix handed to pylab.subplot.
if vertical:
    pylab.figure(figsize=(4,4))
    orientation = (2,1)
else:
    pylab.figure(figsize=(6,3))
    orientation = (1,2)

# First panel, titled "Binary".
pylab.subplot(*(orientation+(1,)))
make_plot('csv/carry_errors_big.csv',
          dict(label='Train on random examples'),
         )
make_plot('csv/carry_errors_big.baddt.csv',
          dict(label='Train with some hard examples'),
         )
#pylab.xlabel("Number of carries $k$")
#pylab.legend(loc=0, prop={'size': legsize})
# Show y ticks as percentages (values are fractions in [0, 1]).
pylab.gca().yaxis.set_major_formatter(mtick.FuncFormatter(
    lambda x, pos: '% 2d%%' % (x*100)))
if not vertical:
    pylab.xlabel("Number of carries $k$")
pylab.ylabel("Error rate")
pylab.title("Binary", size=titlesize)
pylab.locator_params(axis='y',nbins=4)#

# Second panel, titled "Decimal".
pylab.subplot(*(orientation+(2,)))
make_plot('csv/carry_errors_add_large.csv',
          dict(label='Train on random examples'),
         )
make_plot('csv/carry_errors_addt_large.csv',
          dict(label='Train with some hard examples'),
         )
pylab.xlabel("Number of carries $k$")
#pylab.legend(loc=0, prop={'size': legsize})
if vertical:
    pylab.ylabel("Error rate")
pylab.gca().yaxis.set_major_formatter(mtick.FuncFormatter(
    lambda x, pos: '% 2d%%' % (x*100)))
pylab.title("Decimal", size=titlesize)
pylab.suptitle("Additions with long carries", size=supsize)
pylab.locator_params(axis='y',nbins=4)#
# One figure-level legend, built from the last panel's handles and labels.
pylab.gcf().legend(*pylab.gca().get_legend_handles_labels(), loc='lower center', ncol=1, labelspacing=0)
# The rect leaves room at the bottom for the legend and at the top for the
# suptitle.
if vertical:
    pylab.tight_layout(rect=[0, 0.14, 1, 0.95])
else:
    pylab.tight_layout(rect=[0, 0.18, 1, 0.93])
if vertical:
    pylab.savefig('../neuralgpu_paper/fig_carries_all_vertical.pdf')
else:
    pylab.savefig('../neuralgpu_paper/fig_carries_all_horizontal.pdf')
89 | 


--------------------------------------------------------------------------------
/neuralgpu/data_utils.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015 Google Inc. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | """Utilities for the NeuralGPU
 16 | 
 17 | This file has two main components:
 18 | 
 19 |  - generators is a dict mapping task names to DataGenerator instances, which construct individual problem input/output pairs
 20 |  - Utilities for converting those input/output pairs to/from string representations.
 21 | """
 22 | 
 23 | import math
 24 | import random
 25 | import sys
 26 | import time
 27 | import operator
 28 | import functools
 29 | import numpy as np
 30 | import tensorflow as tf
 31 | 
 32 | from tensorflow.python.platform import gfile
 33 | 
 34 | FLAGS = tf.app.flags.FLAGS
 35 | 
 36 | # Lengths of NeuralGPU instances.  Inputs will be padded to the next
 37 | # larger one.
 38 | bins = [8, 12, 16, 20, 24, 28, 32, 36, 40, 48, 64, 128]
 39 | forward_max = 128
 40 | log_filename = ""
 41 | 
 42 | 
 43 | DIGITS = range(1, 11)
 44 | NULL = 0
 45 | DUP = 22
 46 | SPACE = 23
 47 | START = 24
 48 | MINUS = 25
 49 | 
 50 | def pad(l):
 51 |   for b in bins + [forward_max]:
 52 |     if b >= l: return b
 53 |   raise IndexError("Length %s longer than max length %s" % (l, forward_max))
 54 | 
 55 | 
 56 | @np.vectorize
 57 | def char_to_symbol(i):
 58 |   """Covert ids to text."""
 59 |   i = int(i)
 60 |   if i == 0: return "_"
 61 |   if i in [11,12,13]: return "+"
 62 |   if i in [14,15,16]: return "*"
 63 |   if i in [17,18,19]: return "/"
 64 |   if i in [20,21,22]: return "-"
 65 |   if i in [START]: return '^'
 66 |   if i in [SPACE]: return '.'
 67 |   if i in [MINUS]: return '-'
 68 |   return str(i-1)
 69 | 
 70 | def join_array(array, rev=False):
 71 |   if len(array.shape) == 1:
 72 |     if rev:
 73 |       array = array[::-1]
 74 |     return ''.join(array).rstrip(' ')
 75 |   elif len(array.shape) == 2:
 76 |     if rev:
 77 |       array = array[:,::-1]
 78 |     return '\n'.join([''.join(a).rstrip(' ') for a in array])
 79 |   else:
 80 |     raise ValueError("Weird shape for joining: %s" % array.shape)
 81 | 
 82 | def to_string(array, rev=True):
 83 |   if isinstance(array, tuple):
 84 |     if len(array) == 3: # Batches
 85 |       inp, outp = array[:2]
 86 |       return '\n\n'.join(to_string((i,o), rev) for i,o in zip(inp, outp))
 87 |     inp, outp = [to_string(a, rev) for a in array[:2]]
 88 |     return '%s\n%s\n%s' % (inp, '-'*len(inp.split('\n')[0]), outp)
 89 |   return join_array(char_to_symbol(array), rev=rev)
 90 | 
 91 | @np.vectorize
 92 | def to_id(s):
 93 |   """Covert text to ids."""
 94 |   if s == "+": return 11
 95 |   if s == "*": return 14
 96 |   return int(s) + 1
 97 | 
 98 | class TeeErr(object):
 99 |     def __init__(self, f):
100 |         self.file = f
101 |         self.stderr = sys.stderr
102 |         sys.stderr = self
103 |     def write(self, data):
104 |         self.file.write(data)
105 |         self.file.flush()
106 |         self.stderr.write(data)
107 | 
108 | log_f = None
109 | 
def print_out(s, newline=True):
  """Print a message out and log it to file.

  The message always goes to stdout.  When the module-level log_filename is
  set, it is also appended to that file, which is opened once
  (line-buffered) and cached in log_f.

  Args:
    s: text to emit.
    newline: whether to append a trailing newline.
  """
  global log_f
  if log_filename:
    try:
      if log_f is None:
        log_f = open(log_filename, 'a', 1)
      log_f.write(s + ("\n" if newline else ""))
    except Exception:
      # Logging is best-effort, so report and carry on.  Catching Exception
      # rather than using a bare except lets KeyboardInterrupt and
      # SystemExit propagate.
      sys.stdout.write("Error appending to %s\n" % log_filename)
  sys.stdout.write(s + ("\n" if newline else ""))
  sys.stdout.flush()
123 | 
def safe_exp(x):
  """Return exp(x), capped at 10000; math.exp is skipped unless x < 100."""
  cap = 10000
  return min(math.exp(x), cap) if x < 100 else cap
129 | 
def load_class(name):
  """Resolve a dotted 'package.module.Attr' path to the named attribute.

  Args:
    name: fully qualified name; everything before the last dot is the module
      to import and the remainder is the attribute to fetch from it.

  Returns:
    The attribute (typically a class) found on the imported module.

  Raises:
    ValueError: if name contains no dot.
    ImportError: if the module cannot be imported.
    AttributeError: if the module lacks the attribute.
  """
  import importlib
  modulename, classname = name.rsplit('.', 1)
  # import_module returns the leaf module; __import__('a.b') returns the
  # top-level package 'a', which made dotted module paths resolve wrongly.
  module = importlib.import_module(modulename)
  return getattr(module, classname)
134 | 


--------------------------------------------------------------------------------
/plots/large_carries.py:
--------------------------------------------------------------------------------
  1 | """Compute statistics on model checkpoints and long carries in decimal addition.
  2 | 
  3 | 
  4 | It looks for checkpoints of the form "../logs/September-*/*/neural_gpu.ckpt-100000
  5 | 
  6 | When run with different arguments, computes different statistics which
  7 | are placed in different files in the checkpoint directory; if that
  8 | file already exists, it does not compute the file.  Hence you can
  9 | repeatedly run this program, as you create more checkpoints.
 10 | 
 11 | With no arguments, in 'carries.csv' it places the success rate for various lengths of carries.
 12 | With '-t', in 'threshold' it places the minimum carry length at which the estimated failure rate is >= 50%
 13 | """
 14 | 
 15 | from __future__ import print_function
 16 | import tensorflow as tf
 17 | import numpy as np
 18 | import operator
 19 | import pandas
 20 | import random
 21 | import time
 22 | import os
 23 | import glob
 24 | import sys
 25 | 
 26 | sys.path.append(os.path.dirname(os.path.dirname(__file__)))
 27 | from neuralgpu import trainer, data_utils
 28 | import carries
 29 | 
 30 | #data_utils.bins.pop()
 31 | #data_utils.bins.pop()
 32 | 
 33 | #del data_utils.bins[6]
 34 | #del data_utils.bins[4]
 35 | 
 36 | # Because of the bug with 'tf.Variable' rather than 'tf.get_variable' for 'layer_index' in neural_gpu.py,
 37 | # we need to have an equal number of bins to when it was trained.
 38 | data_utils.bins = [8] + [2**i + 5 for i in range(3, 13)]
 39 | 
 40 | aligned = False
 41 | base, sep = (10, 13)
 42 | randloc = False
 43 | CarryGenerator = carries.get_generator(base, sep, aligned, randloc)
 44 | 
 45 | dir = None
 46 | model = None
 47 | sess = None
 48 | 
 49 | def load_model(dir):
 50 |     global model, sess
 51 |     reconfig = {'mode': 1,           # No backprop
 52 |                 'forward_max': 401}  # Large enough to check 200-digit carries
 53 |     if model is None:
 54 |         sess=tf.Session()
 55 |         model = trainer.load_model(sess, dir, reconfig)
 56 |     else:
 57 |         model.saver.restore(sess, dir+'/neural_gpu.ckpt-100000')
 58 | 
 59 | 
 60 | def find_dirs(base_dir='../logs', check_file='carries.csv'):
 61 |     """Find all checkpoint directories that haven't been updated for check_file"""
 62 |     for one in glob.glob(base_dir+'/September-*'):
 63 |         for full_dir in glob.glob(one+'/*'):
 64 |             if os.path.exists(full_dir+'/neural_gpu.ckpt-100000'):
 65 |                 if not os.path.exists(os.path.join(full_dir, check_file)):
 66 |                     yield full_dir
 67 | 
 68 | 
 69 | locs = list(range(1, 30)) + list(range(30,100,5))
 70 | 
 71 | def get_data(dir, locs=locs):
 72 |     load_model(dir)
 73 |     results = CarryGenerator.get_rates(sess, model, locs, 201 if randloc else None, 1)
 74 |     return results
 75 | 
 76 | def run_dir(dir):
 77 |     try:
 78 |         results = get_data(dir)
 79 |     except tensorflow.python.framework.errors.FailedPreconditionError as e:
 80 |         print('ERROR ON DIR', dir, file=sys.stderr)
 81 |         print()
 82 |         print(e)
 83 |         print()
 84 |         return
 85 |     with open(dir+'/carries.csv', 'w') as f:
 86 |         f.write(results.to_csv())
 87 | 
 88 | def bsearch(is_leq, lo=1, hi=None):
 89 |     if hi is None:
 90 |         hi =  2*lo
 91 |         while not is_leq(hi):
 92 |             lo, hi = hi+1, 2*hi
 93 |     while lo < hi:
 94 |         mid = (lo+hi)//2
 95 |         if is_leq(mid):
 96 |             hi = mid
 97 |         else:
 98 |             lo = mid + 1
 99 |     return lo
100 | 
def find_threshold():
    """Binary-search the smallest carry length whose estimated error is >= 0.5.

    Uses the module-level `sess` and `model`.  Each estimate pools one batch
    with overflow and one without; estimates within 0.2 of 0.5 are averaged
    with two further estimates before the decision is made.
    """
    blocksize = 32

    def estimate_error(n):
        wrong = sum(CarryGenerator.get_error_rate(sess, model, n, overflow, None, blocksize)
                    for overflow in [False, True])
        return wrong * 1./(2*blocksize)

    def is_leq(n):
        result = estimate_error(n)
        print(n, result)
        # Be extra careful once we get close
        if abs(result - .5) < .2:
            extra = [estimate_error(n) for _ in range(2)]
            result = np.mean([result] + extra)
            print('Refined estimate:', result)
        return result >= .5

    return bsearch(is_leq)
115 | 
def main_results():
    """Write carries.csv for every checkpoint directory that does not have one."""
    for dir in find_dirs():
        print('Checking', dir)
        run_dir(dir)
120 | 
def main_thresholds(fname = 'threshold'):
    """Write the carry threshold into `fname` for each directory lacking that file.

    Args:
        fname: name of the per-directory output file; directories that
            already contain it are skipped by find_dirs.
    """
    for dir in find_dirs(check_file=fname):
        print('Checking', dir)
        load_model(dir)
        thresh = find_threshold()
        with open(os.path.join(dir, fname), 'w') as f:
            print(thresh, file=f)
128 | 
# '-t' anywhere on the command line selects threshold computation; the
# default is the per-carry-length table.
if __name__ == '__main__':
    if '-t' in sys.argv:
        main_thresholds()
    else:
        main_results()
134 | 


--------------------------------------------------------------------------------
/plots/construct_table.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python3
  2 | from __future__ import print_function
  3 | import fileinput
  4 | 
  5 | import sys
  6 | import numpy as np
  7 | import pandas as pd
  8 | import argparse
  9 | import glob
 10 | import scipy.signal
 11 | import os
 12 | import yaml
 13 | 
 14 | import collections
 15 | 
 16 | parser = argparse.ArgumentParser(description='Get scores')
 17 | 
 18 | parser.add_argument("--metric", type=str, default='score')
 19 | parser.add_argument("--dir", type=str, default='/home/ecprice/large/research/neural_gpu/neural_parsed_logs/newer')
 20 | parser.add_argument("--curr", type=bool, default=True)
 21 | parser.add_argument("tasks", type=str, nargs='*')
 22 | args =  parser.parse_args()
 23 | 
 24 | def groupby(lst, num):
 25 |     ans = []
 26 |     for i in range(0, len(lst), num):
 27 |         yield lst[i:i+num]
 28 | 
 29 | class Run(dict):
 30 |     @property
 31 |     def metadata(self):
 32 |         return self['metadata']
 33 | 
 34 |     def options(self):
 35 |         cmd = self.metadata['commandline']
 36 |         lst = []
 37 |         for arg in cmd[1:]:
 38 |             a, b = arg.split('=', 1)
 39 |             lst.append((a.lstrip('-'), b))
 40 |         args = collections.OrderedDict(lst)
 41 |         return args
 42 | 
 43 |     @property
 44 |     def tasks(self):
 45 |         return self.options()['task'].split(',')
 46 | 
 47 |     @property
 48 |     def version(self):
 49 |         d = self.options()
 50 |         for k in 'train_dir task forward_max random_seed max_steps'.split():
 51 |             del d[k]
 52 |         mapping = {'progressive_curriculum': 'curr'}
 53 |         for k, v in mapping.items():
 54 |             d[v] = d[k]
 55 |             del d[k]
 56 |         return '-'.join('%s=%s' % (a, b) for (a, b) in d.items())
 57 | 
 58 |     def get_value(self, metric):
 59 |         if '.' in metric:
 60 |             metric, key = metric.split('.')
 61 |         else:
 62 |             key = 'fraction'
 63 |         data = self[metric][self.task]
 64 |         res = data[key]
 65 |         count = len(data['last'])
 66 |         if isinstance(res, list):
 67 |             res = int(np.median([x or 200000 for x in data[key]]) / 100)
 68 |             if res == 2000:
 69 |                 res = np.inf
 70 |         return (res, count)
 71 | 
 72 | def value_to_str(val):
 73 |     if val == np.inf:
 74 |         res = '$\\infty$'
 75 |     elif val is None:
 76 |         res = '-'
 77 |     elif isinstance(val, float):
 78 |         res = str(int(val*100)) + r'\%'
 79 |     else:
 80 |         res = str(val)
 81 |     return res
 82 | 
 83 | def pair_to_str(pair):
 84 |     if pair is None:
 85 |         return '-'
 86 |     else:
 87 |         return '%s (%s)' % (value_to_str(pair[0]), pair[1])
 88 | 
 89 | def load_all_data(dirname):
 90 |     files = glob.glob(os.path.join(dirname, '*'))
 91 |     results = []
 92 |     for fname in files:
 93 |         with open(fname) as f:
 94 |             results.append(Run(yaml.load(f)))
 95 |     return results
 96 | 
# Load every parsed run up front; the table code below filters by task.
all_runs = load_all_data(args.dir)

# Without task names on the command line, list the distinct tasks found in
# the loaded runs and exit.
if not args.tasks:
    s = set([run.task for run in all_runs])
    print('Need task name.  Options:', ' '.join(s))
    sys.exit()
103 | 
def build_table(all_runs, tasks):
    """Group runs as {version: {task: run}}, keeping only tasks in `tasks`.

    Asserts that no (version, task) combination occurs twice.
    """
    rows = {}
    selected = (run for run in all_runs if run.task in tasks)
    for run in selected:
        per_task = rows.setdefault(run.version, {})
        assert run.task not in per_task
        per_task[run.task] = run
    return rows
112 | 
def texify(s):
    """Wrap s in \\texttt{...}, escaping underscores for LaTeX."""
    escaped = s.replace('_', r'\_')
    return r'\texttt{%s}' % escaped
115 | 
def table_to_str(rows, tasks, metric):
    """Render one LaTeX tabular with a row per version and a column per task.

    Args:
        rows: {version: {task: run}} as produced by build_table.
        tasks: task names, in column order.
        metric: metric name passed to each run's get_value.

    Returns:
        The tabular environment as a string.  A missing run is shown as
        '-', and so is the mean of a row with no runs for these tasks.
    """
    ans = []
    ans.append(' & '.join(['Name', 'Mean'] + list(tasks)))
    for version, runs in sorted(rows.items()):
        values = [runs[t].get_value(metric) if t in runs else None for t in tasks]
        present = [v[0] for v in values if v is not None]
        # A version may have no runs in this column group; np.mean([]) is
        # NaN, which value_to_str cannot convert to a percentage.
        mean = np.mean(present) if present else None
        # Drop the first option's name from the label; [-1] rather than [1]
        # so that a version string without '=' is kept as is.
        label = version.split('=', 1)[-1]
        row_strs = ([texify(label)] +
                    [value_to_str(mean)] +
                    [pair_to_str(value) for value in values])
        ans.append(' & '.join(row_strs))
    interior =  '\\\\\n'.join(ans)
    table = r'''\begin{tabular}{lc%s}
%s
\end{tabular}''' % ('c'*len(tasks), interior)
    return table
130 | 
def split_table_to_str(table, tasks, metric, maxcol):
    """Render `table` as several tabulars of at most maxcol task columns each."""
    pieces = [table_to_str(table, group, metric)
              for group in groupby(tasks, maxcol)]
    return '\n\n\\noindent'.join(pieces)
136 | 
def get_document(rows, tasks, metrics, maxcol = 7):
    """Build a complete LaTeX document with one section of tables per metric.

    `rows` is the list of runs handed to build_table; sections are
    separated by page breaks.
    """
    table = build_table(rows, tasks)
    sections = [
        '\\section{%s}\n%s' % (metric, split_table_to_str(table, tasks, metric, maxcol))
        for metric in metrics
    ]
    body = '\n\\newpage\n'.join(sections)
    return (r'''
\documentclass[11pt,letterpaper]{article}
\usepackage[margin=1in]{geometry}
\begin{document}
%s
\end{document}
    ''' % body)
150 | 
151 | print(get_document(all_runs, args.tasks, args.metric.split(','), 7))
152 | 


--------------------------------------------------------------------------------
/neuralgpu/curriculum.py:
--------------------------------------------------------------------------------
  1 | """Curriculum and its subclasses decide when to choose which task for training."""
  2 | 
  3 | from __future__ import print_function
  4 | 
  5 | import numpy as np
  6 | 
  7 | class Curriculum(object):
  8 |   def __init__(self, generators, model_config):
  9 |     self.generators = generators
 10 |     for i, g in enumerate(generators):
 11 |       g.taskid = i
 12 | 
 13 |     self.min_length = model_config.min_length
 14 |     self.max_length = model_config.max_length
 15 |     self.model_config = model_config
 16 |     self.max_cur_lengths = {g.taskid: min(self.min_length+3, self.max_length)
 17 |                             for g in generators}
 18 | 
 19 |   def draw_length(self, cur_length, generator):
 20 |     l = None
 21 |     while l is None:
 22 |       # Select the length for curriculum learning.
 23 |       l = np.random.randint(self.min_length, cur_length + 1)
 24 |       if np.random.randint(100) < 60: # Prefer longer stuff 60% of time.
 25 |         l = max(l, np.random.randint(self.min_length, cur_length + 1))
 26 |       # Mixed curriculum learning: in 25% of cases go to an even larger length.
 27 |       if np.random.randint(100) < 25:
 28 |         l = max(l, np.random.randint(self.min_length, self.max_length + 1))
 29 | 
 30 |       if not generator.is_valid_length(l):
 31 |         l = None
 32 | 
 33 |     within_bounds = (l <= cur_length)
 34 |     return l, within_bounds
 35 | 
 36 |   def get_generator_for_task(self, task):
 37 |     return [g for g in self.generators if g.name == task][0]
 38 | 
 39 |   def test_examples(self, batch_size, task):
 40 |     generator = self.get_generator_for_task(task)
 41 |     for l in np.arange(self.min_length, self.max_length + 1):
 42 |       if generator.is_valid_length(l):
 43 |         yield (generator.get_batch(l, batch_size), l)
 44 | 
 45 |   def draw_example(self, batch_size, l=None, task=None):
 46 |     """Draw a random example"""
 47 |     generator = self.draw_generator(task)
 48 |     if l is None:
 49 |       cur_length = self.get_cur_length(generator)
 50 |       l, within_bounds = self.draw_length(cur_length, generator)
 51 |     else:
 52 |       within_bounds = True
 53 |     return (generator.get_batch(l, batch_size), within_bounds)
 54 | 
 55 |   def tasks(self):
 56 |     """List of task names"""
 57 |     return [g.name for g in self.generators]
 58 | 
 59 |   def consider_extending(self, results):
 60 |     """Interpret the results"""
 61 |     pass
 62 | 
 63 |   def draw_generator(self, task=None):
 64 |     options = (self.generators if task is None else
 65 |                [g for g in self.generators if g.name == task])
 66 |     return np.random.choice(options)
 67 | 
 68 |   def get_cur_length(self, generator):
 69 |     return self.max_cur_lengths[generator.taskid]
 70 | 
 71 |   def consider_extending(self, record):
 72 |     ans = False
 73 |     for t in record.record_for_task:
 74 |       ans = max(ans, self.consider_extending_for_task(record.record_for_task[t], t))
 75 |     return ans
 76 | 
 77 |   def consider_extending_for_task(self, record, taskid):
 78 |     if record.avg_seq_err > self.model_config.curriculum_bound:
 79 |       return 0
 80 |     if self.max_cur_lengths[taskid] < self.max_length:
 81 |       self.max_cur_lengths[taskid] += 1
 82 |       while not self.generators[0].is_valid_length(self.max_cur_lengths[taskid]) and self.max_cur_lengths[taskid] < self.max_length:
 83 |         self.max_cur_lengths[taskid] += 1
 84 |       return 2
 85 |     return 1
 86 | 
 87 |   @property
 88 |   def length_str(self):
 89 |     return '/'.join(str(v) for k, v in sorted(self.max_cur_lengths.items()))
 90 | 
 91 | class GeneralizeCurriculum(Curriculum):
 92 | 
 93 |   def draw_generator(self, task=None):
 94 |     options = (self.generators[:1] if task is None else
 95 |                [g for g in self.generators if g.name == task])
 96 |     return options[0]
 97 | 
 98 |   @property
 99 |   def length_str(self):
100 |     return str(self.max_cur_lengths[self.generators[0].taskid])
101 | 
class BetterCurriculum(Curriculum):
  """Curriculum that focuses on the first task not yet at max_length.

  Class attributes are the defaults; `kind` passed to __init__ overrides some:
    rand_prob: probability of not simply taking the first unsolved task.
    only_later: if set, the random alternative is drawn from unsolved tasks
      only instead of from all generators.
    decrease_threshold: sequence error rate above which a task that sits at
      max_length is moved back by one.
    last_if_solved: if set, always use the last generator once every task
      has reached max_length.
  """
  rand_prob = 0.2
  only_later = False
  decrease_threshold = 1
  last_if_solved = False

  def __init__(self, generators, model_config, kind):
    super(BetterCurriculum, self).__init__(generators, model_config)
    # Attribute overrides per curriculum kind; any other kind keeps defaults.
    overrides = {
        2: {'decrease_threshold': 0.01},
        3: {'rand_prob': 0},
        4: {'only_later': True},
        5: {'only_later': True, 'last_if_solved': True},
    }
    for attr, value in overrides.get(kind, {}).items():
      setattr(self, attr, value)

  def draw_generator(self, task=None):
    """Choose the generator to train on next (see the class docstring)."""
    if task is not None:
      named = [gen for gen in self.generators if gen.name == task]
      return named[0]
    unsolved = [gen for gen in self.generators
                if self.max_cur_lengths[gen.taskid] < self.max_length]
    if unsolved:
      if np.random.random() > self.rand_prob:
        return unsolved[0]
      pool = unsolved if self.only_later else self.generators
      return np.random.choice(pool)
    # Every task has reached max_length.
    if self.last_if_solved:
      return self.generators[-1]
    return np.random.choice(self.generators)

  def consider_extending_for_task(self, record, taskid):
    """Like the base version, but can also step a finished task back by one.

    Returns 0 whenever the task was shortened, the error is above
    curriculum_bound, or the task is at max_length afterwards; otherwise
    returns the base class result.
    """
    at_max = self.max_cur_lengths[taskid] == self.max_length
    if at_max and record.avg_seq_err > self.decrease_threshold:
      self.max_cur_lengths[taskid] -= 1
      return 0
    if record.avg_seq_err > self.model_config.curriculum_bound:
      return 0
    val = super(BetterCurriculum, self).consider_extending_for_task(record, taskid)
    # Don't stop us from decreasing learning rate here
    return 0 if self.max_cur_lengths[taskid] == self.max_length else val
148 | 


--------------------------------------------------------------------------------
/neuralgpu/records.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | NeuralGPUResult records what happened during one run of the NeuralGPU
  3 | 
  4 | ResultsRecord keeps track of the results during one stage of training.
  5 | '''
  6 | 
  7 | from __future__ import print_function
  8 | 
  9 | import numpy as np
 10 | 
 11 | from . import data_utils
 12 | 
 13 | class NeuralGPUResult(object):
 14 |   """Recover of result of a single batch, which is always on one task."""
 15 |   grad_norm = None
 16 |   back_update = None
 17 |   loss = None
 18 |   output = None
 19 |   layers = None
 20 |   attention = None
 21 | 
 22 |   def __init__(self, vals, inp, target, taskid):
 23 |     self.feed_out = vals
 24 |     self.__dict__.update(vals)
 25 |     self.input = inp
 26 |     self.target = target
 27 |     self.taskid = taskid
 28 | 
 29 |   def accuracy(self, nprint=0):
 30 |     mask = self.target > 0
 31 |     errors = mask * (self.target != np.argmax(self.output, axis=-1))
 32 |     return np.sum(errors), np.sum(mask), np.sum(np.any(errors, axis=1))
 33 | 
 34 |   @property
 35 |   def length(self):
 36 |     return (self.input[0,:,0] > 0).sum()
 37 | 
 38 |   @property
 39 |   def batch_size(self):
 40 |     return len(self.input)
 41 | 
 42 |   def __repr__(self):
 43 |     err, tot, seq_err = self.accuracy()
 44 |     return '<NeuralGPUResult: length=%s loss=%s bs=%s err=%s seq_err=%s>' % \
 45 |       (self.length, self.loss, self.batch_size, err, seq_err)
 46 | 
 47 |   def attention_by_layer(self):
 48 |     return self.attention.mean(axis=-1).round(3)
 49 | 
 50 |   def to_string(self, i=None):
 51 |     if i is None:
 52 |       return '\n\n'.join(self.to_string(i) for i in range(self.batch_size))
 53 |     inp, outp, targ = map(data_utils.to_string, (self.input[i], self.output[i].argmax(axis=-1), self.target[i]))
 54 |     ans = '\n'.join([inp, '-'*len(outp), outp, targ])
 55 |     if hasattr(self, 'probs'):
 56 |       ans = '%s\n%s' % (ans, self.probs[:,i].round(3))
 57 |     return ans
 58 | 
 59 |   def plot_attention(self, figname):
 60 |     import pylab
 61 |     for i in range(self.attention.shape[2]):
 62 |       for j in range(self.attention.shape[1]):
 63 |         pylab.plot(self.attention[:,j,i], color='rbgkyo'[j], alpha=0.2, marker='o')
 64 |     pylab.savefig(figname)
 65 | 
 66 | def plot_many_examples(sess, model, max_length, generator, batch_size,
 67 |                        dirpat):
 68 |   examples = [(l, generator.get_batch(l, batch_size)) for l in range(3, max_length+1)
 69 |               if generator.is_valid_length(l)]
 70 |   for l, example in examples:
 71 |     print(l)
 72 |     result = model.step(sess, example, False)
 73 |     result.plot_attention(dirpat % l)
 74 | 
 75 | class ResultsRecord(object):
 76 |   """Result from many runs of training, on many tasks"""
 77 |   def __init__(self, batch_size):
 78 |     self.batch_size = batch_size
 79 |     self.record_for_task = {}
 80 | 
 81 |   def feed(self, results, step_time, below_curriculum):
 82 |     taskid = results.taskid[0]
 83 |     assert(not(np.any(results.taskid != taskid)))
 84 |     if taskid not in self.record_for_task:
 85 |       self.record_for_task[taskid] = ResultsRecordPerTask(self.batch_size)
 86 |     self.record_for_task[taskid].feed(results, step_time, below_curriculum)
 87 | 
 88 |   def __str__(self):
 89 |     def fmt_attr(name, fmt, label, scale=1):
 90 |       return label + ' '  + '/'.join(fmt % (getattr(v, name)*scale)
 91 |                                      for v in self.record_for_task.values())
 92 |     stat_list = [fmt_attr('avg_ppx', '%.8f', 'ppx'),
 93 |                 fmt_attr('avg_grad_norm', '%.8f', 'grad-norm'),
 94 |                 fmt_attr('avg_step_time', '%s', 'step-time'),
 95 |                 fmt_attr('avg_err', '%.2f', 'errors', 100),
 96 |                 fmt_attr('avg_seq_err', '%.2f', 'seq-errors', 100),
 97 |                 ]
 98 |     if hasattr(next(iter(self.record_for_task.values())), 'binary_gap'):
 99 |       stat_list.append(fmt_attr('avg_binary_gap', '%.3f', 'binary-gap'))
100 |     return ' '.join(stat_list)
101 | 
class ResultsRecordPerTask(object):
  """Result of many batches on a single task"""
  def __init__(self, batch_size):
    self.batch_size = batch_size

    # Counters.
    self.num_batches = 0
    self.num_below = 0
    # Running sums over every fed batch.
    self.step_time = 0.
    self.grad_norm = 0.
    # Running sums over batches flagged below_curriculum only.
    self.loss = 0.
    self.err = 0.
    self.seq_err = 0.
    self.total = 0.
    self.acc = 0.

  def feed(self, results, step_time, below_curriculum):
    """Accumulate one batch.

    Step time, gradient norm and (if present) binary_gap are summed for every
    batch; loss and error counts only for batches with below_curriculum set.
    """
    self.num_batches += 1
    self.num_below += below_curriculum

    self.step_time += step_time
    self.grad_norm += results.grad_norm
    for key in ['binary_gap']:
      if not hasattr(results, key):
        continue
      # The attribute is created lazily, starting from 0.
      setattr(self, key, getattr(self, key, 0) + getattr(results, key))
    if not below_curriculum:
      return
    self.loss += results.loss
    err, tot, seq_err = results.accuracy()
    self.err += err
    self.seq_err += seq_err
    self.total += tot

  @property
  def safe_num_below(self):
    # If we happen to not have any samples within the curriculum, don't crash
    return self.num_below if self.num_below else 1.

  @property
  def avg_binary_gap(self):
    return self.binary_gap / self.num_batches

  @property
  def avg_step_time(self):
    return self.step_time / self.num_batches

  @property
  def avg_grad_norm(self):
    return self.grad_norm / self.num_batches

  @property
  def avg_loss(self):
    return self.loss / self.safe_num_below

  @property
  def avg_ppx(self):
    mean_loss = self.loss / self.safe_num_below
    return data_utils.safe_exp(mean_loss)

  @property
  def avg_err(self):
    denominator = self.total if self.total else 1
    return self.err / denominator

  @property
  def avg_seq_err(self):
    num_sequences = self.safe_num_below * self.batch_size
    return self.seq_err / num_sequences
167 | 


--------------------------------------------------------------------------------
/neuralgpu/mytf.py:
--------------------------------------------------------------------------------
  1 | """Various improvements to the tensorflow API."""
  2 | 
  3 | from __future__ import print_function
  4 | 
  5 | import tensorflow as tf
  6 | from tensorflow.python.training import moving_averages
  7 | import functools
  8 | 
  9 | def broadcast_as(origin, target, axes=None):
 10 |   """Broadcast origin into the shape of target using numpy-style broadcasting.
 11 | 
 12 |   If axes is not None, set the shape to be 1 (rather than target.shape[i])
 13 |   for each axis i not in axes."""
 14 |   in_size = shape_list(origin)
 15 |   out_size = shape_list(target)
 16 |   result = []
 17 |   if axes is None:
 18 |     axes = range(len(out_size))
 19 |   for d, (i, o) in enumerate(zip(in_size, out_size)):
 20 |     if i is None or o is None:
 21 |       result.append(1)
 22 |     if d in axes:
 23 |       assert o % i == 0
 24 |       result.append(o//i)
 25 |     else:
 26 |       result.append(1)
 27 |   return tf.tile(origin, result)
 28 | 
 29 | def stack(tensor_list, ax):
 30 |   """Stack many tensors along a single axis"""
 31 |   return tf.concat(ax, [tf.expand_dims(t, ax) for t in tensor_list])
 32 | 
 33 | def shape_list(tensor):
 34 |   """Return the tensor shape in a form tf.reshape understands."""
 35 |   return [x or -1 for x in tensor.get_shape().as_list()]
 36 | 
 37 | def safe_squeeze(array, i):
 38 |   """Only squeeze a particular axis, and check it was 1"""
 39 |   shape = shape_list(array)
 40 |   assert shape[i] == 1
 41 |   return tf.reshape(array, shape[:i] + (shape[i+1:] if (i+1) else []))
 42 | 
 43 | def expand_dims_by_k(array, k):
 44 |   """Add k 1s to the end of the tensor's shape"""
 45 |   return tf.reshape(array, shape_list(array) + [1]*k)
 46 | 
 47 | 
 48 | def fix_batching(f, k, nargs=1):
 49 |   """Make a given function f support extra initial dimensions.
 50 | 
 51 |   A number of tf.nn operations expect shapes of the form [-1] + lst
 52 |   where len(lst) is a fixed constant, and operate independently on the
 53 |   -1.  This lets them work on shapes of the form lst2 + lst, where
 54 |   lst2 is arbitrary.
 55 | 
 56 |   args:
 57 |     k: len(lst) that f wants
 58 |     nargs: Number of tensors with this property
 59 |   """
 60 |   @functools.wraps(f)
 61 |   def wrapper(*args, **kws):
 62 |     arrays = args[:nargs]
 63 |     old_shape = shape_list(arrays[0])
 64 |     used_shape = old_shape[-k:]
 65 |     inputs_reshaped = tuple(tf.reshape(array, [-1]+used_shape)
 66 |                        for array in arrays)
 67 |     output = f(*(inputs_reshaped + args[nargs:]), **kws)
 68 |     new_prefix = old_shape[:-k]
 69 |     new_suffix = shape_list(output)[1:]
 70 |     output_reshaped = tf.reshape(output, new_prefix + new_suffix)
 71 |     return output_reshaped
 72 |   return wrapper
 73 | 
# Variants of tf.nn ops that accept arbitrary leading dimensions.
# softmax: the wrapped op is applied to a [-1, last_dim] view (k=1).
softmax = fix_batching(tf.nn.softmax, 1)
# conv2d: the first argument is viewed as [-1] + its last three dims (k=3);
# the remaining arguments are passed through untouched.
conv2d = fix_batching(tf.nn.conv2d, 3)
# Cross entropy: the first two arguments are both reshaped (k=1, nargs=2).
softmax_cross_entropy_with_logits = fix_batching(tf.nn.softmax_cross_entropy_with_logits, 1, 2)
 77 | 
 78 | 
 79 | 
 80 | def masked_moments(x, axes, mask):
 81 |     x = x * mask
 82 |     num_entries = tf.reduce_sum(tf.ones_like(x) * mask, axes)
 83 |     mean = tf.reduce_sum(x, axes) / num_entries
 84 |     var = tf.reduce_sum(tf.squared_difference(x, mean)*mask, axes) / num_entries
 85 |     return (mean, var)
 86 | 
 87 | 
 88 | # From http://stackoverflow.com/questions/33949786/how-could-i-use-batch-normalization-in-tensorflow
 89 | # and https://github.com/ry/tensorflow-resnet/blob/master/resnet.py
def batch_norm(x, phase_train, mask=None, scope='bn'):
    """
    Batch normalization on convolutional maps.

    Normalizes over every axis except the last, with one beta/gamma pair per
    entry of the last axis.

    NOTE: the tf.cond on phase_train is commented out below, so phase_train is
    currently ignored: batch statistics are always used for normalization and
    the moving averages are updated on every evaluation.

    Args:
        x:           Tensor, 4D BHWD input maps
        phase_train: boolean tf.Variable, true indicates training phase
                     (currently unused, see note above)
        mask:        optional tensor multiplied into x; when given, the
                     moments are computed with masked_moments
        scope:       string, variable scope
    Return:
        normed:      batch-normalized maps
    """
    x_shape = shape_list(x)
    # One parameter per entry of the last axis.
    params_shape = x_shape[-1:]
    BN_DECAY = 0.8
    BN_EPSILON = 1e-3
    with tf.variable_scope(scope) as vs:
        beta = tf.get_variable('beta', params_shape, initializer=tf.zeros_initializer)
        gamma = tf.get_variable('gamma', params_shape, initializer=tf.ones_initializer)
        # Running statistics; not trainable, only changed by update_ops below.
        moving_mean = tf.get_variable('moving_mean', params_shape,
                                      initializer=tf.zeros_initializer, trainable=False)
        moving_var = tf.get_variable('moving_var', params_shape,
                                     initializer=tf.ones_initializer, trainable=False)
        # Reduce over all axes but the last.
        axes = range(len(x_shape)-1)
        if mask is None:
            batch_mean, batch_var = tf.nn.moments(x, axes, name='moments')
        else:
            batch_mean, batch_var = masked_moments(x, axes, mask)

        update_ops = [
            moving_averages.assign_moving_average(moving_mean, batch_mean, BN_DECAY),
            moving_averages.assign_moving_average(moving_var, batch_var, BN_DECAY)]
        def mean_var_with_update():
            # The control dependency forces the moving-average updates to run
            # whenever the returned statistics are evaluated.
            with tf.control_dependencies(update_ops):
                return tf.identity(batch_mean), tf.identity(batch_var)

        #mean, var = tf.cond(phase_train,
        #                    mean_var_with_update,
        #                    lambda: (moving_mean, moving_var))
        mean, var = mean_var_with_update()#(batch_mean, batch_var)
        normed = tf.nn.batch_normalization(x, mean, var, beta, gamma, BN_EPSILON)
    return normed
130 | 
131 | 
132 | 
133 | 
def print_bn_state(sess, nmaps):
    """Print the batch-norm variables stored under scope 'model/RX1/bn'.

    Fetches beta, gamma, moving_mean and moving_var (each of shape [nmaps])
    in a single sess.run and prints them one per line.
    """
    names = 'beta gamma moving_mean moving_var'.split()
    with tf.variable_scope('model/RX1/bn', reuse=True):
        fetches = {name: tf.get_variable(name, [nmaps]) for name in names}
    values = sess.run(fetches, {})
    for name in names:
        print(name, values[name])
143 | 
144 | #numpy.fft.ifft(numpy.conj(numpy.fft.fft(a)) * numpy.fft.fft(b)).round(3)
145 | 
def softmax_index2d(indices, values, reduce = False):
  """Correlate `values` with a softmax of `indices` via 2-D FFTs.

  The softmax is taken jointly over the last two dimensions of `indices`
  (they are flattened into one for the softmax and restored afterwards).
  The result is the real part of the inverse transform of
  conj(FFT2(softmax)) * FFT2(values).

  With reduce set, the inverse transform is tf.batch_ifft and the result is
  averaged over axis -2.
  """
  indices_shape = shape_list(indices)
  # Softmax over the flattened last two dimensions, then back to 2-D.
  softmax_indices = tf.reshape(
    tf.nn.softmax(
      tf.reshape(indices, [-1, indices_shape[-1]*indices_shape[-2]])),
    indices_shape)
  # The FFT ops need complex inputs; the imaginary parts are zero.
  softmax_indices = tf.complex(softmax_indices, tf.zeros_like(softmax_indices))
  values = tf.complex(values, tf.zeros_like(values))
  fft_of_answer = tf.conj(tf.batch_fft2d(softmax_indices)) * tf.batch_fft2d(values)
  if reduce:
    # NOTE(review): this branch inverts with the 1-D batch_ifft although the
    # forward transforms are 2-D -- confirm this is intended.
    return tf.reduce_mean(tf.real(tf.batch_ifft(fft_of_answer)), -2)
  else:
    return tf.real(tf.batch_ifft2d(fft_of_answer))
159 | 
def softmax_index1d(indices, values):
  """Correlate `values` with a softmax of `indices` along the last axis.

  The softmax is taken over the last dimension of `indices`.  The result is
  the real part of the inverse FFT of conj(FFT(softmax)) * FFT(values).
  """
  # indices: bs x height x length
  # values: stuff x bs x height x length
  softmax_indices = softmax(indices)
  # The FFT ops need complex inputs; the imaginary parts are zero.
  softmax_indices = tf.complex(softmax_indices, tf.zeros_like(softmax_indices))
  values = tf.complex(values, tf.zeros_like(values))
  fft_of_answer = tf.conj(tf.batch_fft(softmax_indices)) * tf.batch_fft(values)
  return tf.real(tf.batch_ifft(fft_of_answer))
169 | 


--------------------------------------------------------------------------------
/neuralgpu/model.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015 Google Inc. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | """The Neural GPU Model."""
 16 | 
 17 | import time
 18 | 
 19 | import tensorflow as tf
 20 | 
 21 | import random
 22 | import numpy as np
 23 | 
 24 | from . import mytf
 25 | from . import data_utils
 26 | from .records import NeuralGPUResult
 27 | 
# Global command-line flags object.  The flags read in this module
# (smooth_grad, cutoff_tanh, do_resnet, ...) are not defined in this module.
FLAGS = tf.app.flags.FLAGS
 29 | 
 30 | def tf_cut_function(val, vlo, vhi, glo, ghi):
 31 |   if vlo is None:
 32 |     return val
 33 |   a = tf.clip_by_value(val, vlo, vhi)
 34 |   if glo is None:
 35 |     return a
 36 |   assert ghi >= vhi > vlo >= glo
 37 |   zz = tf.clip_by_value(val, glo, ghi)
 38 |   return zz - tf.stop_gradient(zz - a)
 39 | 
 40 | def sigmoid_cutoff(x, cutoff):
 41 |   """Sigmoid with cutoff, e.g., 1.2sigmoid(x) - 0.1."""
 42 |   y = tf.sigmoid(x)
 43 |   if cutoff < 1.01: return y
 44 |   d = (cutoff - 1.0) / 2.0
 45 |   z = cutoff * y - d
 46 |   dd = (FLAGS.smooth_grad - 1.0) / 2.0 if FLAGS.smooth_grad else None
 47 |   glo, ghi = (-dd, 1+dd) if FLAGS.smooth_grad else (None, None)
 48 |   return tf_cut_function(z, 0, 1, glo, ghi)
 49 | 
 50 | def tanh_cutoff(x, cutoff):
 51 |   """Tanh with cutoff, e.g., 1.1tanh(x) cut to [-1. 1]."""
 52 |   y = tf.tanh(x)
 53 |   if cutoff < 1.01: return y
 54 |   z = cutoff * y
 55 |   tcut = FLAGS.smooth_grad_tanh
 56 |   glo, ghi = (-tcut, tcut) if tcut else (None, None)
 57 |   return tf_cut_function(z, -1, 1, glo, ghi)
 58 | 
 59 | def conv_linear(arg, kw, kh, nout, prefix, bias=0):
 60 |   """Convolutional linear map."""
 61 |   strides = [1, 1, 1, 1]
 62 |   if isinstance(arg, list):
 63 |     if len(arg) == 1:
 64 |       arg = arg[0]
 65 |     else:
 66 |       arg = tf.concat(len(mytf.shape_list(arg[0]))-1, arg)
 67 |   nin = mytf.shape_list(arg)[-1]
 68 |   with tf.variable_scope(prefix):
 69 |     k = tf.get_variable("CvK", [kw, kh, nin, nout])
 70 |     res = mytf.conv2d(arg, k, strides, "SAME")
 71 | 
 72 |     if bias is None:
 73 |       return res
 74 |     bias_term = tf.get_variable("CvB", [nout],
 75 |                                 initializer=tf.constant_initializer(0.0))
 76 |     return res + bias_term + float(bias)
 77 | 
 78 | def conv_gru(mem, kw, kh, nmaps, cutoff, prefix, extras=[]):
 79 |   """Convolutional GRU."""
 80 |   # mem shape: bs x length x height x nmaps
 81 |   def conv_lin(arg, suffix, bias_start):
 82 |     return conv_linear(extras + [arg], kw, kh, nmaps,
 83 |                        prefix + "/" + suffix, bias=bias_start)
 84 |   reset = sigmoid_cutoff(conv_lin(mem, "r", 1), cutoff)
 85 |   candidate = tanh_cutoff(conv_lin(reset * mem, "c", 0), FLAGS.cutoff_tanh)
 86 |   gate = sigmoid_cutoff(conv_lin(mem, "g", 1), cutoff)
 87 |   return gate * mem + (1 - gate) * candidate
 88 | 
 89 | def resnet_block(cur, kw, kh, nmaps, cutoff, mask, suffix, nconvs=2,
 90 |                  extras = []):
 91 |   old = cur
 92 |   for i in range(nconvs):
 93 |     cur = conv_linear(extras + [cur], kw, kh, nmaps, "cgru_%d_%s" % (i, suffix))
 94 |     if i == nconvs - 1:
 95 |       cur = old + cur
 96 |     cur = tf.nn.relu(cur * mask)
 97 |   return cur
 98 | 
 99 | def lstm_block(cur, kw, kh, nmaps, cutoff, mask, suffix, nconvs=2,
100 |                extras = []):
101 |   # Do nconvs-many CGRU steps.
102 |   for layer in range(nconvs):
103 |     cur = conv_gru(cur, kw, kh, nmaps, cutoff, "cgru_%d_%s" % (layer, suffix),
104 |                    extras = extras)
105 |     cur *= mask
106 |   return cur
107 | 
def gru_block(*args, **kws):
  """Dispatch to resnet_block when FLAGS.do_resnet is set, else lstm_block."""
  block = resnet_block if FLAGS.do_resnet else lstm_block
  return block(*args, **kws)
113 | 
def relaxed_average(var_name_suffix, rx_step):
  """Calculate the average of relaxed variables having var_name_suffix.

  Looks the variable up in each scope RX0 .. RX<rx_step-1>, skipping scopes
  where the lookup raises ValueError.  At least one must exist.

  Returns (average, sum of squared deviations from the average).
  """
  relaxed_vars = []
  for step in range(rx_step):
    with tf.variable_scope("RX%d" % step, reuse=True):
      try:
        var = tf.get_variable(var_name_suffix)
      except ValueError:
        continue
      relaxed_vars.append(var)
  assert relaxed_vars
  avg = tf.add_n(relaxed_vars) / len(relaxed_vars)
  deviations = [var - avg for var in relaxed_vars]
  squared = tf.add_n([dev*dev for dev in deviations])
  return avg, tf.reduce_sum(squared)
129 | 
130 | 
def relaxed_distance(rx_step):
  """Distance between relaxed variables and their average.

  Considers trainable variables whose second name component starts with
  'RX'.  Returns (total distance loss, op assigning each such variable the
  average of its group).
  """
  dist_losses, assign_ops, averages = [], [], {}
  for var in tf.trainable_variables():
    parts = var.op.name.split('/', 2)
    if not parts[1].startswith('RX'):
      continue
    rx_name = parts[2]
    if rx_name not in averages:
      # First variable of this group: build the average once and cache it.
      avg, dist_loss = relaxed_average(rx_name, rx_step)
      dist_losses.append(dist_loss)
      averages[rx_name] = avg
    assign_ops.append(var.assign(averages[rx_name]))
  return tf.add_n(dist_losses), tf.group(*assign_ops)
144 | 
class NeuralGPUAtSize(object):
  """Instantiate the NeuralGPU at a given block size."""
  def __init__(self, model, length, adam):
    """Create the placeholders for this length and build the graph.

    Args:
      model: the owning NeuralGPU; supplies config, embedding weights and e0.
      length: sequence length handled by this instance.
      adam: optimizer passed on to construct_graph.
    """
    # Depth of the one-hot task encoding built in construct_all_layers.
    self.ntasks = 4

    self.config = model.config
    self.length = length
    # batch_size x length x height
    self.input = tf.placeholder(tf.int32, shape=(None, length, self.config.height),
                                name="input{0}".format(length))
    self.target = tf.placeholder(tf.int32, shape=(None,length), name="target{0}".format(length))
    self.emb_weights = model.emb_weights
    self.e0 = model.e0
    self.do_training = tf.placeholder(tf.bool, shape=[], name="do_training")

    self.model = model

    self.task = tf.placeholder(tf.uint8, shape=(None,), name="task")

    self.construct_graph(adam)

  def construct_mask(self) :
    """Return a float mask that is 1 wherever input or target is non-padding."""
    # Mask to 0-out padding space in each step.
    # bmask: batch_size x length
    bmask = tf.reduce_any(self.input > 0, 2) | (self.target > 0)
    # mask: batch x length x 1 x 1
    mask = tf.to_float(mytf.expand_dims_by_k(bmask, 2))
    return mask

  def looping_layer(self, cur, index, *args):
    """Body of the tf.while_loop: apply config.rx_step layers to `cur`.

    Layer `it` lives in variable scope "RX<it>".  With FLAGS.output_layer == 1
    an extra loop variable accumulates the sum of all layer outputs; otherwise
    a batch element keeps its previous state once index + it exceeds its
    entry in self.output_layers.

    Returns the updated loop variables, with `index` advanced by rx_step.
    """
    if FLAGS.output_layer == 1:
      output, = args
    # Dropout is only active while training and is scaled by 8 / length.
    keep_prob = 1.0 - tf.to_float(self.do_training) * (self.config.dropout * 8.0 / self.length)
    for it in range(self.config.rx_step):
      with tf.variable_scope("RX%d" % it) as vs:
        old = cur
        cur = tf.nn.dropout(cur, keep_prob)
        cur = gru_block(cur, self.config.kw, self.config.kh, self.config.nmaps,
                        self.config.cutoff, self.mask, 'lookup',
                        self.config.nconvs, extras=self.extras)

        if FLAGS.do_batchnorm:
          # 1: plain batch norm; 2: batch norm with masked moments.
          if FLAGS.do_batchnorm == 1:
            cur = mytf.batch_norm(cur, self.do_training, scope='bn')
          elif FLAGS.do_batchnorm == 2:
            cur = mytf.batch_norm(cur, self.do_training, self.mask, scope='bn')

        if FLAGS.output_layer == 1:
          output += cur
        else:
          cur = tf.select(tf.greater_equal(self.output_layers, index + it), cur, old)
    if FLAGS.output_layer == 1:
      return (cur, index + self.config.rx_step, output)
    else:
      return (cur, index + self.config.rx_step)

  def construct_all_layers(self, first, mask):
    """Run the looping layers on `first` and return the final activations.

    Also stores mask, extras and output_layers on self for looping_layer.
    The loop runs until the layer index reaches
    int(config.layer_scale * length).
    """
    # first: batch_size x length x height x nmaps

    # Per batch element: number of unmasked positions.
    output_layers = tf.to_int32(tf.reduce_sum(mask, [1,2,3]))

    cur = first
    # NOTE(review): `layers` is never used, and self.layers (read by step()
    # when get_steps is set) is not assigned anywhere in this class.
    layers = []

    extras = []
    if FLAGS.taskid:
      # bs x 1 x 1 x ntasks
      task = tf.one_hot(tf.to_int32(mytf.expand_dims_by_k(self.task, 2)),
                        self.ntasks)
      extras.append(mytf.broadcast_as(task, cur, [1,2]))

    self.mask = mask
    self.extras = extras
    self.output_layers = output_layers
    # Loop counter, initialised to 0.
    it = tf.get_variable("layer_index", [], dtype=tf.int32,
                         initializer=tf.constant_initializer(0))
    # Using swap is slower, but saves GPU memory.
    use_swap = bool(self.config.nmaps > 256 or (FLAGS.do_batchnorm and self.config.nmaps == 128))
    num_layers = int(self.config.layer_scale*self.length)
    # The third loop variable (summed output) only exists for output_layer 1.
    args = [cur, it] + ([tf.zeros_like(cur)] if FLAGS.output_layer == 1 else [])
    result = tf.while_loop(lambda cur, it, *args: it < num_layers,
                            self.looping_layer,
                            args,
                            parallel_iterations=1,
                            swap_memory=use_swap)
    if FLAGS.output_layer == 1:
      ans = result[-1]
    else:
      ans = result[0]
    return ans

  def _get_first_layer(self, mask):
    """Turn the input into a batch_size x length x height x nmaps tensor"""
    # NOTE: mask, nmaps and height are not used below.
    nmaps = self.config.nmaps
    height = self.config.height

    # Embed inputs and calculate mask.
    # The control dependency makes the lookup wait for the e0 op.
    with tf.control_dependencies([self.e0]):
      embedded = tf.nn.embedding_lookup(self.emb_weights, self.input)

    # first: batch_size x length x height x nmaps
    first = tf.tanh(embedded)

    return first

  def construct_graph(self, adam):
    """Build forward pass, loss and, for selected lengths, the Adam update.

    Sets self.output (softmax probabilities) and self.loss.  self.grad_norm
    and self.update are only set when the backward pass is created.
    """
    nmaps = self.config.nmaps
    noclass = self.config.noclass

    mask = self.construct_mask()

    # The general tensor shape is
    # batchsize x length x height x nmaps
    first = self._get_first_layer(mask)

    # Computation steps.
    last_layer = self.construct_all_layers(first, mask)

    # Final convolution to get logits, list outputs.
    # Only the first row along the height axis is used for the output.
    layer_output = conv_linear(last_layer[:,:,:1,:], 1, 1, noclass, "output")
    output = mytf.safe_squeeze(layer_output, -2) # batch x length x noclass

    self.output = mytf.softmax(output) # batch_size x length x noclass

    # Calculate cross-entropy loss and normalize it.
    targets = tf.one_hot(self.target, noclass)
    xent = mytf.softmax_cross_entropy_with_logits(output, targets) # shape: batch x length
    perp_loss = tf.reduce_mean(xent * tf.reshape(mask, [-1, self.length]))

    # Final loss: cross-entropy + shared parameter relaxation part.
    relax_dist, self.model.avg_op = relaxed_distance(self.config.rx_step)
    total_loss = perp_loss + relax_dist * self.model.pull
    # The reported loss excludes the relaxation term.
    self.loss = perp_loss

    # Gradients and Adam update operation.
    # Built for the smallest bin always, and in mode 0 for every length up
    # to the largest bin.
    if self.length == data_utils.bins[0] or (self.config.mode == 0 and
                                        self.length < data_utils.bins[-1] + 1):
      data_utils.print_out("Creating backward for bin of length %d." % self.length)
      params = tf.trainable_variables()
      grads = tf.gradients(total_loss, params)
      grads, norm = tf.clip_by_global_norm(grads, self.config.max_grad_norm)
      self.grad_norm = norm
      update = adam.apply_gradients(zip(grads, params),
                                    global_step=self.model.global_step)
      self.update = update

  def __repr__(self):
    return '<NeuralGPUAtSize %s>' % (self.length)

  def step(self, sess, batch, do_backward=False, get_steps=False, more_feed={}):
    """Run a step of the network.

    Args:
      sess: session to run in.
      batch: (inp, target, taskid).  A 2-D inp gets an axis of size 1 inserted
        at position 1; axis 1 is then zero-padded up to config.height.
      do_backward: also run the update op and fetch the gradient norm; needs
        an instance for which construct_graph created the backward pass.
      get_steps: also fetch self.layers.
      more_feed: extra name -> tensor entries to fetch.

    Returns:
      NeuralGPUResult holding the fetched values, the padded inp, target and
      taskid.
    """
    inp, target, taskid = batch
    assert inp.shape[0] == target.shape[0]
    assert inp.shape[-1] == target.shape[1]
    if len(inp.shape) == 2:
      inp = np.expand_dims(inp, 1)
    assert len(inp.shape) == 3
    if inp.shape[1] < self.config.height:
      extra = self.config.height - inp.shape[1]
      inp = np.concatenate([inp] + [np.zeros_like(inp[:,:1,:])]*extra, axis=1)
    feed_in = {}
    feed_in[self.do_training] = do_backward
    feed_in[self.task] = taskid
    # The placeholder is batch x length x height, so swap the last two axes.
    feed_in[self.input] = inp.transpose([0,2,1])
    feed_in[self.target] = target
    feed_out = {}
    feed_out.update(more_feed)
    if do_backward:
      feed_out['back_update'] = self.update
      feed_out['grad_norm'] = self.grad_norm
    if get_steps:
      feed_out['layers'] = self.layers
    if hasattr(self, 'probs'):
      feed_out['probs'] = self.probs
    feed_out['loss'] = self.loss
    feed_out['output'] = self.output
    res = sess.run(feed_out, feed_in)
    return NeuralGPUResult(res, inp, target, taskid)
323 | 
class NeuralGPU(object):
  """Neural GPU Model."""
  def __init__(self, config):
    """Create the shared variables and one graph instance per bin length."""
    self.t = time.time()
    self.config = config

    # Feeds for parameters and ops to update them.
    self.global_step = tf.Variable(0, trainable=False)
    self.lr = tf.Variable(float(config.lr), trainable=False)
    self.lr_decay_op = self.lr.assign(self.lr * 0.98)
    self.pull = tf.Variable(float(config.pull), trainable=False)
    self.pull_incr_op = self.pull.assign(self.pull * config.pull_incr)

    # Feeds for inputs, targets, outputs, losses, etc.
    self.instances = []

    with tf.variable_scope("model") as vs:
      self.construct_graph()
      self.saver = tf.train.Saver(tf.all_variables())

  def construct_graph(self):
    """Build the embedding, the optimizer and a NeuralGPUAtSize per length."""
    vec_size = self.config.nmaps
    # Computation.
    self.emb_weights = tf.get_variable(
        "embedding", [self.config.niclass, vec_size],
        initializer=tf.random_uniform_initializer(-1.7, 1.7))
    # Op that resets the embedding of symbol 0 to all zeros.
    self.e0 = tf.scatter_update(self.emb_weights,
                           tf.constant(0, dtype=tf.int32, shape=[1]),
                           tf.zeros([1, vec_size]))

    adam = tf.train.AdamOptimizer(self.lr, epsilon=1e-4, use_locking=True)

    # Main graph creation loop, for every bin in data_utils.
    for length in sorted(list(set(data_utils.bins + [data_utils.forward_max]))):
      data_utils.print_out("Creating model for bin of length %d." % length)
      start_time = time.time()
      self.instances.append(NeuralGPUAtSize(self, length, adam))
      tf.get_variable_scope().reuse_variables() # Later rounds reuse variables
      data_utils.print_out("Created model for bin of length %d in"
                           " %.2f s." % (length, time.time() - start_time))

  def get_instance_for_length(self, length):
    """Return the first instance whose length is at least `length`.

    Instances are stored in increasing length order.  Raises IndexError when
    no instance is large enough (including when there are no instances).
    """
    for instance in self.instances:
      if instance.length >= length:
        return instance
    # Do not rely on the loop variable here: it is unbound when
    # self.instances is empty.
    max_size = self.instances[-1].length if self.instances else None
    raise IndexError('Max instance size %s; %s is too large!' % (max_size, length))

  def step(self, sess, batch, *args, **kws):
    """Run a step of the network."""
    inp, target, taskid = batch
    instance = self.get_instance_for_length(target.shape[-1])
    return instance.step(sess, batch, *args, **kws)

  def simple_step(self, sess, a):
    """Run a simple operation on one input.

    Reverses the order for you, so you can input in little endian.

    `a` is either a string, whose characters are converted with
    data_utils.to_id, or a sequence of symbol ids.  The input is zero-padded
    to the length of the instance that handles it, and the prediction is
    returned as a string with the padding removed.
    """
    try:
      string_types = basestring  # Python 2: covers both str and unicode
    except NameError:
      string_types = str  # Python 3 has no basestring
    if isinstance(a, string_types):
      a = [data_utils.to_id(c) for c in a]
    else:
      a = list(a)
    l = self.get_instance_for_length(len(a)).length
    pad = l - len(a)
    inp = np.array([a[::-1] + [0]*pad])
    # The input doubles as the target, which only fixes the batch shape here.
    result = self.step(sess, (inp, inp, [0]), False)
    relevant_output = result.output.argmax(axis=-1)[0, :(-pad if pad else None)]
    return ''.join(map(data_utils.to_symbol, relevant_output[::-1]))
392 | 


--------------------------------------------------------------------------------
/neuralgpu/generators.py:
--------------------------------------------------------------------------------
  1 | """Generators for the different problems."""
  2 | 
  3 | import math
  4 | import random
  5 | import sys
  6 | import time
  7 | import operator
  8 | import functools
  9 | import numpy as np
 10 | 
 11 | from . import data_utils
 12 | from .data_utils import SPACE, START, MINUS, DUP
 13 | 
# This maps task names to DataGenerator instances
generators = {}

# When true, the generators that consult this flag emit a START symbol at the
# beginning of their input and shrink the operand width to make room for it.
PADDING = False
 18 | 
 19 | def to_base(num, b, l=1):
 20 |   if num < 0:
 21 |     val = to_base(-num, b, (l - 1) or 1)
 22 |     return np.concatenate([val, [MINUS-1]])
 23 |   assert num >= 0
 24 |   ans = []
 25 |   while num:
 26 |     ans.append(num%b)
 27 |     num //= b
 28 |   while len(ans) < l:
 29 |     ans.append(0)
 30 |   return np.array(ans)
 31 | 
 32 | def from_base(lst, b):
 33 |   num = 0
 34 |   for v in lst[::-1]:
 35 |     num = num*b + v
 36 |   return num
 37 | 
 38 | 
 39 | class DataGenerator(object):
 40 |   """The base class for generating problem input/output pairs"""
 41 |   nclass = 33
 42 |   name = '<unknown task>'
 43 |   taskid = 0
 44 |   height = None
 45 |   min_length = 1
 46 | 
 47 |   def is_valid_length(self, l):
 48 |     """Can this problem have instances of length l?"""
 49 |     return True
 50 | 
 51 |   def rand_pair(self, length):
 52 |     """Random data pair for a task. Total length should be <= length."""
 53 |     raise NotImplementedError()
 54 | 
 55 |   def rand_pair_padded(self, length):
 56 |     """Construct a random data pair, then pad the inputs to a valid size."""
 57 |     pad_length = data_utils.pad(length)
 58 |     inp, outp = self.rand_pair(length)
 59 |     inp = np.array(inp)
 60 |     if len(inp.shape) == 1:
 61 |       inp = np.array([inp])
 62 |     padding_func = lambda x: np.pad(x, [(0,0)]*(len(x.shape)-1) +
 63 |                                        [(0, pad_length - x.shape[-1])], 'constant')
 64 |     inp, outp = padding_func(inp), padding_func(outp)
 65 |     assert inp.shape[-1] == pad_length, outp.shape[-1] == pad_length
 66 |     return inp, outp
 67 | 
 68 |   def get_batch(self, length, batch_size):
 69 |     """Construct a complete batch of problem instances"""
 70 |     inps, outps = [], []
 71 |     for _ in range(batch_size):
 72 |       inp, outp = self.rand_pair_padded(length)
 73 |       inps.append(inp)
 74 |       outps.append(outp)
 75 | 
 76 |     inp = np.stack(inps, 0)
 77 |     outp = np.stack(outps, 0)
 78 |     return inp, outp, np.array([self.taskid] * batch_size)
 79 | 
 80 |   def _initialize(self, nclass):
 81 |     self.nclass = nclass
 82 | 
 83 |   def __repr__(self):
 84 |     return "<%s name='%s' taskid=%s>" % (self.__class__.__name__, self.name, self.taskid)
 85 | 
 86 | class OpGenerator(DataGenerator):
 87 |   """Generator for instances using operations on two variables in some base"""
 88 |   min_length = 3
 89 | 
 90 |   def __init__(self, base, f, sep, zero_pad=True):
 91 |     self.base = base
 92 |     self.f = f
 93 |     self.sep = sep
 94 |     self.zero_pad = zero_pad
 95 | 
 96 |   def is_valid_length(self, l):
 97 |     return l%2 == 1 and l >= self.min_length
 98 | 
 99 |   def _rand_inputs(self, k):
100 |     k = int(k)
101 |     n1 = random.randint(0, self.base**k-1)
102 |     n2 = random.randint(0, self.base**k-1)
103 |     return (n1, n2)
104 | 
105 |   def rand_pair(self, l):
106 |     k = int((l-1 - 2*PADDING)//2)
107 |     n1, n2 = self._rand_inputs(k)
108 |     result = self.f(n1, n2)
109 |     inp = np.concatenate([[START] if PADDING else [],
110 |        to_base(n1, self.base, k if self.zero_pad else 1) + 1,
111 |        [self.sep],
112 |                           to_base(n2, self.base, k if self.zero_pad else 1) + 1,
113 |                           #[22] if PADDING else []
114 |     ])
115 |     outp = np.concatenate([#[START] if PADDING else [],
116 |             to_base(result, self.base, 2*k+1 if self.zero_pad else 1) + 1,
117 |                            #[22] if PADDING else []
118 |     ])
119 |     return inp, outp
120 | 
# Zero-padded add/mul tasks; arguments are (base, operation, separator symbol).
generators.update({
    'badd': OpGenerator(2, operator.add, 11),
    'qadd': OpGenerator(4, operator.add, 12),
    'add': OpGenerator(10, operator.add, 13),
    'bmul': OpGenerator(2, operator.mul, 14),
    'qmul': OpGenerator(4, operator.mul, 15),
    'omul': OpGenerator(8, operator.mul, 17),
    'fmul': OpGenerator(5, operator.mul, 18),
    'mul': OpGenerator(10, operator.mul, 16),
})

# The same tasks with zero_pad disabled (names carry a trailing "z").
generators.update({
    'baddz': OpGenerator(2, operator.add, 11, False),
    'qaddz': OpGenerator(4, operator.add, 12, False),
    'addz': OpGenerator(10, operator.add, 13, False),
    'bmulz': OpGenerator(2, operator.mul, 14, False),
    'qmulz': OpGenerator(4, operator.mul, 15, False),
    'mulz': OpGenerator(10, operator.mul, 16, False),
})
136 | 
class ToughAddGenerator(OpGenerator):
  """Addition with a biased operand distribution.

  Operands are drawn from one of three schemes (chosen with probability
  0.2 / 0.2 / 0.6) and then returned in random order.
  """
  def __init__(self, base, sep, zero_pad=True):
    super(ToughAddGenerator, self).__init__(base, operator.add, sep, zero_pad)

  def _rand_inputs(self, k):
    """Return a pair of operands of up to k digits."""
    roll = random.random()
    b = self.base
    if roll < 0.2:
      # One operand is b**hi - b**(lo-1); the other is uniform below b**lo.
      first_draw = random.randint(1, k)
      second_draw = random.randint(1, k)
      lo, hi = sorted([first_draw, second_draw])
      block = b**hi - b**(lo-1)
      filler = random.randint(0, b**lo - 1)
      vals = (block, filler)
    elif roll < .4:
      # The two operands sum to b**k2 or b**k2 - 1.
      shorter = random.randint(1, k)
      k2 = random.choice([k, shorter])
      lo = random.randint(1, b**k2 - 1)
      vals = (lo, b**k2 - lo - random.randint(0, 1))
    else:
      # Plain uniform operands.
      top = b**k - 1
      vals = (random.randint(0, top), random.randint(0, top))
    keep_order = random.random() > .5
    return vals if keep_order else vals[::-1]
157 | 
# Addition tasks using ToughAddGenerator's operand sampling (trailing "t").
generators.update({
    'baddt': ToughAddGenerator(2, 11),
    'qaddt': ToughAddGenerator(4, 12),
    'addt': ToughAddGenerator(10, 13),
})
161 | 
162 | 
163 | 
class AlignedOpGenerator(OpGenerator):
  """Two-row inputs: the operands are stacked instead of concatenated.

  Each operand is written with k digits and zero-padded to length l; the
  separator symbol is placed on the second row right after its digits.
  """
  min_length = 2
  def rand_pair(self, l):
    k = int((l-1 - 2*PADDING)//2)
    n1, n2 = self._rand_inputs(k)
    result = self.f(n1, n2)
    prefix = [START] if PADDING else []
    encoded = [np.concatenate([prefix, to_base(n, self.base, k) + 1])
               for n in (n1, n2)]
    top_row, bottom_row = [np.pad(row, (0, l - len(row)), 'constant')
                           for row in encoded]
    # The separator goes in the first padded cell of the second row.
    bottom_row[len(encoded[1])] = self.sep
    inp2 = np.vstack([top_row, bottom_row])
    answer = np.concatenate([prefix, to_base(result, self.base, l) + 1])
    outp = np.pad(answer, (0, l - len(answer)), 'constant',
                  constant_values=SPACE)
    return inp2, outp
182 | 
class AlignedToughAddGenerator(AlignedOpGenerator, ToughAddGenerator):
  """AlignedOpGenerator's two-row layout with ToughAddGenerator's operands."""
185 | 
# Aligned (two-row) tasks, their tough-addition variants, and the
# non-zero-padded tough-addition tasks.
generators.update({
    'badde': AlignedOpGenerator(2, operator.add, 11),
    'qadde': AlignedOpGenerator(4, operator.add, 12),
    'adde': AlignedOpGenerator(10, operator.add, 13),
    'bmule': AlignedOpGenerator(2, operator.mul, 14),
    'qmule': AlignedOpGenerator(4, operator.mul, 15),
    'mule': AlignedOpGenerator(10, operator.mul, 16),
    'baddet': AlignedToughAddGenerator(2, 11),
    'qaddet': AlignedToughAddGenerator(4, 12),
    'addet': AlignedToughAddGenerator(10, 13),
    'baddzt': ToughAddGenerator(2, 11, False),
    'qaddzt': ToughAddGenerator(4, 12, False),
    'addzt': ToughAddGenerator(10, 13, False),
})
199 | 
class FGenerator(DataGenerator):
  """Problems whose output is a function `f` of a random symbol sequence."""
  def __init__(self, f):
    self.f = f

  def rand_pair(self, l):
    """Draw l symbols from [1, nclass - 1] and pair them with f(symbols)."""
    symbols = 1 + np.random.randint(self.nclass - 1, size=l)
    target = self.f(symbols)
    return list(symbols), list(target)
207 | 
# Sequence tasks: reverse, sort and identity.
generators.update({
    'rev': FGenerator(lambda seq: seq[::-1]),
    'sort': FGenerator(sorted),
    'id': FGenerator(lambda seq: seq),
})
212 | 
213 | 
214 | # With spacing
class SpacedGenerator(DataGenerator):
  """Places a smaller problem at a random offset on a SPACE-filled grid.

  Subclasses supply `_rand_pair(l)`, which returns the raw (input, result).
  """
  height=4

  def is_valid_length(self, l):
    return super(SpacedGenerator, self).is_valid_length(l) and l >= self.min_length

  def rand_pair(self, l):
    """Return a (height x l) input grid and a result padded to l with SPACE.

    The inner problem length is drawn with np.random.randint(min_length, l),
    whose upper bound is exclusive, so l must exceed min_length.
    """
    inner_length = np.random.randint(self.min_length, l)
    inp, res = self._rand_pair(inner_length)
    # A flat input is treated as a single row.
    if not hasattr(inp[0], '__iter__'):
      inp = [inp]
    inp = np.array(inp)
    n_rows, n_cols = inp.shape[0], inp.shape[1]
    grid_shape = (self.height, l)
    # With PADDING, column 0 is reserved for START.
    lowest = (0, 1 if PADDING else 0)
    highest = (grid_shape[0] - n_rows, grid_shape[1] - n_cols)
    row = np.random.randint(lowest[0], highest[0]+1)
    col = np.random.randint(lowest[1], highest[1]+1)
    final_inp = np.zeros(grid_shape) + SPACE
    if PADDING:
      final_inp[:,0] = START
    final_inp[row:row+n_rows, col:col+n_cols] = inp
    res = np.concatenate([res, [SPACE] * (l - len(res))])
    return (final_inp, res)
238 | 
class CopyGenerator(SpacedGenerator):
  """Spaced task whose result is the input sequence itself."""
  def __init__(self, base):
    self.base = base

  def _rand_pair(self, l):
    """Return l random symbols in [1, base] as both input and result."""
    symbols = []
    for _ in range(l):
      symbols.append(np.random.randint(self.base)+1)
    return symbols, symbols
248 | 
class DupGenerator(SpacedGenerator):
  """Spaced task: DUP followed by a sequence; the result repeats it twice."""
  min_length = 2
  def __init__(self, base):
    self.base = base

  def _rand_pair(self, l):
    """Draw l//2 symbols in [1, base]; input is [DUP]+x, result is x+x."""
    half = []
    for _ in range(l//2):
      half.append(np.random.randint(self.base)+1)
    return [DUP] + half, half + half
259 | 
class SpacedAlignedOpGenerator(SpacedGenerator, OpGenerator):
  """Spaced two-row operand layout with a leading [SPACE, sep] column."""
  def _rand_pair(self, l):
    k = int((l-1)//2)
    n1, n2 = self._rand_inputs(k)
    result = self.f(n1, n2)
    digits = [to_base(n, self.base) + 1 for n in (n1, n2)]
    width = max(len(d) for d in digits)
    # Pad the shorter operand with SPACE so both rows have equal width.
    rows = np.array([np.pad(d, (0, width - len(d)), 'constant',
                            constant_values=SPACE) for d in digits])
    # First column: SPACE on the top row, the separator on the bottom row.
    inp = np.concatenate([[[SPACE], [self.sep]], rows], axis=1)
    o = to_base(result, self.base) + 1
    return inp, o
272 | 
class TSAOG(SpacedAlignedOpGenerator, ToughAddGenerator):
  """SpacedAlignedOpGenerator layout with ToughAddGenerator's operands."""
275 | 
class SpacedOpGenerator(SpacedGenerator, OpGenerator):
  """Spaced single-row `n1 <sep> n2` problems without zero padding."""
  def _rand_pair(self, l):
    k = int((l-1)//2)
    n1, n2 = self._rand_inputs(k)
    result = self.f(n1, n2)
    left = to_base(n1, self.base) + 1
    right = to_base(n2, self.base) + 1
    inp = np.concatenate([left, [self.sep], right])
    return inp, to_base(result, self.base) + 1
285 | 
class TSOG(SpacedOpGenerator, ToughAddGenerator):
  """SpacedOpGenerator layout with ToughAddGenerator's operands."""
288 | 
# Spaced tasks (names start with "s").
generators.update({
    'scopy': CopyGenerator(10),
    'sdup': DupGenerator(10),
    'sbcopy': CopyGenerator(2),
    'sbdup': DupGenerator(2),
    'sbadde': SpacedAlignedOpGenerator(2, operator.add, 11),
    'sbmule': SpacedAlignedOpGenerator(2, operator.mul, 14),
    'sbaddet': TSAOG(2, 11),
    'sbadd': SpacedOpGenerator(2, operator.add, 11),
    'sbaddt': TSOG(2, 11),
    'sbaddz': SpacedOpGenerator(2, operator.add, 11, False),
    'sbaddzt': TSOG(2, 11, False),
    'sbmul': SpacedOpGenerator(2, operator.mul, 14),
})
302 | 
303 | 
class MultiOpGenerator(DataGenerator):
  """Chains of one operation: `n1 <sep> n2 <sep> ... <sep> n_num`.

  When `num` is None the number of operands is drawn per example.
  `zero_chance` is the probability that zero operands are allowed at all.
  """
  def __init__(self, base, f, sep, num, zero_chance=1, zero_pad=True):
    self.base, self.f, self.sep = base, f, sep
    self.num = num
    self.zero_pad = zero_pad
    self.min_length = 2*num - 1 if num is not None else 1
    self.zero_chance = zero_chance

  def is_valid_length(self, l):
    return l >= self.min_length

  def _rand_inputs(self, k, num, allow_zero):
    """Draw `num` operands below base**k, excluding 0 unless allow_zero."""
    lowest = 0 if allow_zero else 1
    highest = self.base**int(k) - 1
    draws = []
    for _ in range(num):
      draws.append(random.randint(lowest, highest))
    return draws

  def rand_pair(self, l):
    """Build one (input, output) pair of total length at most l."""
    num = self.num
    if num is None:
      num = random.randint(1, (l+1)//2)
    k = int((l+1)//num-1)
    allow_zero = random.random() < self.zero_chance
    ns = self._rand_inputs(k, num, allow_zero)
    result = functools.reduce(self.f, ns)
    operand_width = k if self.zero_pad else 1
    result_width = (k+1)*num-1 if self.zero_pad else 1
    pieces = []
    for n in ns:
      # Separator between operands, not before the first one.
      if pieces:
        pieces.append([self.sep])
      pieces.append(to_base(n, self.base, operand_width)+1)
    inp = np.concatenate(pieces)
    outp = np.concatenate([to_base(result, self.base, result_width) + 1])
    return inp, outp
340 | 
# Exactly three operands per example.
generators['3badd'] = MultiOpGenerator(2, operator.add, 11, 3)
generators['3qadd'] = MultiOpGenerator(4, operator.add, 12, 3)
generators['3add'] = MultiOpGenerator(10, operator.add, 13, 3)
generators['3bmul'] = MultiOpGenerator(2, operator.mul, 14, 3)
# Operand count chosen per example (num=None).
generators['kbadd'] = MultiOpGenerator(2, operator.add, 11, None)
generators['kqadd'] = MultiOpGenerator(4, operator.add, 12, None)
generators['kadd'] = MultiOpGenerator(10, operator.add, 13, None)
generators['kbmul'] = MultiOpGenerator(2, operator.mul, 14, None, .3)
351 | 
class ExpressionGenerator(DataGenerator):
  """Inputs where each character has a chance of being a random operator."""
  min_length = 1

  def __init__(self, base, operators, op_chance):
    """
    Args:
      base: number base of the digits.
      operators: iterable of (operator character, symbol id) pairs.
      op_chance: probability of emitting an operator at a position where
        one is allowed.
    """
    self.base = base
    self.operators = dict(operators)
    self.nums = range(base)
    self.op_chance = op_chance

    # Symbol id for every digit (digit + 1) and every operator character.
    self.to_num = {i: i+1 for i in self.nums}
    self.to_num.update(self.operators)

  def rand_pair(self, l):
    """Return (input symbols, result digits) for a random length-l expression.

    Expressions that divide by zero or evaluate to a negative number are
    discarded and a new one is drawn.
    """
    # random.choice needs an indexable sequence; on Python 3 dict.keys() is a
    # view, so materialise the operator characters once.
    op_symbols = list(self.operators)
    while True:
      ans = []
      inp = []
      last_num = []
      valid_op = False
      for i in range(l):
        if valid_op and random.random() < self.op_chance:
          choice = random.choice(op_symbols)
        else:
          choice = random.choice(self.nums)
        inp.append(self.to_num[choice])
        if choice in self.operators:
          ans.append(from_base(last_num, self.base))
          last_num = []
          ans.append(choice)
          # Never two operators in a row.
          valid_op = False
        else:
          last_num.append(choice)
          # The final position must be a digit, so no operator after l-2.
          valid_op = (i != l-2)
      ans.append(from_base(last_num, self.base))
      # Digits are read least-significant first, so the token list is
      # reversed to get the expression in conventional order.
      string_expr = ''.join(map(str, ans[::-1]))
      string_expr = string_expr.replace('/', '//')
      try:
        # The string is built only from integers and this generator's own
        # operator characters, never from external input.
        result = eval(string_expr)
      except ZeroDivisionError:
        continue
      if result < 0:
        continue
      outp = to_base(result, self.base, l)+1
      return inp, outp
398 | 
# Expression tasks; every generator uses an operator chance of .3.
# "+" and "*".
generators['bexpr'] = ExpressionGenerator(2, zip('+*', [11, 14]), .3)
generators['qexpr'] = ExpressionGenerator(4, zip('+*', [12, 15]), .3)
generators['expr'] = ExpressionGenerator(10, zip('+*', [13, 16]), .3)

# All four operators.
generators['bexpra'] = ExpressionGenerator(2, zip('+*/-', [11, 14,17,20]), .3)
generators['qexpra'] = ExpressionGenerator(4, zip('+*/-', [12, 15,18,21]), .3)
generators['expra'] = ExpressionGenerator(10, zip('+*/-', [13, 16,19,22]), .3)

# "+" only.
generators['bexprp'] = ExpressionGenerator(2, zip('+', [11]), .3)
generators['qexprp'] = ExpressionGenerator(4, zip('+', [12]), .3)
generators['exprp'] = ExpressionGenerator(10, zip('+', [13]), .3)

# "+" and "-".
generators['bexprs'] = ExpressionGenerator(2, zip('+-', [11, 20]), .3)
generators['qexprs'] = ExpressionGenerator(4, zip('+-', [12, 21]), .3)
generators['exprs'] = ExpressionGenerator(10, zip('+-', [13, 22]), .3)

# "+", "*" and "-".
generators['bexprsm'] = ExpressionGenerator(2, zip('+*-', [11, 14,20]), .3)
generators['qexprsm'] = ExpressionGenerator(4, zip('+*-', [12, 15,21]), .3)
generators['exprsm'] = ExpressionGenerator(10, zip('+*-', [13, 16,22]), .3)
418 | 
# Give every registered generator its registry key as its `name`.
for k in generators:
  generators[k].name = k
421 | 
def set_height(self, height):
  """Set the `height` attribute of every registered generator.

  NOTE(review): this is a module-level function, so `self` is just an unused
  first positional argument -- confirm what callers pass for it.
  """
  for gen in generators.values():
    gen.height = height
425 | 
426 | 


--------------------------------------------------------------------------------
/neuralgpu/trainer.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015 Google Inc. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | """Neural GPU for Learning Algorithms."""
 16 | 
 17 | from __future__ import print_function
 18 | 
 19 | import math
 20 | import os
 21 | import random
 22 | import sys
 23 | import time
 24 | import subprocess
 25 | import yaml
 26 | 
 27 | import matplotlib.animation as anim
 28 | import matplotlib.pyplot as plt
 29 | import numpy as np
 30 | import tensorflow as tf
 31 | 
 32 | from tensorflow.python.platform import gfile
 33 | 
 34 | from . import data_utils as data
 35 | from .generators import generators
 36 | from .model import NeuralGPU
 37 | from . import curriculum
 38 | from . import mytf
 39 | from . import records
 40 | from .config import NeuralConfig
 41 | 
 42 | def define_flags():
 43 |   """This is placed in a function so reload() works"""
 44 |   tf.app.flags.DEFINE_float("lr", 0.001, "Learning rate.")
 45 |   tf.app.flags.DEFINE_float("init_weight", 1.0, "Initial weights deviation.")
 46 |   tf.app.flags.DEFINE_float("max_grad_norm", 1.0, "Clip gradients to this norm.")
 47 |   tf.app.flags.DEFINE_float("cutoff", 1.2, "Cutoff at the gates.")
 48 |   tf.app.flags.DEFINE_float("cutoff_tanh", 0.0, "Cutoff at tanh.")
 49 |   tf.app.flags.DEFINE_float("pull", 0.0005, "Starting pull of the relaxations.")
 50 |   tf.app.flags.DEFINE_float("pull_incr", 1.2, "Increase pull by that much.")
 51 |   tf.app.flags.DEFINE_float("curriculum_bound", 0.15, "Move curriculum < this.")
 52 |   tf.app.flags.DEFINE_float("dropout", 0.15, "Dropout that much.")
 53 |   tf.app.flags.DEFINE_integer("max_steps", 0, "Quit after this many steps.")
 54 |   tf.app.flags.DEFINE_integer("batch_size", 32, "Batch size.")
 55 |   tf.app.flags.DEFINE_integer("low_batch_size", 16, "Low batch size.")
 56 |   tf.app.flags.DEFINE_integer("steps_per_epoch", 200, "Steps per epoch.")
 57 |   tf.app.flags.DEFINE_integer("nmaps", 24, "Number of floats in each cell.")
 58 |   tf.app.flags.DEFINE_integer("niclass", 33, "Number of classes (0 is padding).")
 59 |   tf.app.flags.DEFINE_integer("noclass", 33, "Number of classes (0 is padding).")
 60 |   tf.app.flags.DEFINE_integer("max_length", 41, "Maximum length.")
 61 |   tf.app.flags.DEFINE_integer("rx_step", 6, "Relax that many recursive steps.")
 62 |   tf.app.flags.DEFINE_integer("random_seed", 125459, "Random seed.")
 63 |   tf.app.flags.DEFINE_integer("time_till_ckpt", 30, "How many tests per checkpoint")
 64 |   tf.app.flags.DEFINE_integer("time_till_eval", 2, "Number of steps between evals")
 65 |   tf.app.flags.DEFINE_integer("nconvs", 2, "How many convolutions / 1 step.")
 66 |   tf.app.flags.DEFINE_integer("kw", 3, "Kernel width.")
 67 |   tf.app.flags.DEFINE_integer("kh", 3, "Kernel height.")
 68 |   tf.app.flags.DEFINE_integer("height", 4, "Height.")
 69 |   tf.app.flags.DEFINE_integer("forward_max", 401, "Maximum forward length.")
 70 |   tf.app.flags.DEFINE_integer("nprint", 0, "How many test examples to print out.")
 71 |   tf.app.flags.DEFINE_integer("mode", 0, "Mode: 0-train other-decode.")
 72 |   tf.app.flags.DEFINE_bool("animate", False, "Whether to produce an animation.")
 73 |   tf.app.flags.DEFINE_float("smooth_grad", 0.0, "Whether to avoid clipping gradient")
 74 |   tf.app.flags.DEFINE_float("smooth_grad_tanh", 0.0, "Whether to avoid clipping tanh gradient")
 75 |   tf.app.flags.DEFINE_string("task", "badd", "Which task are we learning?")
 76 |   tf.app.flags.DEFINE_string("train_dir", "/tmp/neural", "Directory to store models.")
 77 | 
 78 |   tf.app.flags.DEFINE_float("layer_scale", 1.0, "Number of layers to use")
 79 | 
 80 |   # Batchnorm:     0 = none
 81 |   #                2 = correct
 82 |   #                1 = not quite correct, because of how masking is done, but simpler.
 83 |   tf.app.flags.DEFINE_integer("do_batchnorm", 0, "Whether to use batch normalization.")
 84 | 
 85 |   tf.app.flags.DEFINE_bool("do_resnet", False, "Whether to use resnets.")
 86 | 
 87 |   tf.app.flags.DEFINE_bool("print_one", True, "Print one example each evaluation")
 88 | 
 89 |   # output layer: 0 = standard: output layer n on length-n inputs
 90 |   #               1 = alternate: output sum of first n layers on length-n inputs.
 91 |   tf.app.flags.DEFINE_integer("output_layer", 0, "Which layer to output.")
 92 | 
 93 |   # progressive_curriculum: 0 = none: always train on first task.
 94 |   #                         1-5: progress through the tasks in sequence,
 95 |   #                              training each one to length max_len then move on.
 96 |   #                              The different options have subtle changes; see
 97 |   #                              BetterCurriculum for details.
 98 |   #                              5 is probably the best one.
 99 |   tf.app.flags.DEFINE_integer("progressive_curriculum", 0, "Whether to use progressive curriculum.")
100 |   tf.app.flags.DEFINE_bool("taskid", False, "Feed task id to algorithm in each layer")
101 | 
102 |   tf.app.flags.DEFINE_bool("always_large", False, "Perform the large test even when the model is inaccurate")
103 | 
FLAGS = tf.app.flags.FLAGS
# Only define the flags while FLAGS reports itself as not yet parsed.
if not FLAGS.__parsed: # Hack so reload() works
  define_flags()

# Slack added to max_length when deciding which length bins to keep.
EXTRA_EVAL = 2


# File objects for the 'results' and 'steps' logs; opened by log_parameters().
log_output = None
step_output = None
113 | 
def log_parameters(checkpoint_dir):
  """Write enough information in checkpoint_dir for reproducibility.

  Records the command line, all flag values and the current git revision,
  and opens the 'results' and 'steps' logs in append mode (kept in the
  module-level log_output / step_output).

  An existing directory may only be reused with an identical command line.

  Raises:
    ValueError: if checkpoint_dir holds a 'commandline' file recorded with
      different arguments.
  """
  global log_output, step_output
  command_fname = os.path.join(checkpoint_dir, 'commandline')
  new_argv = ' '.join(sys.argv)
  if gfile.Exists(command_fname):
    # Read through a context manager so the handle is closed promptly.
    with open(command_fname) as f:
      old_argv = f.read().strip()
    if old_argv != new_argv:
      data.print_out('ERROR: restarted with changed argv')
      data.print_out('WAS %s' % old_argv)
      data.print_out('NOW %s' % new_argv)
      raise ValueError("Bad log dir: partial state exists with different arguments")
    else:
      print('Reusing existing log dir')

  with open(command_fname, 'w') as f:
    f.write(new_argv+'\n')

  with open(os.path.join(checkpoint_dir, 'all_args'), 'w') as f:
    yaml.dump(FLAGS.__flags, f, default_flow_style=False)

  with open(os.path.join(checkpoint_dir, 'git-rev'), 'w') as f:
    subprocess.call(['git', 'rev-parse', 'HEAD'], stdout=f)

  # Line-buffered and deliberately left open for the rest of the run.
  log_output = open(os.path.join(checkpoint_dir, 'results'), 'a', 1)
  step_output = open(os.path.join(checkpoint_dir, 'steps'), 'a', 1)
144 | 
def load_model(sess, checkpoint_dir, reconfig=None):
  """Rebuild a model from the settings saved in checkpoint_dir and restore it.

  Args:
    sess: session used to restore the checkpoint.
    checkpoint_dir: directory holding the 'all_args' file and the checkpoint.
    reconfig: optional mapping of flag overrides applied on top of the saved
      settings.

  Returns:
    The NeuralGPU model. Weights are restored only if a checkpoint exists.
  """
  # possibly tf.reset_default_graph()
  with open(os.path.join(checkpoint_dir, 'all_args')) as f:
    # NOTE(review): yaml.load can construct arbitrary Python objects; only
    # point this at checkpoint directories you trust.
    options = yaml.load(f)
  if reconfig is not None:
    options.update(reconfig)
  FLAGS._parse_flags()
  FLAGS.__flags.update(options)
  data.forward_max = max(FLAGS.forward_max, data.bins[-1])
  config = NeuralConfig(FLAGS)
  model = NeuralGPU(config)
  ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
  if ckpt and gfile.Exists(ckpt.model_checkpoint_path):
    model.saver.restore(sess, ckpt.model_checkpoint_path)
  return model
159 | 
def get_checkpoint_dir():
  """Return the checkpoint directory, i.e. the value of the train_dir flag."""
  # Disabled alternative that appended the seed and pid to the directory name:
  #return FLAGS.train_dir + ('-seed%s-pid%s' % (FLAGS.random_seed, os.getpid()))
  return FLAGS.train_dir
163 | 
def get_config_from_flags(checkpoint_dir = None):
  """Seed the RNGs, prepare the checkpoint directory and build the config.

  Also redirects the error tee to `<checkpoint_dir>/err`, trims
  data.bins to the configured maximum length and sets data.forward_max.
  """
  # Set random seed.
  seed = FLAGS.random_seed
  for seed_fn in (tf.set_random_seed, random.seed, np.random.seed):
    seed_fn(seed)

  # Create checkpoint directory if it does not exist.
  if checkpoint_dir is None:
    checkpoint_dir = get_checkpoint_dir()
  if not gfile.IsDirectory(checkpoint_dir):
    data.print_out("Creating checkpoint directory %s." % checkpoint_dir)
    parent_dir = os.path.dirname(checkpoint_dir)
    try:
      gfile.MkDir(parent_dir)
    except OSError:
      # Best effort only; an OSError while creating the parent is ignored.
      pass
    gfile.MkDir(checkpoint_dir)

  err_path = os.path.join(checkpoint_dir, "err")
  data.err_tee = data.TeeErr(open(err_path, 'w'))

  data.print_out("NN ", newline=False)

  config = NeuralConfig(FLAGS)

  # Check data sizes.
  # Drop trailing bins while the second-to-last one already exceeds the limit.
  length_limit = config.max_length + EXTRA_EVAL
  while len(data.bins) > 1 and data.bins[-2] > length_limit:
    data.bins = data.bins[:-1]
  assert data.bins[0] > FLAGS.rx_step
  data.forward_max = max(FLAGS.forward_max, data.bins[-1])

  return config
195 | 
def initialize(sess, checkpoint_dir=None):
  """Initialize data and model.

  Builds the config from flags, logs the run parameters, initializes the
  task generators, creates the model and restores it (and its curriculum)
  from checkpoint_dir when a checkpoint exists.

  Args:
    sess: session used to initialize and restore variables.
    checkpoint_dir: directory for logs and checkpoints; defaults to
      get_checkpoint_dir().

  Returns:
    The NeuralGPU model, with `model.curriculum` set.
  """
  config = get_config_from_flags(checkpoint_dir)
  data.print_out(str(sys.argv))
  data.print_out(str(config))

  if checkpoint_dir is None:
    checkpoint_dir = get_checkpoint_dir()
  log_parameters(checkpoint_dir)

  # Initialize data for each task.
  nclass = min(config.niclass, config.noclass)
  tasks = config.task.split(",")
  data_generators = [generators[t] for t in tasks]
  for g in data_generators:
    g._initialize(nclass)

  # Create model and initialize it.
  tf.get_variable_scope().set_initializer(
      tf.uniform_unit_scaling_initializer(factor=1.8 * FLAGS.init_weight))
  model = NeuralGPU(config)
  data.print_out("Created model.")
  sess.run(tf.initialize_all_variables())
  data.print_out("Initialized variables.")

  # Load model from parameters if a checkpoint exists.
  ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
  model.curriculum = None
  if ckpt and gfile.Exists(ckpt.model_checkpoint_path):
    data.print_out("Reading model parameters from %s"
                   % ckpt.model_checkpoint_path)
    model.saver.restore(sess, ckpt.model_checkpoint_path)
    curriculum_path = os.path.join(checkpoint_dir, 'neural_gpu_curriculum.ckpt')
    try:
      # Context manager so the file is closed even if loading fails.
      # NOTE(review): yaml.load can construct arbitrary Python objects; only
      # use checkpoint directories you trust.
      with open(curriculum_path) as f:
        model.curriculum = yaml.load(f)
    except IOError:
      # No saved curriculum: fall through and build a fresh one below.
      pass

  if model.curriculum is None:
    if FLAGS.progressive_curriculum:
      model.curriculum = curriculum.BetterCurriculum(data_generators, model.config,
                                                      FLAGS.progressive_curriculum)
    else:
      model.curriculum = curriculum.GeneralizeCurriculum(data_generators, model.config)

  # Return the model and needed variables.
  return model
242 | 
243 | 
def single_test(l, model, sess, task, nprint, batch_size, print_out=True,
                offset=None, get_steps=False, batch=None):
  """Evaluate the model on one batch of length-l examples for `task`.

  A batch is drawn from the model's curriculum unless one is supplied.
  Returns (error rate, sequence error rate, raw step result); the error
  count is left unnormalized when the reported total is not positive.
  """
  if batch is None:
    batch, _unused = model.curriculum.draw_example(batch_size, l, task)
  result = model.step(sess, batch, False, get_steps=get_steps)
  n_wrong, n_total, n_bad_seqs = result.accuracy(nprint)
  seq_err = float(n_bad_seqs) / batch_size
  errors = float(n_wrong) / n_total if n_total > 0 else n_wrong
  if print_out:
    data.print_out("  %s len %d errors %.2f sequence-errors %.2f"
                   % (task, l, 100*errors, 100*seq_err))
  return errors, seq_err, result
258 | 
259 | 
def multi_test(l, model, sess, task, nprint, batch_size, offset=None):
  """Run multiple tests at lower batch size to save memory.

  The batch is split into `batch_size // low_batch` chunks of
  FLAGS.low_batch_size examples (capped at batch_size); a leftover partial
  chunk is not evaluated.

  Returns:
    (errors, seq_err, result): the error rates averaged over the chunks that
    were run, and the step result of the last chunk.
  """
  errors, seq_err = 0.0, 0.0
  to_print = nprint
  low_batch = min(FLAGS.low_batch_size, batch_size)
  num_chunks = batch_size // low_batch
  for mstep in range(num_chunks):
    cur_offset = None if offset is None else offset + mstep * low_batch
    err, sq_err, result = single_test(l, model, sess, task, to_print, low_batch,
                                      False, cur_offset)
    to_print = max(0, to_print - low_batch)
    errors += err
    seq_err += sq_err
  # Average over the chunks actually evaluated. Scaling by
  # low_batch / batch_size under-reports the error whenever batch_size is not
  # a multiple of low_batch.
  errors = float(errors) / num_chunks
  seq_err = float(seq_err) / num_chunks
  data.print_out("  %s len %d errors %.2f sequence-errors %.2f"
                 % (task, l, 100*errors, 100*seq_err))
  return errors, seq_err, result
278 | 
class Timer(object):
  """Report the wall-clock duration of a labelled piece of work.

  Creating a Timer records the current time and emits 'Start <label>' through
  `print_fn`; calling `done()` emits the number of seconds elapsed since then.
  """

  def __init__(self, label, print_fn=data.print_out):
    self.startt = time.time()
    self.print_fn = print_fn
    self.label = label
    self.print_fn('Start %s' % self.label)

  def done(self):
    """Emit the time elapsed since construction."""
    elapsed = time.time() - self.startt
    self.print_fn('Finish %s, took %s seconds' % (self.label, elapsed))
289 | 
def train_for_a_bit(sess, model, batch_size, nsteps, thresh=0.0):
  """Run `nsteps` training steps, log statistics and adapt the schedule.

  After the steps, a summary line is printed and also written to
  `step_output`.  Depending on what the curriculum reports, the pull is
  increased or the parameters are averaged, and the learning rate is decayed
  when the perplexity of the first task (in sorted order) exceeds `thresh`.

  Returns:
    (extended, acc_perp): whether `consider_extending` returned at least 2,
    and the perplexity that was compared against `thresh`.
  """
  stats = records.ResultsRecord(batch_size)
  for _ in range(nsteps):
    batch, within_bounds = model.curriculum.draw_example(batch_size)

    # Time a single training step.
    step_started = time.time()
    result = model.step(sess, batch, True)
    step_seconds = time.time() - step_started

    # `within_bounds` tells the record whether this batch stayed inside the
    # curriculum length.
    stats.feed(result, step_seconds, within_bounds)

  global_step, lr, pull = sess.run([model.global_step, model.lr, model.pull])
  # Assemble the one-line summary of the accumulated statistics.
  pieces = [
      'step %s ' % (global_step, ),
      'len %s ' % model.curriculum.length_str,
      'lr %.8f pull %.3f ' % (lr, pull),
      '%s' % str(stats),
  ]
  message = ''.join(pieces)
  data.print_out(message)
  print(message, file=step_output)
  if FLAGS.do_batchnorm:
    mytf.print_bn_state(sess, model.config.nmaps)

  would_extend = model.curriculum.consider_extending(stats)
  extended = (would_extend >= 2)
  if would_extend >= 1:
    # Results are decent: raise the pull while it is small, otherwise
    # average the parameters.
    if pull >= 0.1:
      data.print_out("  Averaging parameters.")
      sess.run(model.avg_op)
    else:
      sess.run(model.pull_incr_op)

  # Decay the learning rate when we are doing worse than `thresh` (the caller
  # passes the worst of its recent results).
  # [XXX the logic isn't great in mixed-task settings; it picks one
  # task semi-arbitrarily.]
  _, first_record = sorted(stats.record_for_task.items())[0]
  acc_perp = data.safe_exp(first_record.avg_loss)
  if acc_perp > thresh:
    data.print_out("Lower learning rate: %s %s" % (acc_perp, thresh))
    sess.run(model.lr_decay_op)
  return (extended, acc_perp)
336 | 
def run_evaluation(sess, model, batch_size):
  """Evaluate every curriculum task and log large-input sequence errors.

  For each task, the test batches from `model.curriculum.test_examples` are
  evaluated until they run out or the four most recent sequence-error rates
  have a minimum of 1.  If the last sequence error is below 0.05 (or
  `FLAGS.always_large` is set) a larger test at `data.forward_max` is run and
  its sequence error is appended to `log_output`.  After all tasks, an even
  larger test is run for accurate single-task models with a large
  `data.forward_max`.
  """
  global_step, = sess.run([model.global_step])
  # Materialise the task list once: it is iterated here and its length is
  # needed for the single-task check at the end.
  tasks = list(model.curriculum.tasks())
  task = None
  seq_err = None
  for task in tasks:
    errors = []
    result = None
    for batch, length in model.curriculum.test_examples(batch_size, task):
      _, seq_err, result = single_test(length, model, sess, task,
                                       FLAGS.nprint, batch_size, batch=batch)
      errors.append(seq_err)
      # Stop early once the last four lengths all failed completely.
      if len(errors) >= 4 and min(errors[-4:]) == 1:
        break
    if result is None:
      # No test examples were produced for this task; nothing to report.
      continue
    if FLAGS.print_one:
      data.print_out(result.to_string(0))
    if seq_err < 0.05 or FLAGS.always_large:  # Run larger test if we're good enough.
      _, seq_err, result = multi_test(data.forward_max, model, sess, task,
                                      FLAGS.nprint, batch_size * 4)
      data.print_out("LARGE ERROR: %s %s %s"  % (global_step, seq_err, task))
      log_output.write('%s %s %s\n' % (global_step, seq_err, task))
      if FLAGS.print_one:
        data.print_out(result.to_string(0))
  # Super-large test on 1-task large-forward models.
  if seq_err is not None and seq_err < 0.01:
    if data.forward_max > 4000 and len(tasks) == 1:
      multi_test(data.forward_max, model, sess, task, FLAGS.nprint,
                 batch_size * 16, 0)
360 | 
def checkpoint(sess, model, checkpoint_dir):
  """Save the model variables and the curriculum into `checkpoint_dir`.

  The variables go through `model.saver` to 'neural_gpu.ckpt' (tagged with the
  global step, without the meta graph); the curriculum object is dumped as
  YAML to 'neural_gpu_curriculum.ckpt'.
  """
  model_path = os.path.join(checkpoint_dir, "neural_gpu.ckpt")
  # The fetched value is not used below; the saver is handed the tensor.
  global_step, = sess.run([model.global_step])
  model.saver.save(sess, model_path,
                   global_step=model.global_step,
                   write_meta_graph=False)
  curriculum_path = os.path.join(checkpoint_dir, 'neural_gpu_curriculum.ckpt')
  with open(curriculum_path, 'w') as curriculum_file:
    yaml.dump(model.curriculum, curriculum_file)
369 | 
370 | 
def train_loop(sess, model, batch_size, checkpoint_dir):
  """Alternate training epochs with periodic checkpoints and evaluations.

  Each iteration trains for `FLAGS.steps_per_epoch` steps.  A checkpoint is
  written every `FLAGS.time_till_ckpt` iterations and an evaluation is run
  every `FLAGS.time_till_eval` iterations.  The loop ends, after a final
  checkpoint, once the global step reaches a non-zero `FLAGS.max_steps`.
  """
  epochs_until_ckpt = FLAGS.time_till_ckpt
  epochs_until_eval = FLAGS.time_till_eval
  # History of perplexities; the worst of the last three is the threshold
  # handed to train_for_a_bit.
  accuracies = [1e4]*3
  while True:
    data.print_out("Reminder: checkpoint dir %s" % checkpoint_dir)
    timer = Timer("training steps")
    recent_worst = max(accuracies[-3:])
    extended, acc = train_for_a_bit(sess, model, batch_size,
                                    FLAGS.steps_per_epoch, recent_worst)
    accuracies.append(acc)
    if extended:
      # After an extension, push a large value so the next epoch does not
      # simply lower the learning rate.
      accuracies.append(1000)
    timer.done()

    # Periodic checkpoint.
    epochs_until_ckpt -= 1
    if epochs_until_ckpt == 0:
      epochs_until_ckpt = FLAGS.time_till_ckpt
      timer = Timer("saving checkpoint")
      checkpoint(sess, model, checkpoint_dir)
      timer.done()

    # Periodic evaluation.
    global_step, = sess.run([model.global_step])
    epochs_until_eval -= 1
    if epochs_until_eval == 0:
      epochs_until_eval = FLAGS.time_till_eval
      timer = Timer("running evaluation %s"  % global_step)
      run_evaluation(sess, model, batch_size)
      timer.done()

    # Stop once the step budget (if any) is used up.
    global_step, = sess.run([model.global_step])
    if FLAGS.max_steps and global_step >= FLAGS.max_steps:
      data.print_out("Finished all %s steps" % global_step)
      checkpoint(sess, model, checkpoint_dir)
      return
408 | 
def start_and_train():
  """Open a TensorFlow session, initialize the model and run the train loop."""
  with tf.Session() as sess:
    init_timer = Timer('initialization')
    model = initialize(sess)
    init_timer.done()
    batch_size = FLAGS.batch_size
    checkpoint_dir = get_checkpoint_dir()
    train_loop(sess, model, batch_size, checkpoint_dir)
416 | 


--------------------------------------------------------------------------------
/plots/get_pretty_score.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | from __future__ import print_function
  3 | import fileinput
  4 | 
  5 | import sys
  6 | import numpy as np
  7 | import pandas as pd
  8 | import argparse
  9 | import glob
 10 | import scipy.signal
 11 | import os
 12 | import yaml
 13 | import shutil
 14 | import joblib
 15 | import functools
 16 | import re
 17 | import pickle
 18 | 
 19 | import collections
 20 | import pylab
 21 | from matplotlib import rc
 22 | import matplotlib
 23 | import matplotlib.gridspec as gridspec
 24 | import matplotlib.ticker as mtick
 25 | 
# Global matplotlib style: font size 12, TeX-rendered text, large axis labels.
rc('font',  size='12')
rc('text', usetex=True)
rc('axes', labelsize='large')

# Default colour cycle for plotted lines: the seven single-letter colours
# followed by five named colours, passed as a cycler expression string.
rc('axes', prop_cycle="cycler('color', ['b','g','r','c','m','y','k'] + "
   "['orange', 'darkgreen', 'indigo', 'gold', 'fuchsia'])")
#pylab.rcParams['axes.prop_cycle'] = ("cycler('color', 'bgrcmyk'*2)")
 33 | 
 34 | parser = argparse.ArgumentParser(description='Get scores')
 35 | 
 36 | RESULT='score'
 37 | 
 38 | 
 39 | parser.add_argument("--key", type=str, default="seq-errors,score")
 40 | parser.add_argument("--job", type=str, default='plot')
 41 | parser.add_argument("--task", type=str, default=None)
 42 | parser.add_argument("--exclude_opts", type=str, default=None)
 43 | parser.add_argument("--title", type=str, default='')
 44 | parser.add_argument("--titles", type=str, default='')
 45 | parser.add_argument("--savedir", type=str, default='')
 46 | parser.add_argument("--min-length", type=int, default=2)
 47 | parser.add_argument("--dirs-in-name", type=int, default=1)
 48 | parser.add_argument("--one-legend", type=bool, default=True)
 49 | parser.add_argument("--global-legend", type=str, default='')
 50 | parser.add_argument("--save-to", type=str, default='')
 51 | parser.add_argument("--skip-dir", action='store_true')
 52 | parser.add_argument("--success", action='store_true')
 53 | parser.add_argument("--recache", action='store_true')
 54 | parser.add_argument("--separate_seeds", action='store_true')
 55 | parser.add_argument("--median", action='store_true')
 56 | parser.add_argument("--order", type=str, default='')
 57 | parser.add_argument("--colorcycle", type=str, default='')
 58 | parser.add_argument("--std", type=bool, default=True)
 59 | parser.add_argument("--only-same", type=bool, default=False)
 60 | parser.add_argument("--smoothing", type=int, default='11')
 61 | parser.add_argument("--remove_strings", type=str, default='')
 62 | parser.add_argument("--remove_strings2", type=str, default='')
 63 | parser.add_argument('files', type=str, nargs='+',
 64 |                     help='Log files to examine')
 65 | parser.add_argument("--xlims", type=str, default='')
 66 | parser.add_argument("--ylims", type=str, default='')
 67 | 
 68 | parser.add_argument("--nbinsx", type=str, default='')
 69 | parser.add_argument("--nbinsy", type=str, default='')
 70 | 
 71 | parser.add_argument("--overlay", type=int, default=1)
 72 | parser.add_argument("--only-plot", type=str, default=None)
 73 | 
 74 | parser.add_argument("--xticks", type=str, default='')
 75 | parser.add_argument("--yticks", type=str, default='')
 76 | parser.add_argument("--lw", type=int, default=3)
 77 | parser.add_argument("--figsize", type=str, default='')
 78 | 
 79 | parser.add_argument('--traces', dest='traces', action='store_true')
 80 | parser.add_argument('--no-traces', dest='traces', action='store_false')
 81 | parser.set_defaults(traces=False)
 82 | 
 83 | parser.add_argument('--startx', dest='startx', action='store_true')
 84 | parser.add_argument('--no-startx', dest='startx', action='store_false')
 85 | parser.set_defaults(startx=True)
 86 | parser.add_argument('--starty', dest='starty', action='store_true')
 87 | parser.add_argument('--no-starty', dest='starty', action='store_false')
 88 | parser.set_defaults(starty=True)
 89 | 
 90 | parser.add_argument('--simplify', dest='simplify', action='store_true')
 91 | parser.add_argument('--no-simplify', dest='simplify', action='store_false')
 92 | parser.set_defaults(simplify=True)
 93 | 
# On-disk joblib cache used by the `recache` decorator below.  The cache
# directory is hard-coded to an absolute path in one user's home directory.
# NOTE(review): newer joblib releases expect `location=` instead of
# `cachedir=` -- confirm against the installed joblib version.
memory = joblib.Memory(cachedir='/home/ecprice/neural_gpu/cache',
                       verbose=1)
 96 | 
 97 | def recache(f):
 98 |     g = memory.cache(f)
 99 |     @functools.wraps(g)
100 |     def cached(*args, **kwargs):
101 |         if recache.do_recache:
102 |             try:
103 |                 shutil.rmtree(g.get_output_dir(*args, **kwargs)[0])
104 |             except OSError: # Not actually in cache
105 |                 pass
106 |         return g(*args, **kwargs)
107 |     return cached
108 | 
109 | recache.do_recache = False
110 | 
@recache
def get_results_dict(fname):
    """Parse a results file into one pandas Series per task.

    Every useful line has the form '<step> <value> <task> ...'.  Lines with
    fewer than three fields (blank lines on restart, truncated records) and
    lines whose step or value is not numeric are skipped.

    Args:
        fname: path of the results file; it may not exist.

    Returns:
        Dict mapping task name to a float Series named RESULT, indexed by
        integer step.  Empty when the file does not exist.
    """
    if not os.path.exists(fname):
        return {}
    answer = {}
    with open(fname) as f:
        for line in f:
            words = line.split()
            if len(words) < 3:  # Blank line on restart, or a partial record
                continue
            loc, val, taskname = words[:3]
            if taskname not in answer:
                # Explicit dtype: an empty Series otherwise has no sensible
                # default dtype across pandas versions.
                answer[taskname] = pd.Series(name=RESULT, dtype=float)
            try:
                answer[taskname].loc[int(loc)] = float(val)
            except ValueError:
                # Non-numeric step or value: ignore this record.
                pass
    return answer
130 | 
def get_scores_dict(fname):
    """Yield one ordered mapping per 'step ...' line of the log `fname`.

    Such a line is read as alternating 'name value' tokens; a trailing
    unpaired token is dropped.  Lines not starting with 'step ' are ignored.
    """
    with open(fname) as log:
        for raw_line in log:
            if not raw_line.startswith('step '):
                continue
            tokens = raw_line.split()
            names, values = tokens[::2], tokens[1::2]
            record = collections.OrderedDict(zip(names, values))
            try:
                yield record
            except ValueError:
                # Only reachable if a ValueError is thrown into the generator.
                break
141 | 
@recache
def get_dfs(dirname, tasknames):
    """Build one DataFrame of per-step training statistics for each task.

    Reads '<dirname>/steps', falling back to '<dirname>/log0' when it does not
    exist.  Every field of a step line may hold one value or several
    '/'-separated values; the values are aligned with `tasknames` as
    described in the comments below.

    Returns:
        Dict mapping task name to a DataFrame indexed by step, keeping only
        the last row for each duplicated step.  Tasks for which no 'step'
        column was collected are omitted.
    """
    fname = dirname+'/steps'
    if not os.path.exists(fname):
        fname = dirname+'/log0'
    # task -> {field name -> list of float values, one per step line}
    data_series = {t:{} for t in tasknames}
    for d in get_scores_dict(fname):
        lens = d['len'].split('/')
        if 'progressive_curriculum=5' in fname:
            # Indices of tasks whose length field is not '41'; if every task
            # is at '41', the last task is used instead.
            missing = [i for i in range(len(tasknames)) if lens[i] != '41'] or [len(tasknames)-1]
        else:
            missing = []
        for key in d:
            vals = d[key].split('/')
            if len(vals) == 1 and (key == 'step' or not missing):
                # A single value applies to every task.
                vals *= len(tasknames)
            elif len(vals) == len(missing):
                # One value per `missing` task; all other tasks get NaN.
                vals2 = [np.nan]*len(tasknames)
                for i, v in zip(missing, vals):
                    vals2[i] = v
                vals = vals2
            elif len(vals) < len(tasknames): #Failed to get data for one
                vals = [np.nan]*len(tasknames)
            for val, task in zip(vals, tasknames):
                data_series[task].setdefault(key, []).append(float(val))
    dfs = {}
    for task in data_series:
        try:
            dfs[task] = pd.DataFrame(data_series[task], index=data_series[task]['step'])
            dfs[task] = dfs[task].drop_duplicates(subset='step', keep='last')
        except KeyError: #Hasn't gotten to 'step' line yet
            pass
    return dfs
175 | 
def matches(fname, exclude_opts):
    """Return True if any '|'-separated piece of `exclude_opts` occurs in `fname`.

    A false `exclude_opts` (None or '') never matches.
    """
    if not exclude_opts:
        return False
    return any(opt in fname for opt in exclude_opts.split('|'))
182 | 
class Scores(object):
    """Lazily loaded training and evaluation curves for one run directory.

    `result_dfs` holds the per-task series parsed from '<dirname>/results';
    `dfs` holds the per-task DataFrames of training statistics.  Both are
    filled in on first use by `get_scores`.
    """

    def __init__(self, dirname, tasknames=None, prefix=''):
        """Remember the run directory; task names default to those in its key."""
        self.dirname = dirname
        self.index = 0
        if tasknames is None:
            tasknames = get_tasks(self.key)
        self.tasknames = tasknames
        self.prefix = prefix
        self.result_dfs = {}
        self.dfs = {}

    @property
    def key(self):
        """Grouping key derived from the directory name."""
        return get_key(self.dirname)

    def args_str(self, task=None):
        """Label for this run, with the task appended for multi-task runs."""
        label = get_key(self.dirname[len(self.prefix):])
        return (label +
                (' (%s)' % task if task and len(self.tasknames) > 1 else ''))

    def last_loc(self):
        """Largest final index over everything loaded so far (3 if nothing).

        The values 200200 and 60200 are reported as 200000 and 60000.
        """
        options = ([d.index[-1] for d in self.result_dfs.values()] +
                   [d.index[-1] for d in self.dfs.values()])
        ans = max(options or [3])
        if ans == 200200 or ans == 60200:
            ans -= 200
        return ans

    def get_scores(self, key, task):
        """Return the series for column `key` of `task`.

        `task` may be None when exactly one task is loaded.  For the RESULT
        key, a task without results yields a constant-1 placeholder series.
        For any other key, None is returned when the task or the column is
        missing; 'errors' and 'seq-errors' are scaled by 0.01.
        """
        if key == RESULT:
            self._load_results()
            if task is None:
                assert len(self.result_dfs) == 1
                # next(iter(...)) works whether keys() is a list or a view.
                task = next(iter(self.result_dfs))
            if task not in self.result_dfs:
                basic = pd.Series([1], name=RESULT)
                basic.loc[self.last_loc()] = 1
                return basic
            return self.result_dfs[task]

        self._load_scores()
        if task is None:
            assert len(self.dfs) == 1
            task = next(iter(self.dfs))
        if task not in self.dfs:
            return None
        column = self.dfs[task].get(key)
        if column is None:
            # Column never logged for this run; callers filter out None.
            return None
        scale = 0.01 if key in ('errors', 'seq-errors') else 1
        return column * scale

    def _load_results(self):
        """Parse the results file unless something is already loaded."""
        if self.result_dfs:
            return
        self.result_dfs = get_results_dict(self.dirname+'/results')

    def _load_scores(self):
        """Parse the training statistics unless something is already loaded."""
        if self.dfs:
            return
        self.dfs = get_dfs(self.dirname, self.tasknames)

    def commandline(self):
        """Return the whitespace-split contents of '<dirname>/commandline'."""
        with open(os.path.join(self.dirname, 'commandline')) as f:
            return f.read().split()

    def total_steps(self):
        """Last index of the first task's 'len' column, or None if unavailable."""
        lens = self.get_scores('len', self.tasknames[0])
        return lens.index[-1].item() if lens is not None else None
251 | 
def get_name(fname):
    """Turn a run name into a short legend label with underscores escaped.

    Default options and the `--remove_strings2` pieces are stripped, at most
    the first two '/'-separated components are kept, and every '_' is
    replaced by a backslash-escaped underscore.
    """
    cleaned = remove_defaults(fname)
    for unwanted in args.remove_strings2.split('|'):
        cleaned = cleaned.replace(unwanted, '')
    leading = cleaned.split('/')[:2]
    return '/'.join(leading).replace('_', r'\_')
259 | 
def plot_startx(key):
    """Label the x axis of the current plot; `key` is not used."""
    label = 'Steps of training'
    pylab.xlabel(label)
def plot_starty(key):
    """Label the y axis of the current plot according to the plotted column.

    Known columns get a readable name, other non-empty keys are used as-is,
    and an empty key gets a generic label.
    """
    if not key:
        pylab.ylabel('Sequence error on large input')
        return
    readable = {
        'score': 'Test error',
        'seq-errors': 'Training error',
    }
    pylab.ylabel(readable.get(key, key))
269 | 
def plot_results(fname, frame):
    """Plot the mean (or median) curve of `frame` with a min/max band.

    Args:
        fname: run name; turned into the legend label by `get_name`.
        frame: DataFrame with one column per run, indexed by step, or None to
            add only a legend entry.

    Each column is smoothed with a first-order Savitzky-Golay filter of
    window `args.smoothing` when that window is larger than 1 and the column
    is longer than the window.  With `args.traces`, the individual curves are
    drawn faintly as well.
    """
    label = get_name(fname)
    if frame is None:  # Just put in legend
        pylab.plot([], label=label)
        return

    window = args.smoothing

    def smooth(y):
        if window > 1 and len(y) > window:
            return scipy.signal.savgol_filter(y, window, 1)
        return y

    x = frame.index
    # Build a real list: np.array() over a lazy map object would not produce
    # the 2-D array needed below.
    ysets = np.array([smooth(y) for y in frame.T.values]).T
    y = np.median(ysets, axis=1) if args.median else ysets.mean(axis=1)
    v = pylab.plot(x, y, label=label)
    color = v[0].get_color()
    if args.traces:
        for ys in list(ysets.T):
            pylab.plot(x, ys, alpha=0.2, color=color)
    pylab.fill_between(frame.index, ysets.min(axis=1), ysets.max(axis=1),
                       alpha=0.15, color=color)
295 | 
296 |     #for k in frame.columns:
297 |     #    pylab.scatter(frame.index, frame[k].values, alpha=0.15, color=v[0].get_color())
298 | 
def get_tasks(key):
    """Extract the task names from a run key such as 'nmaps=24-task=a,b-...'.

    Returns ['rev'] when the key does not mention 'task'.  Otherwise the value
    following the first '...task=' option is taken up to the next '-' and
    split on commas.
    """
    if 'task' not in key:
        return ['rev']
    segments = key.split('=')
    task_positions = [pos for pos, segment in enumerate(segments)
                      if segment.endswith('task')]
    value = segments[task_positions[0] + 1]
    return value.split('-')[0].split(',')
307 | 
def remove_defaults(fname):
    """Shorten a run name by dropping default options and other noise.

    Removes the listed default 'option=value' pieces, a leading
    'random_seed=...' component, the progressive-curriculum option on
    single-task runs, and every `--remove_strings` piece.  With
    `args.simplify`, paired task names are collapsed to the first one.  A
    'task=...' option directly followed by 'nmaps=...' is swapped so that
    nmaps comes first.
    """
    default_settings = (
        'max_steps=200000',
        'max_steps=40000',
        'max_steps=60000',
        'max_steps=80000',
        'max_steps=100000',
        'forward_max=201',
#        'forward_max=401',
        'max_length=41',
        'time_till_eval=4',
        'always_large=True',
        'do_resnet=False',
        'do_binarization=0.0',
        'do_batchnorm=0',
        'do_shifter=0',
        'progressive_curriculum=False',
        'cutoff_tanh=0.0',
        'input_height=2',
        'batch_size=32',
    )
    for setting in default_settings:
        fname = fname.replace(setting+'-', '')
        if fname.endswith(setting):
            # Also drop the separator in front of a trailing default.
            fname = fname[:-len(setting)-1]

    if fname.startswith('random_seed='):
        fname = fname.split('-', 1)[1]

    if 'task' in fname:
        task_value = fname.split('task=')[1].split('-')[0]
        if len(task_value.split(',')) == 1:
            for level in ['2', '3', '4', '5', 'True']:
                fname = fname.replace('-progressive_curriculum=%s' % level, '')

    if args.simplify:
        collapsed_pairs = [
            ('badd,baddt', 'badd'),
            ('baddt,badd', 'baddt'),
            ('badde,baddet', 'badde'),
            ('baddet,badde', 'baddet'),
            ('baddz,baddzt', 'baddz'),
            ('baddzt,baddz', 'baddzt'),
        ]
        for pair, single in collapsed_pairs:
            fname = fname.replace(pair, single)

    fname = re.sub('(task=[^-]*)-(nmaps=[0-9]*)', r'\2-\1', fname)
    for unwanted in args.remove_strings.split('|'):
        fname = fname.replace(unwanted, '')
    return fname
347 | 
def get_key(fname):
    """Return the grouping key for a run directory name.

    Unless `args.separate_seeds` is set, everything from the first '-seed' on
    is dropped, so runs differing only in seed share a key.  Only the last
    `args.dirs_in_name` path components are kept, then defaults are removed.
    """
    path = fname if args.separate_seeds else fname.split('-seed')[0]
    tail = path.split('/')[-args.dirs_in_name:]
    return remove_defaults('/'.join(tail))
354 | 
def get_prefix(fileset):
    """Return the common prefix of `fileset`, cut after its last '-' or '/'.

    The result is '' when the common prefix contains neither separator.
    """
    common = os.path.commonprefix(fileset)
    # rfind gives -1 when a separator is absent, so `cut` is 0 in that case.
    cut = max(common.rfind('-'), common.rfind('/')) + 1
    return common[:cut]
361 | 
def sort_key_fn(label):
    """Sort key for labels: rewrites 'nmaps=24' as 'nmaps=024'."""
    unpadded, padded = 'nmaps=24', 'nmaps=024'
    return label.replace(unpadded, padded)
364 | 
# (key, task) pairs already found unsuccessful; plot_all skips them.
badkeys = set()
def plot_all(func, scores, column=None, taskset=None, order=None):
    """Group runs by key and hand one combined frame per (key, task) to `func`.

    Args:
        func: callback `func(label, frame)`; `frame` is a DataFrame with one
            column per run, or None when no run has usable data.
        scores: iterable of Scores objects.
        column: name of the column to plot.
        taskset: tasks to include; None includes every task.
        order: optional 1-based positions selecting and ordering the keys.

    Runs whose series is much shorter than the median, or shorter than
    `args.min_length`, are dropped.  With `--success` and column 'len', keys
    where no run ends above 10 are recorded in `badkeys` and skipped.
    """
    by_key = {}
    for s in scores:
        by_key.setdefault(s.key, []).append(s)

    ordered_keys = [key for key in sorted(by_key, key=sort_key_fn)
                    if not matches(key, args.exclude_opts)]
    if order:
        ordered_keys = [ordered_keys[i-1] for i in order]

    def strip_last(c):
        # Drop the final sample when it sits at step 200200.
        if c is None or c.index[-1] != 200200:
            return c
        return c[c.index[:-1]]

    for key in ordered_keys:
        group = by_key[key]
        # The label is taken from the last run of the group.  It is named
        # explicitly instead of relying on a comprehension variable staying
        # bound after the comprehension.
        label_source = group[-1]
        for task in group[0].tasknames:
            if (key, task) in badkeys:
                continue
            if taskset is not None and task not in taskset:
                continue
            columns = [score.get_scores(column, task) for score in group]
            columns = [c for c in columns if c is not None and not c.isnull().all()]
            # A real list: `columns` is iterated several times below.
            columns = [strip_last(c) for c in columns]
            if column == 'len' and args.success:
                if not [c for c in columns if c is not None and c.values[-1] > 10]:
                    badkeys.add((key, task))
                    continue
            if column != 'score':
                median_len = np.median([len(c) for c in columns if c is not None])
                columns = [c for c in columns
                           if len(c) >= median_len / 2 and len(c) >= args.min_length]
            else:
                length_fn = lambda c: c.last_valid_index() // 200
                median_len = np.median([length_fn(c) for c in columns])
                columns = [c for c in columns
                           if length_fn(c) >= median_len / 2
                           and length_fn(c) >= args.min_length and len(c) > 1]
            data = pd.DataFrame(columns).T
            if not len(data):
                func(label_source.args_str(), None)
                continue
            try:
                loc = data.first_valid_index()
            except IndexError:
                continue
            # Start every run at 1 so interpolation has a left anchor.
            data.loc[loc] = data.loc[loc].fillna(1)
            data = data.interpolate(method='nearest')
            func(label_source.args_str(), data)
415 | 
# Legend placement per plotted column; other columns fall back to the
# default given at the lookup site.
legend_locs = {
    'score': 'upper right',
    'len': 'lower right',
    'errors': 'upper right',
}
419 | 
def get_filter(column):
    """Return the predicate marking a value of `column` as successful.

    For 'len' the value must equal 41; for any other column it must be
    below 0.01.
    """
    if column == 'len':
        def reached_target_length(x):
            return x == 41
        return reached_target_length

    def below_one_percent(x):
        return x < 0.01
    return below_one_percent
425 | 
def get_print_results(scores, column, avg=10):
    """Summarise `column` for each task over runs that share one key.

    Returns a dict mapping task to:
      'last': mean of the final `avg` values of each run,
      'first-time': first index at which each run satisfied the column's
          success filter (None if it never did),
      'fraction': share of runs that satisfied the filter at some point.
    Tasks for which no run has the column are left out.
    """
    assert len(set(run.key for run in scores)) == 1
    summary = {}
    for task in scores[0].tasknames:
        series = [run.get_scores(column, task) for run in scores]
        series = [c for c in series if c is not None]
        if not series:
            continue
        last_values = [np.mean(c.values[-avg:]).item() for c in series]
        is_success = get_filter(column)
        hit_times = [c.index[np.where(is_success(c))] for c in series]
        first_time = [t[0].item() if len(t) else None for t in hit_times]
        num_successful = len([x for x in first_time if x is not None])
        summary[task] = {
            'last': last_values,
            'first-time': first_time,
            'fraction': num_successful * 1. / len(hit_times),
        }

    return summary
444 | 
def construct_parsed_data(scores, columns, save_dir):
    """Write one YAML summary file per run key into `save_dir`.

    Runs with fewer than 50000 total steps, or with no step data at all, are
    ignored.  Each file holds a 'metadata' entry (command line of the first
    run, run count, steps of each run) plus the `get_print_results` summary
    for every requested column.
    """
    grouped = {}
    for s in scores:
        steps = s.total_steps()
        # total_steps() is None when the run has no data; comparing None with
        # an int is an error, so treat it explicitly as "too short".
        if steps is None or steps < 50000:
            continue
        grouped.setdefault(s.key, []).append(s)

    for i, key in enumerate(grouped):
        runs = grouped[key]
        ans = {}
        ans['metadata'] = dict(commandline=runs[0].commandline(),
                               count=len(runs),
                               steps=[s.total_steps() for s in runs])
        for col in columns:
            ans[col] = get_print_results(runs, col)
        with open(os.path.join(save_dir, key), 'w') as f:
            print(yaml.safe_dump(ans), file=f)
        print("Done %s/%s" % (i+1, len(grouped)))
463 | 
@recache
def is_valid_dir(f):
    """Return whether directory `f` contains a 'log0' entry."""
    marker = os.path.join(f, 'log0')
    return os.path.exists(marker)
467 | 
# Grid layout shared by successive run_plots calls; created on first use.
gs = None

def run_plots(args, scores, all_tasks, keys):
    """Draw the grid of curves and show the figure or save it.

    There is one grid row per entry of `keys` and one grid column per group
    of `args.overlay` consecutive tasks; tasks in the same group are drawn
    into the same grid cell.  With `args.only_plot` ('<cell>,<target>') only
    the subplot with that 1-based cell number is drawn, at grid position
    `<target>`.

    Args:
        args: parsed command-line namespace.
        scores: list of Scores objects to plot.
        all_tasks: task names, one subplot column each before overlaying.
        keys: column names, one subplot row each.
    """
    global gs
    if args.colorcycle:
        if ',' in args.colorcycle:
            colors = args.colorcycle.split(',')
        else:
            colors = list(args.colorcycle)
        rc('axes', prop_cycle=matplotlib.cycler('color', colors))

    rc('lines', linewidth=args.lw)
    title = args.title
    if not title:
        title = os.path.split(args.files[0])[-2]
    pylab.suptitle(title, size=18)
    axes = [[None for _ in range(len(all_tasks))] for _ in range(len(keys))]

    figkws = {}
    if args.figsize:
        figkws['figsize'] = [int(v) for v in args.figsize.split(',')]
    fig = pylab.figure(1, **figkws)
    task_overlays = args.overlay
    if gs is None:
        # Floor division: the grid needs an integer column count.
        gs = gridspec.GridSpec(len(keys), len(all_tasks) // task_overlays)
    ax = None
    for ki, key in enumerate(keys):
        for i, task in enumerate(all_tasks):
            full_plot_index = ki*len(all_tasks) + i
            plot_index = full_plot_index // task_overlays
            if args.only_plot and plot_index + 1 != int(args.only_plot.split(',')[0]):
                continue
            print("Subplot %s/%s" % (full_plot_index+1, len(all_tasks)*len(keys)))
            sharex = axes[0][i]
            if args.only_plot:
                newloc = int(args.only_plot.split(',')[1])
                ax = fig.add_subplot(gs[newloc-1])
                axes[ki][i] = ax
            else:
                axes[ki][i] = fig.add_subplot(gs[plot_index], sharex=sharex)
            if ki == len(keys)-1 and args.startx:
                plot_startx(key)
            if i == 0 and args.starty:
                plot_starty(key)
            order = get_value(args.order, i)
            if order:
                order = [int(v) for v in order.split(',')]
            plot_all(plot_results, scores, column=key, taskset=[task], order=order)
            # Per-subplot legend: on every subplot without --one-legend,
            # otherwise only on the last row.
            if not args.global_legend and (not args.one_legend or
                                           ki == len(keys)-1):
                pylab.legend(loc=legend_locs.get(key, 0))
            if not args.titles:
                pylab.title('Task %s' % task)
            else:
                pylab.title(args.titles.split('|')[plot_index])
            if key in ('score', 'errors', 'seq-errors'):
                # Show fractions as percentages.
                axes[ki][i].yaxis.set_major_formatter(mtick.FuncFormatter(
                    lambda x, pos: '% 2d\\%%' % (x*100)
                ))
            if args.ylims:
                ylims = [float(v) for v in get_value(args.ylims, ki).split(',')]
            else:
                ylims = (0, 1)
            pylab.ylim(ylims)
            if args.xlims:
                xlims = [float(v) for v in get_value(args.xlims, i).split(',')]
            else:
                xlims = (0, None)
            pylab.xlim(xlims)

            if args.nbinsx:
                pylab.locator_params(axis='x', nbins=int(get_value(args.nbinsx, i)))
            if args.nbinsy:
                pylab.locator_params(axis='y', nbins=int(get_value(args.nbinsy, ki)))
            if args.yticks:
                pylab.yticks([float(v) for v in get_value(args.yticks, ki).split(',')])

            # Show step counts in thousands.
            axes[ki][i].xaxis.set_major_formatter(mtick.FuncFormatter(
                lambda x, pos: '%dk' % (x//1000) if x else '0'
            ))
    rect = [0,0,1,.92]
    if args.global_legend:
        if not args.only_plot:
            ax = [row for row in axes if row[0]][0][0]
        # `ax` stays None when --only-plot selected no subplot in this call;
        # there is then nothing to build a legend from.
        if ax is not None:
            lines, labels = ax.get_legend_handles_labels()
            my_labels = args.global_legend.split('|')
            if my_labels == ['1']:
                my_labels = labels
            if my_labels != ['0']:
                if my_labels != ['2']:
                    fig.legend(lines, my_labels, loc='lower center',
                               ncol=2, labelspacing=0.)
                # Leave room below the plots for the legend.
                rect = [0, 0.1, 1, 0.92]
    gs.tight_layout(fig, rect=rect)
    if args.save_to:
        pylab.savefig(args.save_to)
    else:
        pylab.show()
def get_value(s, i):
    """Pick the `i`-th '|'-separated field of `s`.

    A string without '|' is returned whole, whatever `i` is.
    """
    fields = s.split('|')
    return fields[0] if len(fields) == 1 else fields[i]
568 | 
569 | 
def main():
    """Parse the command line and run the requested job.

    'plot' draws the curves.  'parse' either writes per-key summaries to
    `--savedir` or, without it, prints one YAML summary of all given runs.
    """
    global args
    args = parser.parse_args()
    recache.do_recache = args.recache
    print("Started")
    discovered_tasks = set(task
                           for path in args.files
                           for task in get_tasks(get_key(path)))
    all_tasks = sorted(discovered_tasks)
    if args.task:
        all_tasks = args.task.split(',')
    keys = args.key.split(',')
    prefix = get_prefix(args.files)
    scores = [Scores(path, prefix=prefix)
              for path in args.files if is_valid_dir(path)]
    if args.job == 'plot':
        run_plots(args, scores, all_tasks, keys)
    elif args.job == 'parse':
        if args.savedir:
            construct_parsed_data(scores, keys, args.savedir)
        else:
            ans = {}
            for key in keys:
                ans[key] = get_print_results(scores, key)
            print(yaml.safe_dump(ans))
591 | 
592 | '''
593 | python  get_pretty_score.py cachedlogs/{Jul,A}*/*24*={b,}add{e,z,}*  --task badd,badde,baddz,add,adde,addz --remove_strings '|-progressive_|curriculum=2|curriculum=5' --exclude='forward_max|rx_step|cutoff|binar|grad_noise|t,|dropout|badd,add|batchnorm|resnet'  --min-length 30 --title 'Alignment helps addition' --titles='||Binary addition, 24 filters|'  --xlims='0,30000' --nbinsx=3 --global-legend='Padded|Aligned|Unpadded' --overlay=3 --save-to=moo.pdf --no-startx dump magic1.pickle
594 | python get_pretty_score.py cachedlogs/{Jul,A}*/*128*={b,}add{e,z,}*  --task badd,badde,baddz,add,adde,addz --remove_strings '|-progressive_|curriculum=2|curriculum=5' --exclude='kbadd|qbadd|qadd|3badd|3add|kadd|curric|forward_max|rx_step|cutoff|binar|grad_noise|t,|dropout|badd,add|curriculum|resnet|batchn'  --min-length 30 --title 'Alignment helps addition' --titles='Binary, 128 filters|Decimal, 128 filters||'  --xlims='0,30000' --nbinsx=3  --overlay=3 --save-to=moo.pdf --global-legend='Padded|Aligned|Unpadded' dump magic2.pickle
595 | 
596 | python  get_pretty_score.py cachedlogs/{Jul,A}*/*24*=bmul{e,z,}-*  --task mul,mule,mulz,bmul,bmule,bmulz --remove_strings '|-progressive_|curriculum=2|curriculum=5|max_steps=80000-' --exclude='forward_max|rx_step|cutoff|binar|grad_noise|t,|dropout|batchn|resn|layer'  --min-length 30 --title 'Alignment hurts multiplication'  --overlay=3 --global-legend=2  --titles '|||Binary multiplication, 24 filters' --no-startx --xlims='0,100000' --save-to=moo.pdf dump magic3.pickle
597 | 
598 | python  get_pretty_score.py cachedlogs/{Jul,A}*/*128*=bmul{e,z,}-*  --task mul,mule,mulz,bmul,bmule,bmulz --remove_strings '|-progressive_|curriculum=2|curriculum=5|max_steps=80000-' --exclude='forward_max|rx_step|cutoff|binar|grad_noise|t,|dropout|batchn|resn|layer'  --min-length 30 --title 'Alignment helps addition, hurts multiplication'  --overlay=3 --global-legend=2  --titles '|||Binary multiplication, 128 filters' --save-to=moo.pdf dump magic4.pickle
599 | 
600 | '''
if __name__ == '__main__':
    # Check the length first so that running without arguments reaches
    # argparse's usage error instead of failing on sys.argv[1].
    if len(sys.argv) > 1 and sys.argv[1] == 'magic':
        # Replay the four argument lists saved by 'dump', one grid cell each.
        for i, loc in enumerate(['3,1', '3,3', '4,2', '4,4']):
            # NOTE: pickle.load runs arbitrary code from the file; only use
            # pickles written by the 'dump' mode below.
            with open('magic%s.pickle' % (i+1), 'rb') as f:
                saved_args = pickle.load(f)
            sys.argv[1:] = saved_args + ['--only-plot', loc]
            main()
        sys.exit()
    # '... dump <file>': save the preceding arguments instead of plotting.
    if len(sys.argv) > 2 and sys.argv[-2] == 'dump':
        loc = sys.argv.pop()
        sys.argv.pop()
        with open(loc, 'wb') as f:
            pickle.dump(sys.argv[1:], f)
        sys.exit()
    main()
613 | 


--------------------------------------------------------------------------------