├── neuralgpu ├── __init__.py ├── config.py ├── data_utils.py ├── curriculum.py ├── records.py ├── mytf.py ├── model.py ├── generators.py └── trainer.py ├── .gitignore ├── neural_gpu_trainer.py ├── examples └── loading_and_using_model.py ├── plots ├── plot_carry_thresholds.py ├── carries.py ├── paper_carries3.py ├── large_carries.py ├── construct_table.py └── get_pretty_score.py └── README.md /neuralgpu/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *~ 3 | *.pdf 4 | *.aux 5 | *.log 6 | *.pickle 7 | *.csv 8 | -------------------------------------------------------------------------------- /neural_gpu_trainer.py: -------------------------------------------------------------------------------- 1 | """Start and train the NeuralGPU. 2 | 3 | See neuralgpu/trainer.py for flags and more information. 
def main(_):
    """Start NeuralGPU training.

    This is the function handed control by the ``tf.app.run()`` call in the
    ``__main__`` guard of this script.  Its single positional argument is
    ignored; all configuration is read by ``trainer.start_and_train()``.
    """
    trainer.start_and_train()
class NeuralConfig(object):
    """Snapshot of the hyper-parameters used to build a model.

    Every name listed in ``config_keys`` is copied onto the instance.  A
    keyword argument of that name wins; otherwise the attribute of the same
    name on ``FLAGS`` is used, or ``None`` when ``FLAGS`` does not have it.
    """

    config_keys = [
        'nmaps', 'niclass', 'noclass', 'dropout', 'rx_step', 'max_grad_norm',
        'cutoff', 'nconvs', 'kw', 'kh', 'height', 'mode', 'lr', 'pull',
        'pull_incr', 'min_length', 'batch_size', 'task', 'init_weight',
        'curriculum_bound', 'layer_scale',
    ]

    def __init__(self, FLAGS, **kws):
        """Copy the configuration out of ``FLAGS``, applying ``kws`` overrides."""
        for name in self.config_keys:
            # The flag is read even when an override exists, exactly as a
            # ``dict.get(name, default)`` call would evaluate its default.
            flag_value = getattr(FLAGS, name, None)
            setattr(self, name, kws[name] if name in kws else flag_value)

        # The shortest length is pinned to 5 here, replacing whatever value
        # was copied for the 'min_length' key above.  The longest length is
        # capped by the largest padding bin.
        shortest = 5
        longest = min(FLAGS.max_length, data_utils.bins[-1])
        assert longest + 1 > shortest
        self.max_length = longest
        self.min_length = shortest

    def __str__(self):
        """Return a one-line summary: optimisation settings, then architecture."""
        arch_fields = (self.nconvs, self.kw, self.height, self.kh,
                       self.rx_step, self.batch_size, self.task)
        arch = "layers %d kw %d h %d kh %d relax %d batch %d task %s" % arch_fields
        opt_fields = (self.cutoff, self.pull_incr, self.lr, self.init_weight,
                      self.curriculum_bound, self.nmaps, self.dropout,
                      self.max_grad_norm, arch)
        return ("cut %.2f pull %.3f lr %.2f iw %.2f cr %.2f nm %d d%.4f gn %.2f %s"
                % opt_fields)
3 | 4 | 5 | Running experiments 6 | =================== 7 | 8 | Running one instance 9 | -------------------- 10 | 11 | The following would use 256 filters to train on binary multiplication, 12 | then 4-ary, then decimal: 13 | ``` 14 | python neural_gpu_trainer.py --nmaps=256 --task=bmul,qmul,mul --progressive_curriculum=5 15 | ``` 16 | 17 | My typical invocation is something like 18 | 19 | ``` 20 | CUDA_VISIBLE_DEVICES=0 python neural_gpu_trainer.py --random_seed=0 --max_steps=200000 --forward_max=201 --nmaps=256 --task=bmul,qmul,mul --time_till_eval=4 --progressive_curriculum=5 --train_dir=../logs/August-12-curriculum/forward_max=201-nmaps=256-task=bmul,qmul,mul-progressive_curriculum=5-random_seed=0 21 | ``` 22 | 23 | The tests on decimal carry were done using invocations like the following: 24 | ``` 25 | CUDA_VISIBLE_DEVICES=0 python neural_gpu_trainer.py --train_dir=../logs/run1 --random_seed=1 --max_steps=100000 --forward_max=201 --nmaps=128 --task=add --time_till_eval=4 --time_till_ckpt=1 26 | ``` 27 | 28 | You can find a list of options, and their default values, in `neuralgpu/trainer.py`. 29 | 30 | Examining results 31 | ================= 32 | 33 | Loading and examining a model 34 | ----------------------------- 35 | 36 | `examples/loading_and_using_model.py` gives a simple example of loading a 37 | model and running it on a batch of inputs. 38 | 39 | Plotting results 40 | ---------------- 41 | 42 | Something like `python plots/get_pretty_score.py cachedlogs/*/*task=bmul,qmul,mul-*` works. There are a lot of options to make it prettier (renaming stuff, removing some runs, changing titles, reordering, etc.). 
def get_generator(base, sep, aligned=False, randloc=False):
    """Build a generator class producing additions with one long carry chain.

    Args:
        base: numeric base of the addition problems.
        sep: separator symbol forwarded to the parent generator's constructor.
        aligned: derive from ``generators.AlignedOpGenerator`` instead of
            ``generators.OpGenerator``.
        randloc: default for whether the carry block is shifted to a random
            digit position instead of starting at the lowest digit.

    Returns:
        The ``CarryGenerator`` class (not an instance).
    """
    base_class = generators.AlignedOpGenerator if aligned else generators.OpGenerator

    class CarryGenerator(base_class):
        """Addition problems whose operands sum to ``base**carry`` (or one less)."""

        def __init__(self, carry, overflow, randloc=randloc, base=base, sep=sep,
                     zero_pad=True):
            # NOTE(review): ``zero_pad`` is accepted but never used here.
            super(CarryGenerator, self).__init__(base, operator.add, sep)
            self.carry = carry
            self.overflow = overflow
            self.randloc = randloc

        def _rand_inputs(self, k):
            """Return two operands, in random order, for a length-``k`` problem.

            With ``overflow`` the operands sum to exactly ``base**carry``;
            without it they sum to ``base**carry - 1``.
            """
            n1 = random.randint(1 if self.overflow else 0, self.base**self.carry - 1)
            n2 = self.base**self.carry - n1 - (0 if self.overflow else 1)
            loc = random.randint(0, k - self.carry) if self.randloc else 0
            vals = [n1 * self.base**loc, n2 * self.base**loc]
            if random.random() > .5:
                return vals
            return vals[::-1]

        @classmethod
        def get_error_rate(cls, sess, model, carry_length, do_overflow, max_length, num):
            """Run one batch and return the third element of ``result.accuracy()``.

            ``max_length=None`` selects ``2*carry_length + 3``.
            """
            if max_length is None:
                max_length = 2*carry_length + 3
            example = cls(carry_length, do_overflow).get_batch(max_length, num)
            result = model.step(sess, example, False)
            return result.accuracy()[2]

        @classmethod
        def get_rates(cls, sess, model, carries, max_length=201, numblocks=1,
                      blocksize=32, verbose=True):
            """Tabulate error counts per carry length, without and with overflow.

            Returns:
                A ``pandas.DataFrame`` indexed by ``carries`` with the two
                columns ``False`` and ``True`` (the ``do_overflow`` setting).
                Each cell is the sum of ``get_error_rate`` over ``numblocks``
                batches of ``blocksize`` examples.
            """
            carries = list(carries)
            columns = [False, True]
            # Rows are collected in plain lists and the frame is built once.
            # The previous ``df[col][carry] = ans`` was chained assignment,
            # which pandas does not guarantee to write through to ``df``.
            rows = []
            for carry in carries:
                row = []
                for do_overflow in columns:
                    total = 0
                    for _ in range(numblocks):
                        total += cls.get_error_rate(sess, model, carry, do_overflow,
                                                    max_length, blocksize)
                    row.append(total)
                rows.append(row)
                if verbose:
                    print(carry, ':', row[0], row[1])
            return pandas.DataFrame(rows, index=carries, columns=columns, dtype=object)

    return CarryGenerator
/ data.values.max() 27 | data = data[:60] 28 | 29 | my_kws = dict(marker='o', ms=5) 30 | k1 = my_kws.copy() 31 | k1.update(kws1) 32 | #k2 = my_kws.copy() 33 | #k2.update(label="Result barely carries") 34 | #k2.update(kws2) 35 | #pylab.plot(data['False'], **k1) 36 | #pylab.plot(data['True'], **k2) 37 | pylab.plot((data['True'] + data['False'])/2, **k1) 38 | 39 | 40 | if vertical: 41 | pylab.figure(figsize=(4,4)) 42 | orientation = (2,1) 43 | else: 44 | pylab.figure(figsize=(6,3)) 45 | orientation = (1,2) 46 | 47 | pylab.subplot(*(orientation+(1,))) 48 | make_plot('csv/carry_errors_big.csv', 49 | dict(label='Train on random examples'), 50 | ) 51 | make_plot('csv/carry_errors_big.baddt.csv', 52 | dict(label='Train with some hard examples'), 53 | ) 54 | #pylab.xlabel("Number of carries $k$") 55 | #pylab.legend(loc=0, prop={'size': legsize}) 56 | pylab.gca().yaxis.set_major_formatter(mtick.FuncFormatter( 57 | lambda x, pos: '% 2d%%' % (x*100))) 58 | if not vertical: 59 | pylab.xlabel("Number of carries $k$") 60 | pylab.ylabel("Error rate") 61 | pylab.title("Binary", size=titlesize) 62 | pylab.locator_params(axis='y',nbins=4)# 63 | 64 | pylab.subplot(*(orientation+(2,))) 65 | make_plot('csv/carry_errors_add_large.csv', 66 | dict(label='Train on random examples'), 67 | ) 68 | make_plot('csv/carry_errors_addt_large.csv', 69 | dict(label='Train with some hard examples'), 70 | ) 71 | pylab.xlabel("Number of carries $k$") 72 | #pylab.legend(loc=0, prop={'size': legsize}) 73 | if vertical: 74 | pylab.ylabel("Error rate") 75 | pylab.gca().yaxis.set_major_formatter(mtick.FuncFormatter( 76 | lambda x, pos: '% 2d%%' % (x*100))) 77 | pylab.title("Decimal", size=titlesize) 78 | pylab.suptitle("Additions with long carries", size=supsize) 79 | pylab.locator_params(axis='y',nbins=4)# 80 | pylab.gcf().legend(*pylab.gca().get_legend_handles_labels(), loc='lower center', ncol=1, labelspacing=0) 81 | if vertical: 82 | pylab.tight_layout(rect=[0, 0.14, 1, 0.95]) 83 | else: 84 | 
pylab.tight_layout(rect=[0, 0.18, 1, 0.93]) 85 | if vertical: 86 | pylab.savefig('../neuralgpu_paper/fig_carries_all_vertical.pdf') 87 | else: 88 | pylab.savefig('../neuralgpu_paper/fig_carries_all_horizontal.pdf') 89 | -------------------------------------------------------------------------------- /neuralgpu/data_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Utilities for the NeuralGPU 16 | 17 | This file has two main components: 18 | 19 | - generators is a dict mapping task names to DataGenerator instances, which construct individual problem input/output pairs 20 | - Utilities for converting those input/output pairs to/from string representations. 21 | """ 22 | 23 | import math 24 | import random 25 | import sys 26 | import time 27 | import operator 28 | import functools 29 | import numpy as np 30 | import tensorflow as tf 31 | 32 | from tensorflow.python.platform import gfile 33 | 34 | FLAGS = tf.app.flags.FLAGS 35 | 36 | # Lengths of NeuralGPU instances. Inputs will be padded to the next 37 | # larger one. 
bins = [8, 12, 16, 20, 24, 28, 32, 36, 40, 48, 64, 128]
forward_max = 128
log_filename = ""


# Symbol ids.  Id 0 is padding; ids 1-10 are rendered as the digits 0-9
# (see char_to_symbol).
DIGITS = range(1, 11)
NULL = 0
DUP = 22
SPACE = 23
START = 24
MINUS = 25


def pad(l):
    """Return the smallest bin (or ``forward_max``) that is >= ``l``.

    Raises:
        IndexError: if ``l`` exceeds every bin and ``forward_max``.
    """
    for b in bins + [forward_max]:
        if b >= l:
            return b
    raise IndexError("Length %s longer than max length %s" % (l, forward_max))


@np.vectorize
def char_to_symbol(i):
    """Convert symbol ids to their one-character text form (elementwise)."""
    i = int(i)
    if i == 0: return "_"
    if i in [11, 12, 13]: return "+"
    if i in [14, 15, 16]: return "*"
    if i in [17, 18, 19]: return "/"
    if i in [20, 21, 22]: return "-"
    if i in [START]: return '^'
    if i in [SPACE]: return '.'
    if i in [MINUS]: return '-'
    return str(i - 1)


def join_array(array, rev=False):
    """Join a 1-D array of characters into a string, or a 2-D one into lines.

    ``rev`` reverses the character order of each row before joining.

    Raises:
        ValueError: if the array is neither 1-D nor 2-D.
    """
    if len(array.shape) == 1:
        if rev:
            array = array[::-1]
        return ''.join(array).rstrip(' ')
    elif len(array.shape) == 2:
        if rev:
            array = array[:, ::-1]
        return '\n'.join([''.join(a).rstrip(' ') for a in array])
    else:
        raise ValueError("Weird shape for joining: %s" % array.shape)


def to_string(array, rev=True):
    """Render an id array, an (input, output) pair, or a batch triple as text.

    A 3-tuple is treated as a batch: its first two members are zipped and
    each pair rendered separately.  Any other tuple is rendered as input,
    a dashed rule as wide as the first input line, then output.
    """
    if isinstance(array, tuple):
        if len(array) == 3:  # Batches
            inp, outp = array[:2]
            return '\n\n'.join(to_string((i, o), rev) for i, o in zip(inp, outp))
        inp, outp = [to_string(a, rev) for a in array[:2]]
        return '%s\n%s\n%s' % (inp, '-' * len(inp.split('\n')[0]), outp)
    return join_array(char_to_symbol(array), rev=rev)


@np.vectorize
def to_id(s):
    """Convert text to symbol ids (elementwise); only digits, '+' and '*'."""
    if s == "+": return 11
    if s == "*": return 14
    return int(s) + 1


class TeeErr(object):
    """File-like object that copies everything written to stderr into a file.

    Constructing an instance installs it as ``sys.stderr``; the previous
    stream is kept in ``self.stderr`` and still receives every write.
    """

    def __init__(self, f):
        self.file = f
        self.stderr = sys.stderr
        sys.stderr = self

    def write(self, data):
        self.file.write(data)
        self.file.flush()
        self.stderr.write(data)

    def flush(self):
        """Flush both targets; callers routinely call ``sys.stderr.flush()``."""
        self.file.flush()
        self.stderr.flush()

    def __getattr__(self, name):
        # Only reached for attributes this object lacks (isatty, encoding,
        # ...): defer to the stream that was replaced.  The guard avoids
        # infinite recursion if the instance attributes are not set yet.
        if name in ('file', 'stderr'):
            raise AttributeError(name)
        return getattr(self.stderr, name)


log_f = None


def print_out(s, newline=True):
    """Print a message out and log it to file."""
    global log_f
    if log_filename:
        try:
            if log_f is None:
                log_f = open(log_filename, 'a', 1)
            log_f.write(s + ("\n" if newline else ""))
        # Logging is best effort, but KeyboardInterrupt/SystemExit must
        # still propagate, hence Exception rather than a bare except.
        except Exception:
            sys.stdout.write("Error appending to %s\n" % log_filename)
    sys.stdout.write(s + ("\n" if newline else ""))
    sys.stdout.flush()


def safe_exp(x):
    """Return ``exp(x)`` clipped to at most 10000 (also used for x >= 100)."""
    perp = 10000
    if x < 100: perp = math.exp(x)
    if perp > 10000: return 10000
    return perp


def load_class(name):
    """Import and return the object named by a dotted path ``pkg.mod.Attr``.

    ``importlib.import_module`` returns the module that was named.  The bare
    ``__import__`` used before returns the top-level package, so any path
    with a nested module failed with AttributeError.
    """
    import importlib
    modulename, classname = name.rsplit('.', 1)
    module = importlib.import_module(modulename)
    return getattr(module, classname)
dir = None
model = None
sess = None

# File name of the checkpoint every run directory is expected to contain.
_CHECKPOINT = 'neural_gpu.ckpt-100000'


def load_model(dir):
    """Load the checkpoint in ``dir`` into the module-level ``model``/``sess``.

    The first call creates the session and builds the model; later calls
    only restore the variables of the new checkpoint into the same model.
    """
    global model, sess
    reconfig = {'mode': 1,  # No backprop
                'forward_max': 401}  # Large enough to check 200-digit carries
    if model is None:
        sess = tf.Session()
        model = trainer.load_model(sess, dir, reconfig)
    else:
        model.saver.restore(sess, dir + '/' + _CHECKPOINT)


def find_dirs(base_dir='../logs', check_file='carries.csv'):
    """Find all checkpoint directories that haven't been updated for check_file.

    Yields every ``base_dir/September-*/*`` directory that contains the
    final checkpoint but not yet a file named ``check_file``.
    """
    for one in glob.glob(base_dir + '/September-*'):
        for full_dir in glob.glob(one + '/*'):
            if not os.path.exists(full_dir + '/' + _CHECKPOINT):
                continue
            if not os.path.exists(os.path.join(full_dir, check_file)):
                yield full_dir


locs = list(range(1, 30)) + list(range(30, 100, 5))


def get_data(dir, locs=locs):
    """Load the model in ``dir`` and return its error table over ``locs``."""
    load_model(dir)
    results = CarryGenerator.get_rates(sess, model, locs, 201 if randloc else None, 1)
    return results


def run_dir(dir):
    """Compute the carry table for ``dir`` and write it to ``dir/carries.csv``.

    A FailedPreconditionError from TensorFlow is reported and the directory
    skipped, so one bad checkpoint does not stop the whole sweep.
    """
    try:
        results = get_data(dir)
    # TensorFlow is only bound as ``tf`` in this module.  The previous
    # ``tensorflow.python.framework.errors...`` spelling raised NameError
    # as soon as any exception reached this handler.
    except tf.errors.FailedPreconditionError as e:
        print('ERROR ON DIR', dir, file=sys.stderr)
        print()
        print(e)
        print()
        return
    with open(dir + '/carries.csv', 'w') as f:
        f.write(results.to_csv())


def bsearch(is_leq, lo=1, hi=None):
    """Return the smallest integer ``n >= lo`` for which ``is_leq(n)`` is true.

    ``is_leq`` must be monotone (false, ..., false, true, ...).  When ``hi``
    is None the upper bound is found by repeated doubling, starting at
    ``2*lo``.
    """
    if hi is None:
        hi = 2*lo
        while not is_leq(hi):
            lo, hi = hi + 1, 2*hi
    while lo < hi:
        mid = (lo + hi)//2
        if is_leq(mid):
            hi = mid
        else:
            lo = mid + 1
    return lo


def find_threshold():
    """Return the shortest carry length whose estimated error rate is >= 50%."""
    def is_leq(n):
        def get_estimate(blocksize):
            return sum(CarryGenerator.get_error_rate(sess, model, n, truth, None, blocksize)
                       for truth in [False, True]) * 1./(2*blocksize)

        blocksize = 32
        result = get_estimate(blocksize)
        print(n, result)
        # Be extra careful once we get close
        if abs(result - .5) < .2:
            result = np.mean([result] + [get_estimate(blocksize) for _ in range(2)])
            print('Refined estimate:', result)
        return result >= .5
    return bsearch(is_leq)


def main_results():
    """Write ``carries.csv`` for every run directory that lacks one."""
    for dir in find_dirs():
        print('Checking', dir)
        run_dir(dir)


def main_thresholds(fname='threshold'):
    """Write the 50%-failure carry length to ``fname`` in every directory lacking it."""
    for dir in find_dirs(check_file=fname):
        print('Checking', dir)
        load_model(dir)
        thresh = find_threshold()
        with open(os.path.join(dir, fname), 'w') as f:
            print(thresh, file=f)
| import argparse 9 | import glob 10 | import scipy.signal 11 | import os 12 | import yaml 13 | 14 | import collections 15 | 16 | parser = argparse.ArgumentParser(description='Get scores') 17 | 18 | parser.add_argument("--metric", type=str, default='score') 19 | parser.add_argument("--dir", type=str, default='/home/ecprice/large/research/neural_gpu/neural_parsed_logs/newer') 20 | parser.add_argument("--curr", type=bool, default=True) 21 | parser.add_argument("tasks", type=str, nargs='*') 22 | args = parser.parse_args() 23 | 24 | def groupby(lst, num): 25 | ans = [] 26 | for i in range(0, len(lst), num): 27 | yield lst[i:i+num] 28 | 29 | class Run(dict): 30 | @property 31 | def metadata(self): 32 | return self['metadata'] 33 | 34 | def options(self): 35 | cmd = self.metadata['commandline'] 36 | lst = [] 37 | for arg in cmd[1:]: 38 | a, b = arg.split('=', 1) 39 | lst.append((a.lstrip('-'), b)) 40 | args = collections.OrderedDict(lst) 41 | return args 42 | 43 | @property 44 | def tasks(self): 45 | return self.options()['task'].split(',') 46 | 47 | @property 48 | def version(self): 49 | d = self.options() 50 | for k in 'train_dir task forward_max random_seed max_steps'.split(): 51 | del d[k] 52 | mapping = {'progressive_curriculum': 'curr'} 53 | for k, v in mapping.items(): 54 | d[v] = d[k] 55 | del d[k] 56 | return '-'.join('%s=%s' % (a, b) for (a, b) in d.items()) 57 | 58 | def get_value(self, metric): 59 | if '.' 
in metric: 60 | metric, key = metric.split('.') 61 | else: 62 | key = 'fraction' 63 | data = self[metric][self.task] 64 | res = data[key] 65 | count = len(data['last']) 66 | if isinstance(res, list): 67 | res = int(np.median([x or 200000 for x in data[key]]) / 100) 68 | if res == 2000: 69 | res = np.inf 70 | return (res, count) 71 | 72 | def value_to_str(val): 73 | if val == np.inf: 74 | res = '$\\infty$' 75 | elif val is None: 76 | res = '-' 77 | elif isinstance(val, float): 78 | res = str(int(val*100)) + r'\%' 79 | else: 80 | res = str(val) 81 | return res 82 | 83 | def pair_to_str(pair): 84 | if pair is None: 85 | return '-' 86 | else: 87 | return '%s (%s)' % (value_to_str(pair[0]), pair[1]) 88 | 89 | def load_all_data(dirname): 90 | files = glob.glob(os.path.join(dirname, '*')) 91 | results = [] 92 | for fname in files: 93 | with open(fname) as f: 94 | results.append(Run(yaml.load(f))) 95 | return results 96 | 97 | all_runs = load_all_data(args.dir) 98 | 99 | if not args.tasks: 100 | s = set([run.task for run in all_runs]) 101 | print('Need task name. 
def build_table(all_runs, tasks):
    """Group runs into ``{version: {task: run}}``, keeping only ``tasks``.

    Asserts that no (version, task) combination occurs twice.
    """
    table = {}
    for run in all_runs:
        if run.task not in tasks:
            continue
        by_task = table.setdefault(run.version, {})
        assert run.task not in by_task
        by_task[run.task] = run
    return table
class Curriculum(object):
    """Chooses the task and the problem length for each training batch.

    Every generator is given a ``taskid`` equal to its position in
    ``generators``.  ``max_cur_lengths`` maps that id to the longest length
    currently drawn "within bounds" for the task.
    """

    def __init__(self, generators, model_config):
        self.generators = generators
        for i, g in enumerate(generators):
            g.taskid = i

        self.min_length = model_config.min_length
        self.max_length = model_config.max_length
        self.model_config = model_config
        self.max_cur_lengths = {g.taskid: min(self.min_length+3, self.max_length)
                                for g in generators}

    def draw_length(self, cur_length, generator):
        """Draw a random length that ``generator`` accepts.

        Returns:
            ``(length, within_bounds)`` where ``within_bounds`` tells whether
            the length does not exceed ``cur_length``.
        """
        l = None
        while l is None:
            # Select the length for curriculum learning.
            l = np.random.randint(self.min_length, cur_length + 1)
            if np.random.randint(100) < 60:  # Prefer longer stuff 60% of time.
                l = max(l, np.random.randint(self.min_length, cur_length + 1))
            # Mixed curriculum learning: in 25% of cases go to an even larger length.
            if np.random.randint(100) < 25:
                l = max(l, np.random.randint(self.min_length, self.max_length + 1))

            if not generator.is_valid_length(l):
                l = None

        within_bounds = (l <= cur_length)
        return l, within_bounds

    def get_generator_for_task(self, task):
        """Return the first generator named ``task`` (IndexError if none)."""
        return [g for g in self.generators if g.name == task][0]

    def test_examples(self, batch_size, task):
        """Yield ``(batch, length)`` for every valid length of ``task``."""
        generator = self.get_generator_for_task(task)
        for l in np.arange(self.min_length, self.max_length + 1):
            if generator.is_valid_length(l):
                yield (generator.get_batch(l, batch_size), l)

    def draw_example(self, batch_size, l=None, task=None):
        """Draw a random example.

        Returns ``(batch, within_bounds)``; an explicitly requested length
        ``l`` always counts as within bounds.
        """
        generator = self.draw_generator(task)
        if l is None:
            cur_length = self.get_cur_length(generator)
            l, within_bounds = self.draw_length(cur_length, generator)
        else:
            within_bounds = True
        return (generator.get_batch(l, batch_size), within_bounds)

    def tasks(self):
        """List of task names"""
        return [g.name for g in self.generators]

    def draw_generator(self, task=None):
        """Pick a generator uniformly, restricted to ``task`` when given."""
        options = (self.generators if task is None else
                   [g for g in self.generators if g.name == task])
        return np.random.choice(options)

    def get_cur_length(self, generator):
        return self.max_cur_lengths[generator.taskid]

    # An earlier no-op ``consider_extending(self, results)`` stub was defined
    # before this method and immediately shadowed by it; the dead stub has
    # been dropped.
    def consider_extending(self, record):
        """Apply ``consider_extending_for_task`` to every task in ``record``.

        Returns the largest per-task result, or False when there are none.
        """
        ans = False
        for t in record.record_for_task:
            ans = max(ans, self.consider_extending_for_task(record.record_for_task[t], t))
        return ans

    def consider_extending_for_task(self, record, taskid):
        """Lengthen the curriculum of one task if it is doing well enough.

        Returns:
            0 if the sequence error is above ``curriculum_bound``, 2 if the
            length was extended, 1 if the task is already at ``max_length``.
        """
        if record.avg_seq_err > self.model_config.curriculum_bound:
            return 0
        if self.max_cur_lengths[taskid] < self.max_length:
            # Skip lengths that this task's own generator cannot produce.
            # This used to consult ``self.generators[0]`` for every task,
            # which is wrong when tasks accept different lengths.
            generator = self.generators[taskid]
            self.max_cur_lengths[taskid] += 1
            while (not generator.is_valid_length(self.max_cur_lengths[taskid])
                   and self.max_cur_lengths[taskid] < self.max_length):
                self.max_cur_lengths[taskid] += 1
            return 2
        return 1

    @property
    def length_str(self):
        """Current per-task lengths joined by '/', ordered by task id."""
        return '/'.join(str(v) for k, v in sorted(self.max_cur_lengths.items()))


class GeneralizeCurriculum(Curriculum):
    """Curriculum that trains only on the first generator."""

    def draw_generator(self, task=None):
        options = (self.generators[:1] if task is None else
                   [g for g in self.generators if g.name == task])
        return options[0]

    @property
    def length_str(self):
        return str(self.max_cur_lengths[self.generators[0].taskid])


class BetterCurriculum(Curriculum):
    """Curriculum that concentrates on the first task not yet at full length.

    ``kind`` selects a variant: 2 lowers ``decrease_threshold`` to 0.01,
    3 sets ``rand_prob`` to 0, 4 sets ``only_later``, and 5 sets both
    ``only_later`` and ``last_if_solved``.
    """
    rand_prob = 0.2
    only_later = False
    decrease_threshold = 1
    last_if_solved = False

    def __init__(self, generators, model_config, kind):
        super(BetterCurriculum, self).__init__(generators, model_config)
        if kind == 2:
            self.decrease_threshold = 0.01
        elif kind == 3:
            self.rand_prob = 0
        elif kind == 4:
            self.only_later = True
        elif kind == 5:
            self.only_later = True
            self.last_if_solved = True

    def draw_generator(self, task=None):
        """Pick the generator to train on next.

        With probability ``1 - rand_prob`` this is the first task that has
        not reached ``max_length``; otherwise a random one (drawn from the
        unsolved tasks only when ``only_later`` is set).
        """
        if task is not None:
            return [g for g in self.generators if g.name == task][0]
        unsolved = [g for g in self.generators
                    if self.max_cur_lengths[g.taskid] < self.max_length]
        if not unsolved:
            if self.last_if_solved:
                return self.generators[-1]
            else:
                return np.random.choice(self.generators)
        if np.random.random() > self.rand_prob:
            return unsolved[0]
        if self.only_later:
            return np.random.choice(unsolved)
        else:
            return np.random.choice(self.generators)

    def consider_extending_for_task(self, record, taskid):
        """Like the base method, but may also shorten a task that regressed."""
        if (self.max_cur_lengths[taskid] == self.max_length and
                record.avg_seq_err > self.decrease_threshold):
            self.max_cur_lengths[taskid] -= 1
            return 0
        if record.avg_seq_err > self.model_config.curriculum_bound:
            return 0
        val = super(BetterCurriculum, self).consider_extending_for_task(record, taskid)
        # Don't stop us from decreasing learning rate here
        if (self.max_cur_lengths[taskid] == self.max_length):
            return 0
        return val
class NeuralGPUResult(object):
    """Record of the result of a single batch, which is always on one task.

    Every key of the ``vals`` dict passed to the constructor becomes an
    attribute of the instance; the class-level ``None`` values below are the
    fallbacks for keys that were not fetched.
    """
    grad_norm = None
    back_update = None
    loss = None
    output = None
    layers = None
    attention = None

    def __init__(self, vals, inp, target, taskid):
        self.feed_out = vals
        # Expose each fetched value (loss, output, ...) as an attribute.
        self.__dict__.update(vals)
        self.input = inp
        self.target = target
        self.taskid = taskid

    def accuracy(self, nprint=0):
        """Return (#wrong symbols, #scored symbols, #sequences with an error).

        Only positions whose target is > 0 are scored.  ``nprint`` is unused
        and kept for interface compatibility.
        """
        mask = self.target > 0
        errors = mask * (self.target != np.argmax(self.output, axis=-1))
        return np.sum(errors), np.sum(mask), np.sum(np.any(errors, axis=1))

    @property
    def length(self):
        # NOTE(review): counts positive entries of the first example along
        # axis 1 at index 0 of the last axis -- confirm this is the axis that
        # holds the sequence length for the arrays stored here.
        return (self.input[0,:,0] > 0).sum()

    @property
    def batch_size(self):
        return len(self.input)

    def __repr__(self):
        err, tot, seq_err = self.accuracy()
        # The format string used to be empty, which made every repr() call
        # raise "TypeError: not all arguments converted".
        return '<NeuralGPUResult: length=%s loss=%s bs=%s err=%s seq_err=%s>' % \
            (self.length, self.loss, self.batch_size, err, seq_err)

    def attention_by_layer(self):
        """Mean of ``self.attention`` over its last axis, rounded to 3 places."""
        return self.attention.mean(axis=-1).round(3)

    def to_string(self, i=None):
        """Render example ``i`` (or every example when ``i`` is None) as text.

        Lines are: input, a dashed rule, predicted output, target, and, if a
        ``probs`` attribute was fetched, the rounded probabilities.
        """
        if i is None:
            return '\n\n'.join(self.to_string(i) for i in range(self.batch_size))
        inp, outp, targ = map(data_utils.to_string,
                              (self.input[i], self.output[i].argmax(axis=-1), self.target[i]))
        ans = '\n'.join([inp, '-'*len(outp), outp, targ])
        if hasattr(self, 'probs'):
            ans = '%s\n%s' % (ans, self.probs[:,i].round(3))
        return ans

    def plot_attention(self, figname):
        """Plot every ``attention[:, j, i]`` curve and save the figure."""
        import pylab
        for i in range(self.attention.shape[2]):
            for j in range(self.attention.shape[1]):
                pylab.plot(self.attention[:,j,i], color='rbgkyo'[j], alpha=0.2, marker='o')
        pylab.savefig(figname)
class ResultsRecord(object):
    """Result from many runs of training, on many tasks.

    Keeps one per-task accumulator in ``record_for_task``, keyed by task id.
    """
    def __init__(self, batch_size):
        self.batch_size = batch_size
        self.record_for_task = {}

    def feed(self, results, step_time, below_curriculum):
        """Add one batch result to the accumulator of its task.

        All entries of ``results.taskid`` must be the same task.
        """
        taskid = results.taskid[0]
        assert(not(np.any(results.taskid != taskid)))
        if taskid not in self.record_for_task:
            self.record_for_task[taskid] = ResultsRecordPerTask(self.batch_size)
        self.record_for_task[taskid].feed(results, step_time, below_curriculum)

    def __str__(self):
        """One-line summary; per-task values of each statistic joined by '/'."""
        records = list(self.record_for_task.values())

        def fmt_attr(name, fmt, label, scale=1):
            return label + ' ' + '/'.join(fmt % (getattr(v, name)*scale)
                                          for v in records)
        stat_list = [fmt_attr('avg_ppx', '%.8f', 'ppx'),
                     fmt_attr('avg_grad_norm', '%.8f', 'grad-norm'),
                     fmt_attr('avg_step_time', '%s', 'step-time'),
                     fmt_attr('avg_err', '%.2f', 'errors', 100),
                     fmt_attr('avg_seq_err', '%.2f', 'seq-errors', 100),
                     ]
        # Only report the binary gap when every task recorded one: an empty
        # record must not raise StopIteration, and a task without the
        # attribute must not raise AttributeError half-way through the line.
        if records and all(hasattr(v, 'binary_gap') for v in records):
            stat_list.append(fmt_attr('avg_binary_gap', '%.3f', 'binary-gap'))
        return ' '.join(stat_list)
def broadcast_as(origin, target, axes=None):
    """Broadcast origin into the shape of target using numpy-style broadcasting.

    The result is ``tf.tile(origin, multiples)`` where the multiple for each
    dimension is ``target_dim // origin_dim``.

    If axes is not None, the multiple is 1 (rather than the ratio of the two
    sizes) for each axis i not in axes.

    Dimensions whose static size is unknown in either tensor are left alone
    (multiple 1), since no ratio can be computed for them at graph
    construction time.
    """
    in_size = shape_list(origin)
    out_size = shape_list(target)
    result = []
    if axes is None:
        axes = range(len(out_size))
    for d, (i, o) in enumerate(zip(in_size, out_size)):
        # shape_list reports unknown dimensions as -1, so test for that as
        # well as None.  This branch must be exclusive with the ones below:
        # exactly one multiple is appended per dimension.
        if i is None or o is None or i < 0 or o < 0:
            result.append(1)
        elif d in axes:
            assert o % i == 0
            result.append(o//i)
        else:
            result.append(1)
    return tf.tile(origin, result)
# From http://stackoverflow.com/questions/33949786/how-could-i-use-batch-normalization-in-tensorflow
# and https://github.com/ry/tensorflow-resnet/blob/master/resnet.py
def batch_norm(x, phase_train, mask=None, scope='bn'):
    """
    Batch normalization on convolutional maps.

    Statistics are taken over every axis of x except the last one, so there
    is one mean/variance (and one beta/gamma) per entry of the last axis.

    Args:
        x:           Tensor, 4D BHWD input maps
        phase_train: boolean tf.Variable, true indicates training phase.
                     NOTE(review): currently unused -- the tf.cond that
                     switched to the moving averages is commented out, so the
                     batch statistics are used in every phase.
        mask:        optional tensor multiplied into x; when given, the
                     moments are computed by masked_moments instead of
                     tf.nn.moments.
        scope:       string, variable scope
    Return:
        normed:      batch-normalized maps
    """
    x_shape = shape_list(x)
    params_shape = x_shape[-1:]
    BN_DECAY = 0.8
    BN_EPSILON = 1e-3
    with tf.variable_scope(scope) as vs:
        beta = tf.get_variable('beta', params_shape, initializer=tf.zeros_initializer)
        gamma = tf.get_variable('gamma', params_shape, initializer=tf.ones_initializer)
        # Running statistics; updated below but not read by this function.
        moving_mean = tf.get_variable('moving_mean', params_shape,
                                      initializer=tf.zeros_initializer, trainable=False)
        moving_var = tf.get_variable('moving_var', params_shape,
                                     initializer=tf.ones_initializer, trainable=False)
        axes = range(len(x_shape)-1)
        if mask is None:
            batch_mean, batch_var = tf.nn.moments(x, axes, name='moments')
        else:
            batch_mean, batch_var = masked_moments(x, axes, mask)

        update_ops = [
            moving_averages.assign_moving_average(moving_mean, batch_mean, BN_DECAY),
            moving_averages.assign_moving_average(moving_var, batch_var, BN_DECAY)]
        def mean_var_with_update():
            # The identity ops force the moving-average updates to run
            # whenever the batch statistics are evaluated.
            with tf.control_dependencies(update_ops):
                return tf.identity(batch_mean), tf.identity(batch_var)

        #mean, var = tf.cond(phase_train,
        #                    mean_var_with_update,
        #                    lambda: (moving_mean, moving_var))
        mean, var = mean_var_with_update()#(batch_mean, batch_var)
        normed = tf.nn.batch_normalization(x, mean, var, beta, gamma, BN_EPSILON)
    return normed
#numpy.fft.ifft(numpy.conj(numpy.fft.fft(a)) * numpy.fft.fft(b)).round(3)

def softmax_index2d(indices, values, reduce = False):
    """Combine ``values`` with a softmax of ``indices`` through 2-D FFTs.

    The softmax is taken jointly over the last two dimensions of ``indices``
    (they are flattened together for the softmax, then restored).  The result
    is the real part of the inverse transform of
    ``conj(fft2d(softmax)) * fft2d(values)``, following the numpy one-liner
    in the comment above.

    Args:
        indices: real tensor of scores.
        values:  real tensor; presumably broadcast-compatible with ``indices``
                 on the last two dimensions -- TODO confirm against callers.
        reduce:  when true, additionally average over the second-to-last axis.
    """
    indices_shape = shape_list(indices)
    softmax_indices = tf.reshape(
        tf.nn.softmax(
            tf.reshape(indices, [-1, indices_shape[-1]*indices_shape[-2]])),
        indices_shape)
    # The FFT ops work on complex tensors, so attach a zero imaginary part.
    softmax_indices = tf.complex(softmax_indices, tf.zeros_like(softmax_indices))
    values = tf.complex(values, tf.zeros_like(values))
    fft_of_answer = tf.conj(tf.batch_fft2d(softmax_indices)) * tf.batch_fft2d(values)
    if reduce:
        # NOTE(review): this inverts with the 1-D batch_ifft although the
        # forward transforms are 2-D; confirm that averaging the 1-D inverse
        # over axis -2 is the intended result here.
        return tf.reduce_mean(tf.real(tf.batch_ifft(fft_of_answer)), -2)
    else:
        return tf.real(tf.batch_ifft2d(fft_of_answer))
def tf_cut_function(val, vlo, vhi, glo, ghi):
    """Clip ``val`` to [vlo, vhi], optionally letting gradients use [glo, ghi].

    * ``vlo is None``: ``val`` is returned untouched.
    * ``glo is None``: plain ``tf.clip_by_value(val, vlo, vhi)``.
    * otherwise: the returned tensor is the wide clip minus a
      gradient-stopped difference between the wide and the narrow clip.
      Requires ``ghi >= vhi > vlo >= glo``.
    """
    if vlo is None:
        return val
    narrow = tf.clip_by_value(val, vlo, vhi)
    if glo is None:
        return narrow
    assert ghi >= vhi > vlo >= glo
    wide = tf.clip_by_value(val, glo, ghi)
    # The correction term is wrapped in stop_gradient, so only ``wide``
    # contributes to the gradient.
    correction = tf.stop_gradient(wide - narrow)
    return wide - correction
def conv_linear(arg, kw, kh, nout, prefix, bias=0):
    """Convolutional linear map.

    Args:
        arg:    a tensor, or a list of tensors that are concatenated along
                their last axis before the convolution.
        kw, kh: the two spatial sizes of the kernel.
        nout:   number of output maps.
        prefix: variable scope holding the kernel "CvK" and the bias "CvB".
        bias:   constant added on top of the learned bias; pass None to get
                the bare convolution with no bias variable at all.

    Returns:
        The stride-1, "SAME"-padded convolution of the input, plus the bias
        terms unless ``bias`` is None.
    """
    strides = [1, 1, 1, 1]
    if isinstance(arg, list):
        if len(arg) == 1:
            arg = arg[0]
        else:
            # Concatenate on the last axis; this call passes the axis first
            # and the list of tensors second.
            arg = tf.concat(len(mytf.shape_list(arg[0]))-1, arg)
    # Number of input maps is read from the last axis.
    nin = mytf.shape_list(arg)[-1]
    with tf.variable_scope(prefix):
        k = tf.get_variable("CvK", [kw, kh, nin, nout])
        res = mytf.conv2d(arg, k, strides, "SAME")

        if bias is None:
            return res
        # Learned bias starts at zero; the constant ``bias`` is added on top.
        bias_term = tf.get_variable("CvB", [nout],
                                    initializer=tf.constant_initializer(0.0))
        return res + bias_term + float(bias)
class NeuralGPUAtSize(object):
    """Instantiate the NeuralGPU at a given block size.

    Builds the placeholders and the graph for inputs of one fixed ``length``;
    the embedding weights and the optimizer are shared through ``model``.
    """
    def __init__(self, model, length, adam):
        self.ntasks = 4

        self.config = model.config
        self.length = length
        # batch_size x length x height
        self.input = tf.placeholder(tf.int32, shape=(None, length, self.config.height),
                                    name="input{0}".format(length))
        self.target = tf.placeholder(tf.int32, shape=(None,length), name="target{0}".format(length))
        self.emb_weights = model.emb_weights
        self.e0 = model.e0
        self.do_training = tf.placeholder(tf.bool, shape=[], name="do_training")

        self.model = model

        self.task = tf.placeholder(tf.uint8, shape=(None,), name="task")

        self.construct_graph(adam)

    def construct_mask(self) :
        """Return a float mask that is 1 wherever the input or target is > 0."""
        # Mask to 0-out padding space in each step.
        # bmask: batch_size x length
        bmask = tf.reduce_any(self.input > 0, 2) | (self.target > 0)
        # mask: batch x length x 1 x 1
        mask = tf.to_float(mytf.expand_dims_by_k(bmask, 2))
        return mask

    def looping_layer(self, cur, index, *args):
        """Body of the while loop: apply ``rx_step`` blocks to ``cur``.

        Returns the loop variables in the same order they were received:
        (cur, index advanced by rx_step[, accumulated output]).
        """
        if FLAGS.output_layer == 1:
            output, = args
        # Dropout is only active while training (do_training == True).
        keep_prob = 1.0 - tf.to_float(self.do_training) * (self.config.dropout * 8.0 / self.length)
        for it in range(self.config.rx_step):
            with tf.variable_scope("RX%d" % it) as vs:
                old = cur
                cur = tf.nn.dropout(cur, keep_prob)
                cur = gru_block(cur, self.config.kw, self.config.kh, self.config.nmaps,
                                self.config.cutoff, self.mask, 'lookup',
                                self.config.nconvs, extras=self.extras)

                if FLAGS.do_batchnorm:
                    if FLAGS.do_batchnorm == 1:
                        cur = mytf.batch_norm(cur, self.do_training, scope='bn')
                    elif FLAGS.do_batchnorm == 2:
                        cur = mytf.batch_norm(cur, self.do_training, self.mask, scope='bn')

                if FLAGS.output_layer == 1:
                    output += cur
                else:
                    # Keep updating an example only while the layer index is
                    # within its own layer budget; otherwise keep the old state.
                    cur = tf.select(tf.greater_equal(self.output_layers, index + it), cur, old)
        if FLAGS.output_layer == 1:
            return (cur, index + self.config.rx_step, output)
        else:
            return (cur, index + self.config.rx_step)

    def construct_all_layers(self, first, mask):
        """Run the looping layers on ``first`` and return the final state."""
        # first: batch_size x length x height x nmaps

        # Per-example layer budget: the number of unmasked positions.
        output_layers = tf.to_int32(tf.reduce_sum(mask, [1,2,3]))

        cur = first

        extras = []
        if FLAGS.taskid:
            # bs x 1 x 1 x ntasks
            task = tf.one_hot(tf.to_int32(mytf.expand_dims_by_k(self.task, 2)),
                              self.ntasks)
            extras.append(mytf.broadcast_as(task, cur, [1,2]))

        self.mask = mask
        self.extras = extras
        self.output_layers = output_layers
        it = tf.get_variable("layer_index", [], dtype=tf.int32,
                             initializer=tf.constant_initializer(0))
        # Using swap is slower, but saves GPU memory.
        use_swap = bool(self.config.nmaps > 256 or (FLAGS.do_batchnorm and self.config.nmaps == 128))
        num_layers = int(self.config.layer_scale*self.length)
        args = [cur, it] + ([tf.zeros_like(cur)] if FLAGS.output_layer == 1 else [])
        result = tf.while_loop(lambda cur, it, *args: it < num_layers,
                               self.looping_layer,
                               args,
                               parallel_iterations=1,
                               swap_memory=use_swap)
        if FLAGS.output_layer == 1:
            ans = result[-1]
        else:
            ans = result[0]
        return ans

    def _get_first_layer(self, mask):
        """Turn the input into a batch_size x length x height x nmaps tensor"""
        # Embed inputs; the lookup waits for the e0 update op to run first.
        with tf.control_dependencies([self.e0]):
            embedded = tf.nn.embedding_lookup(self.emb_weights, self.input)

        # first: batch_size x length x height x nmaps
        first = tf.tanh(embedded)

        return first

    def construct_graph(self, adam):
        """Build forward pass, loss and (for selected lengths) the update op."""
        noclass = self.config.noclass

        mask = self.construct_mask()

        # The general tensor shape is
        # batchsize x length x height x nmaps
        first = self._get_first_layer(mask)

        # Computation steps.
        last_layer = self.construct_all_layers(first, mask)

        # Final convolution to get logits, list outputs.
        layer_output = conv_linear(last_layer[:,:,:1,:], 1, 1, noclass, "output")
        output = mytf.safe_squeeze(layer_output, -2) # batch x length x noclass

        self.output = mytf.softmax(output) # batch_size x length x noclass

        # Calculate cross-entropy loss and normalize it.
        targets = tf.one_hot(self.target, noclass)
        xent = mytf.softmax_cross_entropy_with_logits(output, targets) # shape: batch x length
        perp_loss = tf.reduce_mean(xent * tf.reshape(mask, [-1, self.length]))

        # Final loss: cross-entropy + shared parameter relaxation part.
        relax_dist, self.model.avg_op = relaxed_distance(self.config.rx_step)
        total_loss = perp_loss + relax_dist * self.model.pull
        self.loss = perp_loss

        # Gradients and Adam update operation.
        if self.length == data_utils.bins[0] or (self.config.mode == 0 and
                                                 self.length < data_utils.bins[-1] + 1):
            data_utils.print_out("Creating backward for bin of length %d." % self.length)
            params = tf.trainable_variables()
            grads = tf.gradients(total_loss, params)
            grads, norm = tf.clip_by_global_norm(grads, self.config.max_grad_norm)
            self.grad_norm = norm
            update = adam.apply_gradients(zip(grads, params),
                                          global_step=self.model.global_step)
            self.update = update

    def __repr__(self):
        # The format string used to be empty, so repr() raised
        # "TypeError: not all arguments converted" on every call.
        return '<NeuralGPUAtSize %s>' % (self.length,)

    def step(self, sess, batch, do_backward=False, get_steps=False, more_feed={}):
        """Run a step of the network.

        Args:
            sess:        session to run in.
            batch:       (input, target, taskid) triple; a 2-D input gets a
                         height axis inserted and is zero-padded up to
                         ``config.height`` rows.
            do_backward: also run the update op and fetch the gradient norm.
            get_steps:   also fetch ``self.layers``.
            more_feed:   extra fetches, merged into the fetch dict (read only).

        Returns:
            A NeuralGPUResult wrapping the fetched values.
        """
        inp, target, taskid = batch
        assert inp.shape[0] == target.shape[0]
        assert inp.shape[-1] == target.shape[1]
        if len(inp.shape) == 2:
            inp = np.expand_dims(inp, 1)
        assert len(inp.shape) == 3
        if inp.shape[1] < self.config.height:
            extra = self.config.height - inp.shape[1]
            inp = np.concatenate([inp] + [np.zeros_like(inp[:,:1,:])]*extra, axis=1)
        feed_in = {}
        feed_in[self.do_training] = do_backward
        feed_in[self.task] = taskid
        # The placeholder is batch x length x height; inp is batch x height x length.
        feed_in[self.input] = inp.transpose([0,2,1])
        feed_in[self.target] = target
        feed_out = {}
        feed_out.update(more_feed)
        if do_backward:
            feed_out['back_update'] = self.update
            feed_out['grad_norm'] = self.grad_norm
        if get_steps:
            # NOTE(review): nothing in this class assigns self.layers, so this
            # looks like it raises AttributeError unless it is set elsewhere.
            feed_out['layers'] = self.layers
        if hasattr(self, 'probs'):
            feed_out['probs'] = self.probs
        feed_out['loss'] = self.loss
        feed_out['output'] = self.output
        res = sess.run(feed_out, feed_in)
        return NeuralGPUResult(res, inp, target, taskid)
def simple_step(self, sess, a):
    """Run a simple operation on one input.

    Reverses the order for you, so you can input in little endian.

    Args:
        sess: session to run in.
        a:    either a string, converted symbol by symbol with
              data_utils.to_id, or an iterable of ids.

    Returns:
        The predicted symbols as a string, with the padding positions
        dropped and the order reversed back.
    """
    # ``basestring`` only exists on Python 2; fall back to ``str`` so the
    # string branch also works on Python 3 instead of raising NameError.
    try:
        string_types = basestring
    except NameError:
        string_types = str
    if isinstance(a, string_types):
        a = [data_utils.to_id(c) for c in a]
    else:
        a = list(a)
    # Pad up to the length of the smallest instance that fits the input.
    padded_length = self.get_instance_for_length(len(a)).length
    pad = padded_length - len(a)
    batch_input = np.array([a[::-1] + [0]*pad])
    # The input doubles as the target; task id 0 is used for the single example.
    result = self.step(sess, (batch_input, batch_input, [0]), False)
    relevant_output = result.output.argmax(axis=-1)[0, :(-pad if pad else None)]
    return ''.join(map(data_utils.to_symbol, relevant_output[::-1]))
# This maps task names to DataGenerator instances
generators = {}

PADDING = False

def to_base(num, b, l=1):
  """Return the base-`b` digits of `num`, least significant first.

  The result is zero-extended to at least `l` digits.  A negative number is
  encoded as the digits of its absolute value followed by MINUS-1.
  """
  if num < 0:
    val = to_base(-num, b, (l - 1) or 1)
    return np.concatenate([val, [MINUS-1]])
  ans = []
  while num:
    ans.append(num%b)
    num //= b
  while len(ans) < l:
    ans.append(0)
  return np.array(ans)

def from_base(lst, b):
  """Inverse of to_base for non-negative values: digits (LSB first) -> number."""
  num = 0
  for v in lst[::-1]:
    num = num*b + v
  return num


class DataGenerator(object):
  """The base class for generating problem input/output pairs"""
  nclass = 33
  name = ''
  taskid = 0
  height = None
  min_length = 1

  def is_valid_length(self, l):
    """Can this problem have instances of length l?"""
    return True

  def rand_pair(self, length):
    """Random data pair for a task. Total length should be <= length."""
    raise NotImplementedError()

  def rand_pair_padded(self, length):
    """Construct a random data pair, then pad the inputs to a valid size.

    Both arrays are zero-padded along their last axis up to
    data_utils.pad(length); a 1-D input is first promoted to a single row.
    """
    pad_length = data_utils.pad(length)
    inp, outp = self.rand_pair(length)
    # Subclasses may return plain lists; np.pad bookkeeping needs arrays.
    inp = np.asarray(inp)
    outp = np.asarray(outp)
    if inp.ndim == 1:
      inp = np.array([inp])

    def pad_last_axis(x):
      widths = [(0, 0)]*(x.ndim - 1) + [(0, pad_length - x.shape[-1])]
      return np.pad(x, widths, 'constant')

    inp, outp = pad_last_axis(inp), pad_last_axis(outp)
    # The original wrote `assert a, b`, which made the second check the
    # assertion *message*; both shapes are now actually verified.
    assert inp.shape[-1] == pad_length and outp.shape[-1] == pad_length
    return inp, outp

  def get_batch(self, length, batch_size):
    """Construct a complete batch of problem instances.

    Returns (inputs, outputs, task ids), each stacked along a new first
    axis of size batch_size.
    """
    inps, outps = [], []
    for _ in range(batch_size):
      inp, outp = self.rand_pair_padded(length)
      inps.append(inp)
      outps.append(outp)

    inp = np.stack(inps, 0)
    outp = np.stack(outps, 0)
    return inp, outp, np.array([self.taskid] * batch_size)

  def _initialize(self, nclass):
    """Record the number of symbol classes on this generator."""
    self.nclass = nclass

  def __repr__(self):
    return "<%s name='%s' taskid=%s>" % (self.__class__.__name__, self.name, self.taskid)

class OpGenerator(DataGenerator):
  """Generator for instances using operations on two variables in some base"""
  min_length = 3

  def __init__(self, base, f, sep, zero_pad=True):
    self.base = base          # numeric base of the operands
    self.f = f                # binary function applied to the two operands
    self.sep = sep            # symbol id placed between the operands
    self.zero_pad = zero_pad  # pad numbers to a fixed width with zeros

  def is_valid_length(self, l):
    return l%2 == 1 and l >= self.min_length

  def _rand_inputs(self, k):
    """Draw two uniform operands with at most k digits each."""
    k = int(k)
    n1 = random.randint(0, self.base**k-1)
    n2 = random.randint(0, self.base**k-1)
    return (n1, n2)

  def rand_pair(self, l):
    """Return (operand1 sep operand2, result) as arrays of digit ids.

    Digits are stored as digit+1, leaving id 0 free.
    """
    k = int((l-1 - 2*PADDING)//2)
    n1, n2 = self._rand_inputs(k)
    result = self.f(n1, n2)
    width = k if self.zero_pad else 1
    inp = np.concatenate([[START] if PADDING else [],
                          to_base(n1, self.base, width) + 1,
                          [self.sep],
                          to_base(n2, self.base, width) + 1,
                          ])
    outp = to_base(result, self.base, 2*k+1 if self.zero_pad else 1) + 1
    return inp, outp

generators.update(dict(badd=OpGenerator(2, operator.add, 11),
                       qadd=OpGenerator(4, operator.add, 12),
                       add=OpGenerator(10, operator.add, 13),
                       bmul=OpGenerator(2, operator.mul, 14),
                       qmul=OpGenerator(4, operator.mul, 15),
                       omul=OpGenerator(8, operator.mul, 17),
                       fmul=OpGenerator(5, operator.mul, 18),
                       mul=OpGenerator(10, operator.mul, 16),))

generators.update(dict(baddz=OpGenerator(2, operator.add, 11, False),
                       qaddz=OpGenerator(4, operator.add, 12, False),
                       addz=OpGenerator(10, operator.add, 13, False),
                       bmulz=OpGenerator(2, operator.mul, 14, False),
                       qmulz=OpGenerator(4, operator.mul, 15, False),
                       mulz=OpGenerator(10, operator.mul, 16, False),))
class ToughAddGenerator(OpGenerator):
  """Addition generator that mixes in harder, hand-shaped operand pairs."""

  def __init__(self, base, sep, zero_pad=True):
    super(ToughAddGenerator, self).__init__(base, operator.add, sep, zero_pad)

  def _rand_inputs(self, k):
    """Draw two operands of at most k digits from one of three families.

    With probability 0.2 the first operand is base**hi - base**(lo-1) and
    the second is uniform below base**lo; with probability 0.2 the pair
    sums to base**k2 or base**k2 - 1; otherwise both are uniform.  The
    pair is returned in swapped order half of the time.
    """
    b = self.base
    roll = random.random()
    if roll < 0.2:
      first_draw = random.randint(1, k)
      second_draw = random.randint(1, k)
      lo, hi = sorted([first_draw, second_draw])
      pair = (b**hi - b**(lo-1), random.randint(0, b**(lo)-1))
    elif roll < .4:
      alternative = random.randint(1, k)
      k2 = random.choice([k, alternative])
      lo = random.randint(1, b**k2-1)
      pair = (lo, b**k2 - lo - random.randint(0, 1))
    else:
      left = random.randint(0, b**k-1)
      right = random.randint(0, b**k-1)
      pair = (left, right)
    keep_order = random.random() > .5
    return pair if keep_order else pair[::-1]

generators.update(dict(baddt=ToughAddGenerator(2, 11),
                       qaddt=ToughAddGenerator(4, 12),
                       addt=ToughAddGenerator(10, 13),))


class AlignedOpGenerator(OpGenerator):
  """Two-line inputs: the operands are stacked as rows of one 2-D array."""
  min_length = 2

  def rand_pair(self, l):
    """Return a (2, l) input array and a length-l output array.

    Each operand is written with k digits (stored as digit+1) and
    zero-padded to width l; the separator id is written into the second
    row just after its digits.  The result is written with l digits.
    """
    k = int((l-1 - 2*PADDING)//2)
    n1, n2 = self._rand_inputs(k)
    result = self.f(n1, n2)
    prefix = [START] if PADDING else []

    rows = []
    for operand in (n1, n2):
      digits = np.concatenate([prefix, to_base(operand, self.base, k) + 1])
      rows.append(digits)
    top, bottom = rows

    width = l
    padded_top = np.pad(top, (0, width - len(top)), 'constant')
    padded_bottom = np.pad(bottom, (0, width - len(bottom)), 'constant')
    padded_bottom[len(bottom)] = self.sep
    stacked = np.vstack([padded_top, padded_bottom])

    answer = np.concatenate([prefix, to_base(result, self.base, l) + 1])
    outp = np.pad(answer, (0, width - len(answer)), 'constant',
                  constant_values=SPACE)
    return stacked, outp

class AlignedToughAddGenerator(AlignedOpGenerator, ToughAddGenerator):
  """Aligned layout combined with ToughAddGenerator's operand sampling."""
  pass

generators.update(dict(badde=AlignedOpGenerator(2, operator.add, 11),
                       qadde=AlignedOpGenerator(4, operator.add, 12),
                       adde=AlignedOpGenerator(10, operator.add, 13),
                       bmule=AlignedOpGenerator(2, operator.mul, 14),
                       qmule=AlignedOpGenerator(4, operator.mul, 15),
                       mule=AlignedOpGenerator(10, operator.mul, 16),
                       baddet=AlignedToughAddGenerator(2, 11),
                       qaddet=AlignedToughAddGenerator(4, 12),
                       addet=AlignedToughAddGenerator(10, 13),
                       baddzt=ToughAddGenerator(2, 11, False),
                       qaddzt=ToughAddGenerator(4, 12, False),
                       addzt=ToughAddGenerator(10, 13, False),
                       ))
class FGenerator(DataGenerator):
  """Generator whose output is a function `f` of a random symbol sequence."""

  def __init__(self, f):
    self.f = f

  def rand_pair(self, l):
    # Symbols are drawn from 1 .. nclass-1, so id 0 never appears.
    x = np.random.randint(self.nclass - 1, size=l) + 1
    return list(x), list(self.f(x))

generators.update(dict(rev=FGenerator(lambda l: l[::-1]),
                       sort=FGenerator(sorted),
                       id=FGenerator(lambda l: l),
                       ))


# With spacing
class SpacedGenerator(DataGenerator):
  """Places a smaller problem at a random offset inside a SPACE-filled grid.

  Subclasses implement _rand_pair(l) returning (input, result) for a
  problem of size at most l.
  """
  height=4

  def is_valid_length(self, l):
    return super(SpacedGenerator, self).is_valid_length(l) and l >= self.min_length

  def rand_pair(self, l):
    """Return a (height, l) input grid and a length-l output."""
    # np.random.randint excludes its upper bound, so randint(min_length, l)
    # raises when l == min_length even though is_valid_length accepts that
    # length.  Use the only possible size in that case; anything smaller
    # still raises as before.
    if l == self.min_length:
      l2 = l
    else:
      l2 = np.random.randint(self.min_length, l)
    inp, res = self._rand_pair(l2)
    if not hasattr(inp[0], '__iter__'):
      inp = [inp]  # promote a single row to a 2-D layout
    inp = np.array(inp)
    goal_dims = (self.height, l)
    bots = (0, 1 if PADDING else 0)
    tops = (goal_dims[0] - inp.shape[0], goal_dims[1] - inp.shape[1])
    placed_loc = [np.random.randint(b, t+1) for b, t in zip(bots, tops)]
    final_inp = np.zeros(goal_dims) + SPACE
    if PADDING:
      final_inp[:,0] = START
    final_inp[placed_loc[0]:placed_loc[0]+inp.shape[0],
              placed_loc[1]:placed_loc[1]+inp.shape[1]] = inp
    res = np.concatenate([res, [SPACE] * (l - len(res))])
    return (final_inp, res)

class CopyGenerator(SpacedGenerator):
  """Output equals the input digit sequence."""

  def __init__(self, base):
    self.base = base

  def _rand_pair(self, l):
    x = [np.random.randint(self.base)+1 for _ in range(l)]
    inp = x
    res = x
    return inp, res

class DupGenerator(SpacedGenerator):
  """Input is DUP followed by a sequence; output is the sequence twice."""
  min_length = 2

  def __init__(self, base):
    self.base = base

  def _rand_pair(self, l):
    x = [np.random.randint(self.base)+1 for _ in range(l//2)]
    inp = [DUP] + x
    res = x + x
    return inp, res

class SpacedAlignedOpGenerator(SpacedGenerator, OpGenerator):
  """Two operands stacked as rows, preceded by a [SPACE, sep] column."""

  def _rand_pair(self, l):
    k = int((l-1)//2)
    n1, n2 = self._rand_inputs(k)
    result = self.f(n1, n2)
    n1, n2 = [to_base(n, self.base) + 1 for n in [n1,n2]]
    preferred_length = max(len(n1), len(n2))
    inp = np.array([np.pad(n, (0, preferred_length - len(n)), 'constant',
                           constant_values=SPACE) for n in (n1, n2)])
    inp = np.concatenate([[[SPACE, self.sep]], inp.T]).T
    o = to_base(result, self.base) + 1
    return inp, o

class TSAOG(SpacedAlignedOpGenerator, ToughAddGenerator):
  pass

class SpacedOpGenerator(SpacedGenerator, OpGenerator):
  """Single-row 'n1 sep n2' problem without zero padding of the numbers."""

  def _rand_pair(self, l):
    k = int((l-1)//2)
    n1, n2 = self._rand_inputs(k)
    result = self.f(n1, n2)
    n1, n2 = [to_base(n, self.base) + 1 for n in [n1,n2]]
    inp = np.concatenate([n1, [self.sep], n2])
    o = to_base(result, self.base) + 1
    return inp, o

class TSOG(SpacedOpGenerator, ToughAddGenerator):
  pass

generators.update(dict(scopy=CopyGenerator(10),
                       sdup=DupGenerator(10),
                       sbcopy=CopyGenerator(2),
                       sbdup=DupGenerator(2),
                       sbadde=SpacedAlignedOpGenerator(2, operator.add, 11),
                       sbmule=SpacedAlignedOpGenerator(2, operator.mul, 14),
                       sbaddet=TSAOG(2, 11),
                       sbadd=SpacedOpGenerator(2, operator.add, 11),
                       sbaddt=TSOG(2, 11),
                       sbaddz=SpacedOpGenerator(2, operator.add, 11, False),
                       sbaddzt=TSOG(2, 11, False),
                       sbmul=SpacedOpGenerator(2, operator.mul, 14),
                       ))
class MultiOpGenerator(DataGenerator):
  """Inputs where a single operation can appear many times"""

  def __init__(self, base, f, sep, num, zero_chance=1, zero_pad=True):
    self.base = base
    self.f = f
    self.sep = sep
    self.num = num            # operand count; None draws it per example
    self.zero_pad = zero_pad
    self.min_length = 1 if num is None else 2*num - 1
    self.zero_chance = zero_chance  # probability that zero operands are allowed

  def is_valid_length(self, l):
    return l >= self.min_length

  def _rand_inputs(self, k, num, allow_zero):
    """Draw `num` operands of at most k digits (none zero unless allowed)."""
    k = int(k)
    return [random.randint(0 if allow_zero else 1, self.base**k-1) for i in range(num)]

  def rand_pair(self, l):
    """Return operands joined by the separator, and their reduction by f."""
    num = self.num
    if num is None:
      num = random.randint(1, (l+1)//2)
    k = int((l+1)//num-1)
    allow_zero = random.random() < self.zero_chance
    ns = self._rand_inputs(k, num, allow_zero)
    result = functools.reduce(self.f, ns)
    input_arrays = []
    for i, n in enumerate(ns):
      if i:
        input_arrays.append([self.sep])
      input_arrays.append(to_base(n, self.base, k if self.zero_pad else 1)+1)
    inp = np.concatenate(input_arrays)
    outp = np.concatenate([
        to_base(result, self.base, (k+1)*num-1 if self.zero_pad else 1) + 1,
    ])
    return inp, outp

generators.update({'3badd':MultiOpGenerator(2, operator.add, 11, 3),
                   '3qadd':MultiOpGenerator(4, operator.add, 12, 3),
                   '3add':MultiOpGenerator(10, operator.add, 13, 3),
                   '3bmul':MultiOpGenerator(2, operator.mul, 14, 3),
                   })
generators.update({'kbadd':MultiOpGenerator(2, operator.add, 11, None),
                   'kqadd':MultiOpGenerator(4, operator.add, 12, None),
                   'kadd':MultiOpGenerator(10, operator.add, 13, None),
                   'kbmul':MultiOpGenerator(2, operator.mul, 14, None, .3),
                   })

class ExpressionGenerator(DataGenerator):
  """Inputs where each character has a chance of being a random operator."""
  min_length = 1

  def __init__(self, base, operators, op_chance):
    self.base = base
    self.operators = dict(operators)  # operator character -> symbol id
    self.nums = range(base)
    self.op_chance = op_chance

    # Digits map to digit+1; operator characters map to their own ids.
    self.to_num = {i: i+1 for i in self.nums}
    self.to_num.update(self.operators)

  def rand_pair(self, l):
    """Return (symbol ids of a random expression, digits of its value).

    Samples are redrawn until the expression evaluates without a division
    by zero and its value is non-negative.
    """
    # dict.keys() is not indexable on Python 3, which random.choice needs.
    operator_symbols = list(self.operators)
    while True:
      ans = []
      inp = []
      last_num = []
      valid_op = False
      for i in range(l):
        if valid_op and random.random() < self.op_chance:
          choice = random.choice(operator_symbols)
        else:
          choice = random.choice(self.nums)
        inp.append(self.to_num[choice])
        if choice in self.operators:
          ans.append(from_base(last_num, self.base))
          last_num = []
          ans.append(choice)
          valid_op = False  # an operator must be followed by a digit
        else:
          last_num.append(choice)
          # Position l-1 must hold a digit, so no operator after i == l-2.
          valid_op = (i != l-2)
      ans.append(from_base(last_num, self.base))
      # The sequence is little-endian, so evaluate it reversed.
      string_expr = ''.join(map(str, ans[::-1]))
      string_expr = string_expr.replace('/', '//')
      try:
        # NOTE(review): eval only sees text assembled above from integers
        # and this generator's own operator characters.
        result = eval(string_expr)
      except ZeroDivisionError:
        continue
      if result < 0:
        continue
      outp = to_base(result, self.base, l)+1
      return inp, outp

generators.update({'bexpr':ExpressionGenerator(2, zip('+*', [11, 14]), .3),
                   'qexpr':ExpressionGenerator(4, zip('+*', [12, 15]), .3),
                   'expr':ExpressionGenerator(10, zip('+*', [13, 16]), .3),})

generators.update({'bexpra':ExpressionGenerator(2, zip('+*/-', [11, 14,17,20]), .3),
                   'qexpra':ExpressionGenerator(4, zip('+*/-', [12, 15,18,21]), .3),
                   'expra':ExpressionGenerator(10, zip('+*/-', [13, 16,19,22]), .3),})

generators.update({'bexprp':ExpressionGenerator(2, zip('+', [11]), .3),
                   'qexprp':ExpressionGenerator(4, zip('+', [12]), .3),
                   'exprp':ExpressionGenerator(10, zip('+', [13]), .3),})

generators.update({'bexprs':ExpressionGenerator(2, zip('+-', [11, 20]), .3),
                   'qexprs':ExpressionGenerator(4, zip('+-', [12, 21]), .3),
                   'exprs':ExpressionGenerator(10, zip('+-', [13, 22]), .3),})

generators.update({'bexprsm':ExpressionGenerator(2, zip('+*-', [11, 14,20]), .3),
                   'qexprsm':ExpressionGenerator(4, zip('+*-', [12, 15,21]), .3),
                   'exprsm':ExpressionGenerator(10, zip('+*-', [13, 16,22]), .3),})

# Every registered generator learns the name it was registered under.
for k in generators:
  generators[k].name = k

def set_height(self, height):
  """Set `height` on every registered generator.

  NOTE(review): `self` is never used here although this is a module-level
  function; it is kept so existing call sites keep working.
  """
  for k in generators:
    generators[k].height = height
def define_flags():
  """Register every command-line flag (kept in a function so reload() works)."""
  flags = tf.app.flags
  as_float = flags.DEFINE_float
  as_int = flags.DEFINE_integer
  as_bool = flags.DEFINE_bool
  as_str = flags.DEFINE_string

  # (definer, flag name, default, help text), registered in this order.
  flag_table = (
      (as_float, "lr", 0.001, "Learning rate."),
      (as_float, "init_weight", 1.0, "Initial weights deviation."),
      (as_float, "max_grad_norm", 1.0, "Clip gradients to this norm."),
      (as_float, "cutoff", 1.2, "Cutoff at the gates."),
      (as_float, "cutoff_tanh", 0.0, "Cutoff at tanh."),
      (as_float, "pull", 0.0005, "Starting pull of the relaxations."),
      (as_float, "pull_incr", 1.2, "Increase pull by that much."),
      (as_float, "curriculum_bound", 0.15, "Move curriculum < this."),
      (as_float, "dropout", 0.15, "Dropout that much."),
      (as_int, "max_steps", 0, "Quit after this many steps."),
      (as_int, "batch_size", 32, "Batch size."),
      (as_int, "low_batch_size", 16, "Low batch size."),
      (as_int, "steps_per_epoch", 200, "Steps per epoch."),
      (as_int, "nmaps", 24, "Number of floats in each cell."),
      (as_int, "niclass", 33, "Number of classes (0 is padding)."),
      (as_int, "noclass", 33, "Number of classes (0 is padding)."),
      (as_int, "max_length", 41, "Maximum length."),
      (as_int, "rx_step", 6, "Relax that many recursive steps."),
      (as_int, "random_seed", 125459, "Random seed."),
      (as_int, "time_till_ckpt", 30, "How many tests per checkpoint"),
      (as_int, "time_till_eval", 2, "Number of steps between evals"),
      (as_int, "nconvs", 2, "How many convolutions / 1 step."),
      (as_int, "kw", 3, "Kernel width."),
      (as_int, "kh", 3, "Kernel height."),
      (as_int, "height", 4, "Height."),
      (as_int, "forward_max", 401, "Maximum forward length."),
      (as_int, "nprint", 0, "How many test examples to print out."),
      (as_int, "mode", 0, "Mode: 0-train other-decode."),
      (as_bool, "animate", False, "Whether to produce an animation."),
      (as_float, "smooth_grad", 0.0, "Whether to avoid clipping gradient"),
      (as_float, "smooth_grad_tanh", 0.0,
       "Whether to avoid clipping tanh gradient"),
      (as_str, "task", "badd", "Which task are we learning?"),
      (as_str, "train_dir", "/tmp/neural", "Directory to store models."),

      (as_float, "layer_scale", 1.0, "Number of layers to use"),

      # Batchnorm: 0 = none
      #            2 = correct
      #            1 = not quite correct, because of how masking is done,
      #                but simpler.
      (as_int, "do_batchnorm", 0, "Whether to use batch normalization."),

      (as_bool, "do_resnet", False, "Whether to use resnets."),

      (as_bool, "print_one", True, "Print one example each evaluation"),

      # output layer: 0 = standard: output layer n on length-n inputs
      #               1 = alternate: output sum of first n layers on
      #                   length-n inputs.
      (as_int, "output_layer", 0, "Which layer to output."),

      # progressive_curriculum: 0 = none: always train on first task.
      #                         1-5: progress through the tasks in sequence,
      #                              training each one to length max_len
      #                              then move on.  The different options
      #                              have subtle changes; see
      #                              BetterCurriculum for details.
      #                              5 is probably the best one.
      (as_int, "progressive_curriculum", 0,
       "Whether to use progressive curriculum."),
      (as_bool, "taskid", False, "Feed task id to algorithm in each layer"),

      (as_bool, "always_large", False,
       "Perform the large test even when the model is inaccurate"),
  )
  for definer, flag_name, default, help_text in flag_table:
    definer(flag_name, default, help_text)
FLAGS = tf.app.flags.FLAGS
if not FLAGS.__parsed: # Hack so reload() works
  define_flags()

EXTRA_EVAL = 2


# Line-buffered log files, opened by log_parameters().
log_output = None
step_output = None

def log_parameters(checkpoint_dir):
  """Write enough information in checkpoint_dir for reproducibility.

  Records the command line, all flag values and the current git revision,
  then opens the 'results' and 'steps' logs in append mode.

  Raises:
    ValueError: if checkpoint_dir already holds a 'commandline' file that
      differs from the current sys.argv.
  """
  global log_output, step_output
  command_fname = os.path.join(checkpoint_dir, 'commandline')
  if gfile.Exists(command_fname):
    # Read through a context manager so the handle is closed promptly.
    with open(command_fname) as f:
      old_argv = f.read().strip()
    new_argv = ' '.join(sys.argv)
    if old_argv != new_argv:
      data.print_out('ERROR: restarted with changed argv')
      data.print_out('WAS %s' % old_argv)
      data.print_out('NOW %s' % new_argv)
      raise ValueError("Bad log dir: partial state exists with different arguments")
    else:
      print('Reusing existing log dir')

  with open(command_fname, 'w') as f:
    f.write(' '.join(sys.argv)+'\n')

  with open(os.path.join(checkpoint_dir, 'all_args'), 'w') as f:
    yaml.dump(FLAGS.__flags, f, default_flow_style=False)

  with open(os.path.join(checkpoint_dir, 'git-rev'), 'w') as f:
    subprocess.call(['git', 'rev-parse', 'HEAD'], stdout=f)

  # Close logs left over from an earlier call before replacing them.
  for stale in (log_output, step_output):
    if stale is not None:
      stale.close()
  log_output = open(os.path.join(checkpoint_dir, 'results'), 'a', 1)
  step_output = open(os.path.join(checkpoint_dir, 'steps'), 'a', 1)
def load_model(sess, checkpoint_dir, reconfig=None):
  """Rebuild a model from the flags saved in checkpoint_dir and restore it.

  Args:
    sess: session used to restore the variables.
    checkpoint_dir: directory containing 'all_args' and the checkpoints.
    reconfig: optional dict of flag overrides applied on top of the saved
      flags.

  Returns:
    The NeuralGPU model; weights are restored only if a checkpoint exists.
  """
  # possibly tf.reset_default_graph()
  with open(os.path.join(checkpoint_dir, 'all_args')) as f:
    # NOTE(review): yaml.load can construct arbitrary objects; only point
    # this at checkpoint directories you trust.
    options = yaml.load(f)
  if reconfig:
    options.update(reconfig)
  FLAGS._parse_flags()
  FLAGS.__flags.update(options)
  data.forward_max = max(FLAGS.forward_max, data.bins[-1])
  config = NeuralConfig(FLAGS)
  model = NeuralGPU(config)
  ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
  if ckpt and gfile.Exists(ckpt.model_checkpoint_path):
    model.saver.restore(sess, ckpt.model_checkpoint_path)
  return model

def get_checkpoint_dir():
  """Return the directory used for checkpoints and logs (FLAGS.train_dir)."""
  return FLAGS.train_dir

def get_config_from_flags(checkpoint_dir = None):
  """Seed the RNGs, prepare the checkpoint directory and build the config.

  Also trims data.bins to what config.max_length needs and updates
  data.forward_max.
  """
  # Set random seed.
  seed = FLAGS.random_seed
  tf.set_random_seed(seed)
  random.seed(seed)
  np.random.seed(seed)

  # Create checkpoint directory if it does not exist.
  if checkpoint_dir is None:
    checkpoint_dir = get_checkpoint_dir()
  if not gfile.IsDirectory(checkpoint_dir):
    data.print_out("Creating checkpoint directory %s." % checkpoint_dir)
    try:
      gfile.MkDir(os.path.dirname(checkpoint_dir))
    except OSError:
      pass  # best effort; the MkDir below reports a real failure
    gfile.MkDir(checkpoint_dir)

  # The handle is handed to TeeErr and deliberately left open.
  data.err_tee = data.TeeErr(open(os.path.join(checkpoint_dir, "err"), 'w'))

  data.print_out("NN ", newline=False)

  config = NeuralConfig(FLAGS)

  # Check data sizes.
  while len(data.bins) > 1 and data.bins[-2] > config.max_length + EXTRA_EVAL:
    data.bins = data.bins[:-1]
  assert data.bins[0] > FLAGS.rx_step
  data.forward_max = max(FLAGS.forward_max, data.bins[-1])

  return config

def initialize(sess, checkpoint_dir=None):
  """Initialize data and model.

  Builds the model, initializes its variables, restores the latest
  checkpoint (and saved curriculum) when present, and otherwise attaches a
  fresh curriculum chosen by FLAGS.progressive_curriculum.
  """
  config = get_config_from_flags(checkpoint_dir)
  data.print_out(str(sys.argv))
  data.print_out(str(config))

  if checkpoint_dir is None:
    checkpoint_dir = get_checkpoint_dir()
  log_parameters(checkpoint_dir)

  # Initialize data for each task.
  nclass = min(config.niclass, config.noclass)
  tasks = config.task.split(",")
  data_generators = [generators[t] for t in tasks]
  for g in data_generators:
    g._initialize(nclass)

  # Create model and initialize it.
  tf.get_variable_scope().set_initializer(
      tf.uniform_unit_scaling_initializer(factor=1.8 * FLAGS.init_weight))
  model = NeuralGPU(config)
  data.print_out("Created model.")
  sess.run(tf.initialize_all_variables())
  data.print_out("Initialized variables.")

  # Load model from parameters if a checkpoint exists.
  ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
  model.curriculum = None
  if ckpt and gfile.Exists(ckpt.model_checkpoint_path):
    data.print_out("Reading model parameters from %s"
                   % ckpt.model_checkpoint_path)
    model.saver.restore(sess, ckpt.model_checkpoint_path)
    curriculum_path = os.path.join(checkpoint_dir, 'neural_gpu_curriculum.ckpt')
    try:
      # NOTE(review): the curriculum is a pickled-style YAML object; see the
      # trust caveat on yaml.load.
      with open(curriculum_path) as f:
        model.curriculum = yaml.load(f)
    except IOError:
      pass  # no saved curriculum; a fresh one is created below

  if model.curriculum is None:
    if FLAGS.progressive_curriculum:
      model.curriculum = curriculum.BetterCurriculum(data_generators, model.config,
                                                     FLAGS.progressive_curriculum)
    else:
      model.curriculum = curriculum.GeneralizeCurriculum(data_generators, model.config)

  # Return the model and needed variables.
  return model
def single_test(l, model, sess, task, nprint, batch_size, print_out=True,
                offset=None, get_steps=False, batch=None):
  """Test model on test data of length l using the given session.

  A batch is drawn from the model's curriculum unless one is supplied.
  `offset` is accepted for compatibility and is not used.

  Returns:
    (errors, seq_err, result): the per-symbol error rate (left as the raw
    count when the total is 0), the sequence error rate, and the raw step
    result.
  """
  if batch is None:
    batch, _ = model.curriculum.draw_example(batch_size, l, task)
  result = model.step(sess, batch, False, get_steps=get_steps)
  errors, total, seq_err = result.accuracy(nprint)
  seq_err = float(seq_err) / batch_size
  if total > 0:
    errors = float(errors) / total
  if print_out:
    data.print_out(" %s len %d errors %.2f sequence-errors %.2f"
                   % (task, l, 100*errors, 100*seq_err))
  return errors, seq_err, result


def multi_test(l, model, sess, task, nprint, batch_size, offset=None):
  """Run multiple tests at lower batch size to save memory.

  Runs batch_size // low_batch sub-batches of FLAGS.low_batch_size (capped
  at batch_size) and averages their error rates.

  Returns:
    (errors, seq_err, result) where result belongs to the last sub-batch.

  Raises:
    ValueError: if the effective sub-batch size is not positive.
  """
  low_batch = min(FLAGS.low_batch_size, batch_size)
  if low_batch <= 0:
    raise ValueError("multi_test needs positive batch sizes, got "
                     "batch_size=%s low_batch_size=%s"
                     % (batch_size, FLAGS.low_batch_size))
  num_steps = batch_size // low_batch
  errors, seq_err = 0.0, 0.0
  to_print = nprint
  for mstep in range(num_steps):
    cur_offset = None if offset is None else offset + mstep * low_batch
    err, sq_err, result = single_test(l, model, sess, task, to_print, low_batch,
                                      False, cur_offset)
    to_print = max(0, to_print - low_batch)
    errors += err
    seq_err += sq_err
  # Average over the sub-batches actually run.  Scaling by
  # low_batch / batch_size under-reported the error whenever batch_size was
  # not a multiple of low_batch.
  errors = float(errors) / num_steps
  seq_err = float(seq_err) / num_steps
  data.print_out(" %s len %d errors %.2f sequence-errors %.2f"
                 % (task, l, 100*errors, 100*seq_err))
  return errors, seq_err, result

class Timer(object):
  """Utility class for tracking time used in a function"""

  def __init__(self, label, print_fn=data.print_out):
    """Announce the start of `label` and remember the start time."""
    self.startt = time.time()
    self.label = label
    self.print_fn = print_fn
    self.print_fn('Start %s' % self.label)

  def done(self):
    """Report the time elapsed since construction."""
    self.print_fn('Finish %s, took %s seconds' % (self.label, time.time()-self.startt))
def train_for_a_bit(sess, model, batch_size, nsteps, thresh=0.0):
  """Run nsteps training steps, then update pull, curriculum and LR.

  Returns:
    (extended, acc_perp): whether the curriculum reported a value >= 2 from
    consider_extending, and data.safe_exp of the average loss of the first
    task (in sorted order) in the results record.
  """
  results_record = records.ResultsRecord(batch_size)
  for _ in range(nsteps):
    batch, within_bounds = model.curriculum.draw_example(batch_size)

    # Run a step and time it.
    start_time = time.time()
    result = model.step(sess, batch, True)

    # Accumulate statistics only if we did not exceed curriculum length.
    results_record.feed(result, time.time() - start_time, within_bounds)

  global_step, lr, pull = sess.run([model.global_step, model.lr, model.pull])
  # Normalize and print out accumulated statistics.
  message = ('step %s ' % (global_step, ) +
             'len %s ' % model.curriculum.length_str +
             'lr %.8f pull %.3f ' % (lr, pull) +
             '%s' % str(results_record)
             )
  data.print_out(message)
  print(message, file=step_output)
  if FLAGS.do_batchnorm:
    mytf.print_bn_state(sess, model.config.nmaps)

  would_extend = model.curriculum.consider_extending(results_record)
  decent = (would_extend >= 1)
  extended = (would_extend >= 2)
  # If errors are below the curriculum threshold, move curriculum forward.
  if decent:
    # Either increase pull or, if it's large, average parameters.
    if pull < 0.1:
      sess.run(model.pull_incr_op)
    else:
      data.print_out(" Averaging parameters.")
      # NOTE(review): avg_op is not set in NeuralGPU.__init__ or
      # construct_graph; confirm it is defined elsewhere.
      sess.run(model.avg_op)

  # Lower learning rate if we're worse than the last 3 checkpoints.
  # [XXX the logic isn't great in mixed-task settings; it picks one
  # task semi-arbitrary.]
  first_record = sorted(results_record.record_for_task.items())[0][1]
  acc_perp = data.safe_exp(first_record.avg_loss)
  if acc_perp > thresh:
    data.print_out("Lower learning rate: %s %s" % (acc_perp, thresh))
    sess.run(model.lr_decay_op)
  return (extended, acc_perp)

def run_evaluation(sess, model, batch_size):
  """Evaluate every curriculum task and log large-length results.

  Each task is tested on the curriculum's test examples, stopping early
  once the last four sequence error rates are all 1.  When the final error
  is below 0.05 (or FLAGS.always_large is set) a larger test at
  data.forward_max runs and is appended to the results log.
  """
  global_step, = sess.run([model.global_step])
  # Materialize the task list once: the super-large test below needs its
  # length, and the original referenced an undefined name `tasks` there.
  tasks = list(model.curriculum.tasks())
  for task in tasks:
    errors = []
    for batch, length in model.curriculum.test_examples(batch_size, task):
      _, seq_err, result = single_test(length, model, sess, task,
                                       FLAGS.nprint, batch_size, batch=batch)
      errors.append(seq_err)
      if len(errors) >= 4 and min(errors[-4:]) == 1:
        break
    if not errors:
      # No test examples for this task: nothing to report, and seq_err and
      # result would otherwise be stale or unbound.
      continue
    if FLAGS.print_one:
      data.print_out(result.to_string(0))
    if seq_err < 0.05 or FLAGS.always_large:  # Run larger test if we're good enough.
      _, seq_err, result = multi_test(data.forward_max, model, sess, task,
                                      FLAGS.nprint, batch_size * 4)
      data.print_out("LARGE ERROR: %s %s %s" % (global_step, seq_err, task))
      log_output.write('%s %s %s\n' % (global_step, seq_err, task))
      if FLAGS.print_one:
        data.print_out(result.to_string(0))
    if seq_err < 0.01:  # Super-large test on 1-task large-forward models.
      if data.forward_max > 4000 and len(tasks) == 1:
        multi_test(data.forward_max, model, sess, task, FLAGS.nprint,
                   batch_size * 16, 0)
def checkpoint(sess, model, checkpoint_dir):
  """Save the model variables and the curriculum into checkpoint_dir."""
  weights_path = os.path.join(checkpoint_dir, "neural_gpu.ckpt")
  curriculum_path = os.path.join(checkpoint_dir, 'neural_gpu_curriculum.ckpt')
  global_step = sess.run([model.global_step])[0]
  model.saver.save(sess, weights_path,
                   global_step=model.global_step,
                   write_meta_graph=False)
  with open(curriculum_path, 'w') as curriculum_file:
    yaml.dump(model.curriculum, curriculum_file)


def train_loop(sess, model, batch_size, checkpoint_dir):
  """Alternate training epochs with periodic checkpoints and evaluations.

  Runs until FLAGS.max_steps is reached (forever when it is 0); a final
  checkpoint is written before returning.
  """
  epochs_to_ckpt = FLAGS.time_till_ckpt
  epochs_to_eval = FLAGS.time_till_eval
  # History of perplexities; the learning-rate threshold is the worst of
  # the most recent three entries.
  history = [1e4, 1e4, 1e4]
  while True:
    data.print_out("Reminder: checkpoint dir %s" % checkpoint_dir)
    timer = Timer("training steps")
    threshold = max(history[-3:])
    extended, acc = train_for_a_bit(sess, model, batch_size,
                                    FLAGS.steps_per_epoch, threshold)
    history.append(acc)
    if extended:  # If we extended, don't just lower the learning rate
      history.append(1000)
    timer.done()

    # Save checkpoint.
    epochs_to_ckpt -= 1
    if epochs_to_ckpt == 0:
      epochs_to_ckpt = FLAGS.time_till_ckpt
      timer = Timer("saving checkpoint")
      checkpoint(sess, model, checkpoint_dir)
      timer.done()

    # Run evaluation.
    global_step = sess.run([model.global_step])[0]
    epochs_to_eval -= 1
    if epochs_to_eval == 0:
      epochs_to_eval = FLAGS.time_till_eval
      timer = Timer("running evaluation %s" % global_step)
      run_evaluation(sess, model, batch_size)
      timer.done()

    global_step = sess.run([model.global_step])[0]
    finished = bool(FLAGS.max_steps) and global_step >= FLAGS.max_steps
    if not finished:
      continue
    data.print_out("Finished all %s steps" % global_step)
    checkpoint(sess, model, checkpoint_dir)
    break

def start_and_train():
  """Train the model."""
  with tf.Session() as sess:
    init_timer = Timer('initialization')
    model = initialize(sess)
    init_timer.done()
    train_loop(sess, model, FLAGS.batch_size, get_checkpoint_dir())
parser.add_argument("--job", type=str, default='plot') 41 | parser.add_argument("--task", type=str, default=None) 42 | parser.add_argument("--exclude_opts", type=str, default=None) 43 | parser.add_argument("--title", type=str, default='') 44 | parser.add_argument("--titles", type=str, default='') 45 | parser.add_argument("--savedir", type=str, default='') 46 | parser.add_argument("--min-length", type=int, default=2) 47 | parser.add_argument("--dirs-in-name", type=int, default=1) 48 | parser.add_argument("--one-legend", type=bool, default=True) 49 | parser.add_argument("--global-legend", type=str, default='') 50 | parser.add_argument("--save-to", type=str, default='') 51 | parser.add_argument("--skip-dir", action='store_true') 52 | parser.add_argument("--success", action='store_true') 53 | parser.add_argument("--recache", action='store_true') 54 | parser.add_argument("--separate_seeds", action='store_true') 55 | parser.add_argument("--median", action='store_true') 56 | parser.add_argument("--order", type=str, default='') 57 | parser.add_argument("--colorcycle", type=str, default='') 58 | parser.add_argument("--std", type=bool, default=True) 59 | parser.add_argument("--only-same", type=bool, default=False) 60 | parser.add_argument("--smoothing", type=int, default='11') 61 | parser.add_argument("--remove_strings", type=str, default='') 62 | parser.add_argument("--remove_strings2", type=str, default='') 63 | parser.add_argument('files', type=str, nargs='+', 64 | help='Log files to examine') 65 | parser.add_argument("--xlims", type=str, default='') 66 | parser.add_argument("--ylims", type=str, default='') 67 | 68 | parser.add_argument("--nbinsx", type=str, default='') 69 | parser.add_argument("--nbinsy", type=str, default='') 70 | 71 | parser.add_argument("--overlay", type=int, default=1) 72 | parser.add_argument("--only-plot", type=str, default=None) 73 | 74 | parser.add_argument("--xticks", type=str, default='') 75 | parser.add_argument("--yticks", type=str, 
@recache
def get_results_dict(fname):
    """Parse a results file into one pandas Series per task.

    Each record is expected to hold at least three whitespace-separated
    fields: ``<step> <value> <task>``.  The returned dict maps the task name
    to a Series (named RESULT) indexed by integer step.

    Args:
        fname: path of the results file.

    Returns:
        dict of task name -> pd.Series; empty if the file does not exist.
    """
    if not os.path.exists(fname):
        return {}
    answer = {}
    with open(fname) as f:
        for line in f:
            words = line.split()
            # Blank line on restart, or a truncated record: nothing to parse.
            if len(words) < 3:
                continue
            loc, val, taskname = words[:3]
            if taskname not in answer:
                # Explicit dtype: the default dtype of an empty Series
                # differs between pandas versions.
                answer[taskname] = pd.Series(name=RESULT, dtype=float)
            try:
                answer[taskname].loc[int(loc)] = float(val)
            except ValueError:
                # Non-numeric step or value: skip this record.
                pass
    return answer
class Scores(object):
    """Lazily loaded training/test curves for one run directory.

    Attributes:
        dirname: run directory the data is read from.
        tasknames: list of task names; derived from the directory key via
            get_tasks() when not given.
        prefix: leading part of dirname that args_str() strips off.
        result_dfs: task -> Series, filled by get_results_dict().
        dfs: task -> DataFrame, filled by get_dfs().
    """

    def __init__(self, dirname, tasknames=None, prefix=''):
        self.dirname = dirname
        self.index = 0
        if tasknames is None:
            tasknames = get_tasks(self.key)
        self.tasknames = tasknames
        self.prefix = prefix
        self.result_dfs = {}
        self.dfs = {}

    @property
    def key(self):
        """Grouping key for this run, as computed by get_key(dirname)."""
        return get_key(self.dirname)

    def args_str(self, task=None):
        """Return the legend label; the task is appended for multi-task runs."""
        label = get_key(self.dirname[len(self.prefix):])
        if task and len(self.tasknames) > 1:
            return label + ' (%s)' % task
        return label

    def last_loc(self):
        """Return the largest final index over everything loaded so far.

        Falls back to 3 when nothing is loaded; the values 200200 and 60200
        are reduced by 200.
        """
        frames = list(self.result_dfs.values()) + list(self.dfs.values())
        # Empty frames have no last index; skip them instead of raising.
        options = [d.index[-1] for d in frames if len(d.index)]
        ans = max(options or [3])
        if ans == 200200 or ans == 60200:
            ans -= 200
        return ans

    def get_scores(self, key, task):
        """Return the curve `key` for `task`.

        Args:
            key: RESULT for the results-file series, otherwise a column
                name of the per-step data.
            task: task name, or None when exactly one task is loaded.

        Returns:
            A pandas Series.  For RESULT with an unknown task a placeholder
            series (value 1 at index 0 and at last_loc()) is returned.  For
            other keys, None is returned when the task or the column is
            missing; 'errors' and 'seq-errors' are scaled by 0.01.
        """
        if key == RESULT:
            self._load_results()
            if task is None:
                assert len(self.result_dfs) == 1
                # next(iter(...)) works on both Python 2 and 3, unlike
                # dict.keys()[0].
                task = next(iter(self.result_dfs))
            if task not in self.result_dfs:
                basic = pd.Series([1], name=RESULT)
                basic.loc[self.last_loc()] = 1
                return basic
            return self.result_dfs[task]

        self._load_scores()
        if task is None:
            assert len(self.dfs) == 1
            task = next(iter(self.dfs))
        if task not in self.dfs:
            return None
        column = self.dfs[task].get(key)
        if column is None:
            # Column never logged for this run: report "no data" rather
            # than failing on None * scale.
            return None
        scale = 0.01 if key in ('errors', 'seq-errors') else 1
        return column * scale

    def _load_results(self):
        """Populate result_dfs from <dirname>/results once."""
        if self.result_dfs:
            return
        self.result_dfs = get_results_dict(self.dirname+'/results')

    def _load_scores(self):
        """Populate dfs via get_dfs() once."""
        if self.dfs:
            return
        self.dfs = get_dfs(self.dirname, self.tasknames)

    def commandline(self):
        """Return the whitespace-split contents of <dirname>/commandline."""
        with open(os.path.join(self.dirname, 'commandline')) as f:
            return f.read().split()

    def total_steps(self):
        """Return the last index of the 'len' curve of the first task, or None."""
        lens = self.get_scores('len', self.tasknames[0])
        return lens.index[-1].item() if lens is not None else None
def get_tasks(key):
    """Extract the task names encoded in a run key.

    The key is split on '='; the first field ending in 'task' marks the
    option, and the text after it (up to the next '-') is the
    comma-separated task list.

    Args:
        key: run key such as 'nmaps=24-task=badd,baddt-seed=1'.

    Returns:
        List of task names; ['rev'] when the key has no '...task=' option.
    """
    fields = key.split('=')
    # Only fields followed by a value can be the task option; this also
    # covers keys that merely contain the substring 'task', which used to
    # raise IndexError.
    for pos, field in enumerate(fields[:-1]):
        if field.endswith('task'):
            return fields[pos + 1].split('-')[0].split(',')
    return ['rev']
def get_prefix(fileset):
    """Return the common prefix of `fileset`, cut after its last '-' or '/'.

    The character-wise common prefix is truncated so that it ends on a
    separator; if it contains neither '-' nor '/', the result is ''.
    """
    shared = os.path.commonprefix(fileset)
    # rfind() yields -1 when the separator is absent, which makes the slice
    # below empty.
    cut = max(shared.rfind('-'), shared.rfind('/'))
    return shared[:cut + 1]
def get_filter(column):
    """Return the predicate get_print_results uses to locate matching entries.

    For the 'len' column the predicate tests equality with 41; for every
    other column it tests for values below 0.01.
    """
    def reached_full_length(x):
        return x == 41

    def below_threshold(x):
        return x < 0.01

    if column == 'len':
        return reached_full_length
    return below_threshold
def construct_parsed_data(scores, columns, save_dir):
    """Write one YAML summary file per run key into save_dir.

    Runs whose total_steps() is unknown (None) or below 50000 are skipped.
    The remaining runs are grouped by their `key`; for each group a file
    named after the key is written containing metadata (command line of the
    first run, run count, per-run step totals) and, for every requested
    column, the output of get_print_results().

    Args:
        scores: iterable of objects providing `key`, `total_steps()` and
            `commandline()`.
        columns: column names passed to get_print_results().
        save_dir: directory the summary files are written to.
    """
    d = {}
    for s in scores:
        steps = s.total_steps()
        # total_steps() may return None; test for it explicitly instead of
        # relying on Python 2 ordering None before every int.
        if steps is None or steps < 50000:
            continue
        d.setdefault(s.key, []).append(s)

    for i, key in enumerate(d):
        group = d[key]
        ans = {}
        ans['metadata'] = dict(commandline=group[0].commandline(),
                               count=len(group),
                               steps=[s.total_steps() for s in group])
        for col in columns:
            ans[col] = get_print_results(group, col)
        with open(os.path.join(save_dir, key), 'w') as f:
            print(yaml.safe_dump(ans), file=f)
        print("Done %s/%s" % (i+1, len(d)))
and plot_index + 1 != int(args.only_plot.split(',')[0]): 499 | continue 500 | print("Subplot %s/%s" % (full_plot_index+1, len(all_tasks)*len(keys))) 501 | sharex = axes[0][i] 502 | if args.only_plot: 503 | newloc = int(args.only_plot.split(',')[1]) 504 | ax = fig.add_subplot(gs[newloc-1]) 505 | axes[ki][i] = ax 506 | else: 507 | axes[ki][i] = fig.add_subplot(gs[plot_index], sharex=sharex) 508 | if ki == len(keys)-1 and args.startx: 509 | plot_startx(key) 510 | if i == 0 and args.starty: 511 | plot_starty(key) 512 | order = get_value(args.order, i) 513 | if order: 514 | order = map(int, order.split(',')) 515 | plot_all(plot_results, scores, column=key, taskset = [task], order=order) 516 | if not args.global_legend and (not args.one_legend or (ki == len(keys)-1 and 517 | (i == len(all_tasks)-1 or 1))): 518 | pylab.legend(loc=legend_locs.get(key, 0)) 519 | if not args.titles: 520 | pylab.title('Task %s' % task) 521 | else: 522 | pylab.title(args.titles.split('|')[plot_index]) 523 | maxy = None 524 | if key in ('score', 'errors', 'seq-errors'): 525 | maxy = 1 526 | axes[ki][i].yaxis.set_major_formatter(mtick.FuncFormatter( 527 | lambda x, pos: '% 2d\\%%' % (x*100) 528 | )) 529 | ylims = map(float, get_value(args.ylims, ki).split(',')) if args.ylims else (0,1) 530 | pylab.ylim(ylims) 531 | xlims = map(float, get_value(args.xlims, i).split(',')) if args.xlims else (0,None) 532 | pylab.xlim(xlims) 533 | 534 | if args.nbinsx: 535 | pylab.locator_params(axis='x',nbins=int(get_value(args.nbinsx, i))) 536 | if args.nbinsy: 537 | pylab.locator_params(axis='y',nbins=int(get_value(args.nbinsy, ki))) 538 | if args.yticks: 539 | pylab.yticks(map(float, get_value(args.yticks, ki).split(','))) 540 | 541 | axes[ki][i].xaxis.set_major_formatter(mtick.FuncFormatter( 542 | lambda x, pos: '%dk' % (x//1000) if x else '0' 543 | )) 544 | rect = [0,0,1,.92] 545 | if args.global_legend: 546 | if not args.only_plot: 547 | ax = [row for row in axes if row[0]][0][0] 548 | lines,labels = 
def get_value(s, i):
    """Pick the i-th '|'-separated entry of `s`.

    A string without any '|' is returned unchanged for every `i`, so a
    single value applies to all positions.
    """
    parts = s.split('|')
    return parts[0] if len(parts) == 1 else parts[i]
if __name__ == '__main__':
    # Three modes:
    #   magic            -- replay the four argument lists stored in
    #                       magic1.pickle .. magic4.pickle, each drawn into
    #                       its own --only-plot location.
    #   ... dump <file>  -- store the preceding arguments in <file> and exit.
    #   anything else    -- run main() on the given arguments.
    # Guard on the length first so that running without arguments reaches
    # argparse's usage error instead of raising IndexError here.
    if len(sys.argv) > 1 and sys.argv[1] == 'magic':
        for i, loc in enumerate(['3,1', '3,3', '4,2', '4,4']):
            # Pickle data is binary; 'rb' also works on Python 3.
            # NOTE: only load pickle files written by this script's own
            # 'dump' mode -- unpickling untrusted data is unsafe.
            with open('magic%s.pickle' % (i+1), 'rb') as f:
                saved_args = pickle.load(f)
            sys.argv[1:] = saved_args + ['--only-plot', loc]
            main()
        sys.exit()
    if len(sys.argv) > 1 and 'dump' == sys.argv[-2]:
        loc = sys.argv.pop()
        sys.argv.pop()  # drop the 'dump' keyword itself
        with open(loc, 'wb') as f:
            pickle.dump(sys.argv[1:], f)
        sys.exit()
    main()