├── .gitignore ├── README.md ├── akaitsuki-slow ├── config.py ├── feed_dict.pbtxt ├── feed_dict.py └── main.py ├── autotune ├── README.md ├── autograd_lib.py ├── autograd_lib_test.py ├── autograd_test.py ├── ciresan_bench.py ├── curvature_test.py ├── eval_conv2d_approx.py ├── factored_test.py ├── globals.py ├── hessian_test.py ├── linalg_bench.py ├── linesearch_test_disabled.py ├── lyapunov_test.py ├── mnist_end2end_test.py ├── plotting_test.py ├── pytorch_benchmark.py ├── scipy_benchmark.py ├── svd_benchmark.py ├── test │ ├── bad_sigmas.pt │ ├── factored.pt │ └── gesvd_crash.txt ├── train_ciresan.py ├── train_ciresan_cca.py ├── train_ciresan_factored.py ├── train_ciresan_new.py ├── train_medium.py ├── train_small.py ├── train_small_xent.py ├── train_small_xent_factored.py ├── train_tiny.py ├── train_tiny_xent.py ├── util.py └── util_test.py ├── aws-recipes.ipynb ├── aws-scratch.ipynb ├── benchmark_huggingface_predict.py ├── bin └── tfversion ├── clipping-profile.ipynb ├── cluster ├── .gitignore ├── README.md ├── async_adder.py ├── aws.py ├── benchmark_grpc_recv.py ├── benchmarks │ ├── .DS_Store │ ├── .gitignore │ ├── LICENSE │ ├── README.md │ ├── bower_components │ │ ├── d3 │ │ │ ├── .bower.json │ │ │ ├── .gitattributes │ │ │ ├── CONTRIBUTING.md │ │ │ ├── LICENSE │ │ │ ├── README.md │ │ │ ├── bower.json │ │ │ ├── d3.js │ │ │ ├── d3.min.js │ │ │ └── package.js │ │ └── plottable │ │ │ ├── .bower.json │ │ │ ├── bower.json │ │ │ ├── plottable.css │ │ │ ├── plottable.d.ts │ │ │ ├── plottable.js │ │ │ └── plottable.min.js │ ├── dashboard_app │ │ ├── .DS_Store │ │ ├── app.yaml │ │ ├── main.py │ │ ├── main_test.py │ │ ├── requirements.txt │ │ ├── static │ │ │ ├── css │ │ │ │ └── style.css │ │ │ └── js │ │ │ │ └── benchmark_latency_chart.js │ │ └── templates │ │ │ ├── index.html │ │ │ └── test.html │ ├── index.html │ ├── js │ │ ├── csv_benchmark_chart.js │ │ └── latency_chart.js │ ├── scripts │ │ ├── Dockerfile.tf_cnn_benchmarks │ │ ├── benchmark_configs.yml │ │ ├── tf_cnn_benchmarks │ │ │ ├── .DS_Store │ │ │ ├── README.md │ │ │ ├── benchmark_cnn.py │ │ │ ├── benchmark_storage.py │ │ │ ├── cbuild_benchmark_storage.py │ │ │ ├── cnn_util.py │ │ │ ├── convnet_builder.py │ │ │ ├── datasets.py │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ ├── alexnet_model.py │ │ │ │ ├── densenet_model.py │ │ │ │ ├── googlenet_model.py │ │ │ │ ├── inception_model.py │ │ │ │ ├── lenet_model.py │ │ │ │ ├── model.py │ │ │ │ ├── model_config.py │ │ │ │ ├── overfeat_model.py │ │ │ │ ├── resnet_model.py │ │ │ │ ├── trivial_model.py │ │ │ │ └── vgg_model.py │ │ │ ├── preprocessing.py │ │ │ ├── tf_cnn_benchmarks.py │ │ │ └── variable_mgr.py │ │ └── util │ │ │ ├── __init__.py │ │ │ ├── benchmark_util.py │ │ │ ├── benchmark_util_test.py │ │ │ ├── convert_csv_to_json.py │ │ │ └── convert_csv_to_json_test.py │ ├── soumith_benchmarks.html │ └── tools │ │ ├── k8s_tensorflow_lib.py │ │ ├── k8s_tensorflow_test.py │ │ ├── kubectl_util.py │ │ ├── kubectl_util_test.py │ │ └── run_distributed_benchmarks.py ├── client_transfer_benchmark.py ├── cloud-formation-example │ ├── README.md │ ├── iam.yaml │ ├── tensorflow.yaml │ └── zone.sh ├── connect ├── connect.py ├── delete_placement_groups.py ├── fill_efs.py ├── imagenet64 │ ├── README.md │ ├── aws.py │ ├── launch.py │ ├── requirements.txt │ └── variable_mgr.py ├── instance_info.py ├── launch_async_adder.py ├── launch_micro.py ├── launch_ray.py ├── launch_simple_tf.py ├── local_distributed_benchmark.py ├── myutil.py ├── ray_add.py ├── simple_distributed.py ├── terminate_instances.py ├── 
test_aws.py ├── tf-tools │ ├── .gitignore │ ├── benchmark │ │ ├── multi_gpu │ │ │ ├── advanced_tweaks_compare.sh │ │ │ ├── image_classification_bench_tests.sh │ │ │ ├── stats_monitor.sh │ │ │ ├── test_runner.sh │ │ │ └── unit_test_stats_monitor.sh │ │ └── runner │ │ │ ├── cluster_aws.py │ │ │ ├── command_builder.py │ │ │ ├── configs │ │ │ └── aws │ │ │ │ ├── multi_server.yaml │ │ │ │ └── yaroslav.yaml │ │ │ ├── instance_info.py │ │ │ ├── launch_experiment.py │ │ │ ├── test_cluster_aws.py │ │ │ ├── test_command_builder.py │ │ │ └── util.py │ └── install │ │ ├── aws_amzlinux.md │ │ └── aws_ubuntu16_04.md ├── tmux.py └── upload_test.txt ├── conditional_backprop.py ├── configure_tf.sh ├── configure_tf_cpu.sh ├── danjar_peek.py ├── distributed ├── README.md ├── benchmark_grpc_recv.py └── client_transfer_benchmark.py ├── double_memory_bug.py ├── dynamic_stitch_gpu.py ├── dynamic_stitch_gpu_profile.pbtxt ├── eager_lbfgs ├── .ipynb_checkpoints │ └── performance-checkpoint.ipynb ├── common_gd.py ├── data │ ├── short_batch.csv │ ├── short_eager_batch.csv │ ├── short_eager_loss.csv │ ├── short_eager_time.csv │ ├── short_pytorch_loss.csv │ └── short_pytorch_time.csv ├── eager_lbfgs.py ├── performance.ipynb ├── pytorch_lbfgs.py ├── run_experiment.py ├── torch_lbfgs.lua └── util.py ├── enqueue_many_test.py ├── enqueue_many_test_singlerun.py ├── ericyue-slowreader ├── benchmark-batch-noqueuerunners-timeline.json ├── benchmark-batch-noqueuerunners.profile ├── benchmark-batch-noqueuerunners.py ├── benchmark-batch.py ├── benchmark-reader.py ├── benchmark-synthetic-batch.py ├── benchmark-synthetic.py ├── benchmark.py ├── data.zlib └── profile-batch.py ├── example.png ├── free_gpus.py ├── github_pyfunc_slowness.py ├── gpu-memory-transfer.ipynb ├── gpu_oom.py ├── gpu_svd_bench.py ├── graph_template.py ├── graphvis.png ├── imagenet15-scratch.ipynb ├── input_benchmarks ├── convert_to_records.py ├── fully_connected_feed.py ├── fully_connected_preloaded_var.py ├── fully_connected_reader.py ├── timeline.feed.json ├── timeline.reader.json └── timeline.var.json ├── inverse_segfault.py ├── jupyter-version.png ├── keras_autoencoder ├── keras_large.py ├── util.py └── weightnorm.py ├── khatri_rao_benchmark.py ├── lazy_dog.py ├── linalg-benchmark ├── README.md ├── bad_matrix.py ├── benchmark.py ├── environment.yml ├── get_cores_per_socket.py ├── launch.py ├── launch_tensorflow_svd_crash.py ├── requirements.txt ├── results.txt └── tensorflow_svd_crash.py ├── line_search_example ├── data │ └── step_lengths_ada.csv ├── line_search_example.py └── util.py ├── linearize ├── linearize.py ├── linearize_test.py └── memory_util.py ├── matmul_benchmark.py ├── matmul_benchmark_seq.py ├── matmul_times ├── 1080-float16.csv ├── 1080-float32.csv ├── g3-float16.csv ├── g3-float32.csv ├── nvidia-p3-float16.csv ├── nvidia-p3-float32.csv ├── p2-float16.csv └── p2-float32.csv ├── mavelin ├── machine1.py └── machine3.py ├── memory tracking.ipynb ├── memory-probe-examples.ipynb ├── memory-release-check.ipynb ├── natural_gradient_multilayer.py ├── node-merge.ipynb ├── notebook_util.py ├── numpy_initializers ├── kfac_cifar.py └── util.py ├── parallel_dequeue_test.py ├── phantomjs-tryout.ipynb ├── phantomjs-tryout.js ├── pytorch-hessian.ipynb ├── queue_mismatch.py ├── queues_talk ├── queues.ipynb └── slides.pdf ├── resnet_8_simple.pbtxt ├── resnet_leak_report.py ├── resnet_leak_report2.py ├── resource_variable_test.py ├── rotations_comparison.py ├── saving memory by using functions.ipynb ├── simple_rewiring.ipynb ├── simple_train.py ├── 
svd_benchmark.py ├── svd_noconverge.py ├── svd_test.py ├── tensorflow-memory-talk.pdf ├── tf_initializer_bug_report.py ├── tiny_runs ├── qr_test.py └── tiny_tf.py └── whitening_util.py

/.gitignore:
--------------------------------------------------------------------------------
1 | /__pycache__
2 | /.ipynb_checkpoints
3 | *#
4 | *~
5 | /linalg-benchmark/.idea/linalg-benchmark.iml
6 | /linalg-benchmark/.idea/misc.xml
7 | /linalg-benchmark/.idea/modules.xml
8 | /linalg-benchmark/.idea/vcs.xml
9 | /linalg-benchmark/.idea/workspace.xml
10 | /linalg-benchmark/.idea
11 | .DS_Store
12 | __pycache__
13 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # stuff
2 | 
--------------------------------------------------------------------------------
/akaitsuki-slow/config.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | 
3 | 
4 | def str2bool(v):
5 |     return v.lower() in ('y', 'yes', 't', 'true', '1')
6 | 
7 | 
8 | def get_args():
9 |     parser = argparse.ArgumentParser()
10 |     parser.register('type', 'bool', str2bool)
11 | 
12 |     parser.add_argument('--random_seed',
13 |                         type=int,
14 |                         default=1013,
15 |                         help='Random seed')
16 | 
17 |     parser.add_argument('--vocab_size',
18 |                         type=int,
19 |                         default=10000,
20 |                         help='Vocabulary size')
21 | 
22 |     parser.add_argument('--embed_size',
23 |                         type=int,
24 |                         default=128,
25 |                         help='Default embedding size if embedding_file is not given')
26 | 
27 |     parser.add_argument('--hidden_size',
28 |                         type=int,
29 |                         default=128,
30 |                         help='Hidden size of RNN units')
31 | 
32 |     parser.add_argument('--num_labels',
33 |                         type=int,
34 |                         default=96,
35 |                         help='Number of labels')
36 | 
37 |     parser.add_argument('--bidir',
38 |                         type='bool',
39 |                         default=True,
40 |                         help='bidir: whether to use a bidirectional RNN')
41 | 
42 |     parser.add_argument('--num_layers',
43 |                         type=int,
44 |                         default=1,
45 |                         help='Number of RNN layers')
46 | 
47 |     parser.add_argument('--rnn_type',
48 |                         type=str,
49 |                         default='gru',
50 |                         help='RNN type: lstm or gru (default)')
51 | 
52 |     parser.add_argument('--batch_size',
53 |                         type=int,
54 |                         default=32,
55 |                         help='Batch size')
56 | 
57 |     parser.add_argument('--dropout_rate',
58 |                         type=float,
59 |                         default=0.2,
60 |                         help='Dropout rate')
61 | 
62 |     parser.add_argument('--optimizer',
63 |                         type=str,
64 |                         default='sgd',
65 |                         help='Optimizer: sgd (default) or adam or rmsprop')
66 | 
67 |     parser.add_argument('--learning_rate', '-lr',
68 |                         type=float,
69 |                         default=0.1,
70 |                         help='Learning rate for SGD')
71 | 
72 |     return parser.parse_args()
73 | 
74 | 
--------------------------------------------------------------------------------
/akaitsuki-slow/feed_dict.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | from tensorflow.python.client import timeline
4 | 
5 | 
6 | sess = tf.Session()
7 | a = tf.placeholder(tf.float32)
8 | b = a*2
9 | c0 = sess.run([b], feed_dict={a: 2.})  # warm-up run
10 | 
11 | run_metadata = tf.RunMetadata()
12 | run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
13 | run_options.output_partition_graphs = True
14 | 
15 | c0 = sess.run([b], feed_dict={a: 2.}, options=run_options,
16 |               run_metadata=run_metadata)
17 | with open("feed_dict.pbtxt", "w") as f:
18 |     f.write(str(run_metadata))
--------------------------------------------------------------------------------
/autotune/README.md:
--------------------------------------------------------------------------------
1 | To run tests in this directory:
2 | 
3 | ```
4 | pytest
5 | ```
6 | 
7 | If there's a slow test, you can run the test file directly to see timings of individual tests, e.g.
8 | 
9 | ```
10 | python linesearch_test.py
11 | ```
12 | 
--------------------------------------------------------------------------------
/autotune/globals.py:
--------------------------------------------------------------------------------
1 | # Module to hold global variables for curvature computation functions.
2 | # This is needed since functionality may be split over several modules.
3 | 
4 | from typing import Optional
5 | 
6 | import torch
7 | from torch.utils.tensorboard import SummaryWriter
8 | 
9 | event_writer: Optional[SummaryWriter] = None
10 | project_name: Optional[str] = 'train_ciresan'  # project name to use for wandb logging
11 | logdir_base: str = '/ncluster/runs'
12 | run_name: Optional[str] = None  # run name to use, corresponds to logging dir and wandb run name
13 | logdir: Optional[str] = None  # logdir
14 | token_count: int = 0  # TODO(y): rename to global-step. Meaning is context-specific, in case of sequences it's number of tokens
15 | 
16 | args = None  # global arg values
17 | debug_dump_stats: bool = False  # print activations/backprops to console
18 | debug_linalg_crashes: bool = False  # save matrices that cause linalg routines to crash
19 | 
20 | 
21 | # debug_hard_crashes_on_nans: bool = True  # crash if encountering NaN
22 | 
23 | hacks_disable_hess = False
24 | 
25 | 
26 | if torch.cuda.is_available():
27 |     device = torch.device('cuda')
28 |     print("Using GPU")
29 | else:
30 |     device = torch.device('cpu')
31 | 
32 | 
33 | def reset_global_step():
34 |     global token_count
35 |     token_count = 0
36 | 
37 | 
38 | def increment_global_step(incr: int):
39 |     global token_count
40 |     token_count += incr
41 | 
42 | 
43 | def get_global_step() -> int:
44 |     return token_count
45 | 
46 | 
47 | 
--------------------------------------------------------------------------------
/autotune/linalg_bench.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import time
4 | from typing import Optional, Tuple, Callable
5 | 
6 | # import torch
7 | import scipy.linalg
8 | import torch
9 | from torchcurv.optim import SecondOrderOptimizer
10 | 
11 | 
12 | import torch.nn as nn
13 | 
14 | import util as u
15 | 
16 | import numpy as np
17 | 
18 | """
19 | MKL version unknown
20 | PyTorch version 1.2.0
21 | Scipy version: 1.2.1
22 | Numpy version: 1.16.4
23 | 1024-by-1024 matrix
24 | 7079.93 linalg.solve_lyapunov
25 | 280.11 linalg.pinvh
26 | 1186.08 linalg.pinv
27 | 49.18 linalg.inv
28 | 118.23 qr
29 | 413.42 svd
30 | """
31 | 
32 | class Net(nn.Module):
33 |     def __init__(self, d):
34 |         super().__init__()
35 |         self.w = nn.Linear(d, 1, bias=False)
36 | 
37 |     def forward(self, x: torch.Tensor):
38 |         result = self.w(x)
39 |         return result
40 | 
41 | 
42 | class timeit:
43 |     """Context manager that measures time spent in the block in milliseconds
44 |     and prints it together with the given tag.
45 |     """
46 | 
47 |     def __init__(self, tag=""):
48 |         self.tag = tag
49 | 
50 |     def __enter__(self):
51 |         self.start = time.perf_counter()
52 |         return self
53 | 
54 |     def __exit__(self, *args):
55 |         self.end = time.perf_counter()
56 |         interval_ms = 1000 * (self.end - self.start)
57 |         print(f"{interval_ms:8.2f} {self.tag}")
58 | 
59 | 
60 | def get_mkl_version():
61 |     import ctypes
62 |     import numpy as np
63 | 
64 |     # this recipe only works on Linux
65 |     try:
66 |         ver = np.zeros(199, dtype=np.uint8)
67 |         mkl = ctypes.cdll.LoadLibrary("libmkl_rt.so")
68 |         mkl.MKL_Get_Version_String(ver.ctypes.data_as(ctypes.c_char_p), 198)
69 |         return ver[ver != 0].tostring()
70 |     except:
71 |         return 'unknown'
72 | 
73 | 
74 | def print_cpu_info():
75 |     ver = 'unknown'
76 |     try:
77 |         for l in open("/proc/cpuinfo").read().split('\n'):
78 |             if 'model name' in l:
79 |                 ver = l
80 |                 break
81 |     except:
82 |         pass
83 |     print("CPU version:", ver)
84 | 
85 | def linalg_bench():
86 |     if np.__config__.get_info("lapack_mkl_info"):
87 |         print("MKL version", get_mkl_version())
88 |     else:
89 |         print("not using MKL")
90 | 
91 |     print("PyTorch version", torch.version.__version__)
92 | 
93 |     print("Scipy version: ", scipy.version.full_version)
94 |     print("Numpy version: ", np.version.full_version)
95 | 
96 |     for d in [1024]:
97 |         print(f"{d}-by-{d} matrix")
98 |         n = 10000
99 |         assert n > 2*d  # to prevent singularity
100 |         X = np.random.random((d, n))
101 |         Y = np.random.random((d, n))
102 |         H = X @ X.T
103 |         S = Y @ Y.T
104 | 
105 |         with timeit(f"linalg.solve_lyapunov"):
106 |             result = scipy.linalg.solve_lyapunov(H, S)
107 |             #print(result[0,0])
108 | 
109 |         with timeit(f"linalg.pinvh"):
110 |             result = scipy.linalg.pinvh(H)
111 |             #print(result[0, 0])
112 | 
113 |         with timeit(f"linalg.pinv"):
114 |             result = scipy.linalg.pinv(H)
115 |             #print(result[0, 0])
116 | 
117 | 
118 |         with timeit(f"linalg.inv"):
119 |             result = scipy.linalg.inv(H)
120 |             #print(result[0, 0])
121 | 
122 |         with timeit(f"qr"):
123 |             result = scipy.linalg.qr(H)
124 |             #print(result[0, 0])
125 | 
126 |         with timeit(f"qr-pivoting"):
127 |             result = scipy.linalg.qr(H, pivoting=True)
128 |             #print(result[0, 0])
129 | 
130 |         with timeit(f"svd"):
131 |             result = scipy.linalg.svd(H)
132 |             #print(result[0, 0])
133 | 
134 | 
135 | 
136 | if __name__ == '__main__':
137 |     linalg_bench()
138 | 
--------------------------------------------------------------------------------
/autotune/pytorch_benchmark.py:
--------------------------------------------------------------------------------
1 | """
2 | (pytorch_p36) [ec2-user@ip-172-31-6-232 cifar]$ python pytorch_benchmark.py
3 | MKL version b'Intel(R) Math Kernel Library Version 2019.0.4 Product Build 20190411 for Intel(R) 64 architecture applications'
4 | PyTorch version 1.1.0
5 | Scipy version: 1.3.0
6 | Numpy version: 1.16.4
7 | Benchmarking 1024-by-1024 matrix on cuda:0
8 | 882.84 svd
9 | 17.22 inv
10 | 227.04 pinv
11 | 452.77 eig
12 | 227.18 svd
13 | 
14 | 
15 | Laptop
16 | 
17 | MKL version unknown
18 | PyTorch version 1.2.0
19 | Scipy version: 1.2.1
20 | Numpy version: 1.16.4
21 | CPU version: unknown
22 | CPU logical cores: 8
23 | CPU physical cores: 4
24 | CPU physical sockets: 0
25 | Benchmarking 1024-by-1024 matrix on cpu
26 | 170.24 svd
27 | 22.41 inv
28 | 206.70 pinv
29 | 247.92 eig
30 | 180.16 pinverse
31 | 20.08 solve
32 | 124.89 svd
33 | 14.57 inv
34 | 197.24 pinv
35 | 221.06 eig
36 | 213.46 pinverse
37 | 21.75 solve
38 | 
39 | """
40 | import os
41 | import sys
42 | import time
43 | 
44 | import numpy as np
45 | 
46 | import util as u
47 | 
48 | import torch
49 | 
50 | # from @eamartin
51 | def empty_aligned(n, align):
52 |     """Get n bytes of memory with alignment align."""
53 |     a = np.empty(n + (align - 1), dtype=np.float32)
54 |     data_align = a.ctypes.data % align
55 |     offset = 0 if data_align == 0 else (align - data_align)
56 |     return a[offset: offset + n]
57 | 
58 | 
59 | def benchmark(method):
60 |     # writing one element of the result to /dev/null forces it to be materialized without cluttering stdout
61 |     start_time = time.time()
62 |     times = []
63 | 
64 |     for i in range(1):
65 |         if method == 'svd':
66 |             _result = torch.svd(H)
67 |             open('/dev/null', 'w').write(str(_result[0]))
68 |         elif method == 'inv':
69 |             _result = torch.inverse(H)
70 |             open('/dev/null', 'w').write(str(_result[0]))
71 |         elif method == 'pinv':
72 |             _result = u.pinv(H)
73 |             open('/dev/null', 'w').write(str(_result[0]))
74 |         elif method == 'pinverse':
75 |             _result = torch.pinverse(H)
76 |             open('/dev/null', 'w').write(str(_result[0]))
77 |         elif method == 'eig':
78 |             _result = torch.symeig(H, eigenvectors=True)
79 |             open('/dev/null', 'w').write(str(_result[0]))
80 |         elif method == 'solve':
81 |             _result = torch.solve(S, H)
82 |             open('/dev/null', 'w').write(str(_result[0]))
83 |         else:
84 |             assert False
85 |         new_time = time.time()
86 |         elapsed_time = 1000 * (new_time - start_time)
87 |         print(f"{elapsed_time:8.2f} {method}")
88 |         start_time = new_time
89 |         times.append(elapsed_time)
90 | 
91 | 
92 | if __name__ == '__main__':
93 |     methods = ['svd', 'inv', 'pinv', 'eig', 'pinverse', 'solve']*2
94 | 
95 |     u.print_version_info()
96 |     d = 1024
97 | 
98 |     x0 = torch.rand(d).reshape((d, 1)).float()
99 | 
100 |     X = torch.rand((d, 10000))
101 |     Y = torch.rand((d, 10000))
102 |     H = X @ X.t()
103 |     S = Y @ Y.t()
104 | 
105 |     if torch.cuda.is_available():
106 |         [x0, X, Y, H, S] = u.move_to_gpu([x0, X, Y, H, S])
107 | 
108 |     print(f"Benchmarking {d}-by-{d} matrix on {x0.device}")
109 |     for method in methods:
110 |         benchmark(method)
111 | 
112 | # Other timings: svd
113 | # n=1000 Times: min: 126.04, median: 132.48
114 | # n=2000 Times: min: 573.03, median: 621.49
115 | # n=4096 Times: min: 5586.02, median: 6032.16
116 | # Other timings: inv
117 | # Times: min: 17.87, median: 23.41, mean: 27.90
--------------------------------------------------------------------------------
/autotune/svd_benchmark.py:
--------------------------------------------------------------------------------
1 | # Fastest way to compute eigenvectors for 4k matrix?
2 | #
3 | # Inverse on i3.metal
4 | # n=4096: 368 ms ± 1.51 ms per loop
5 | #
6 | # Xeon V3 benchmarks:
7 | # n=4096 eigs min: 27758.34, median: 28883.69
8 | # n=4096 gesdd min: 7241.70, median: 8477.95
9 | # n=4096 gesvd min: 20487.48, median: 22057.64
10 | # n=4096 inv min: 556.67, median: 579.25
11 | # n=4096 linsolve: min: 534.40, median: 558.06, mean: 579.19
12 | #
13 | # Xeon V4:
14 | # n=4096 gesdd min: 5586.02, median: 6032.16
15 | #
16 | #
17 | # i7-5820K CPU @ 3.30GHz
18 | # n=4096 gesdd 7288.02, median: 7397.23, mean: 7478.78
19 | # n=4096 inv 520 msec
20 | #
21 | # after upgrading things
22 | # b'Intel(R) Math Kernel Library Version 2017.0.3 Product Build 20170413 for Intel(R) 64 architecture applications'
23 | # n=4096 inv 1427.54
24 | 
25 | 
26 | from scipy import linalg  # for svd
27 | import numpy as np
28 | import time
29 | import sys
30 | 
31 | 
32 | # from @eamartin
33 | def empty_aligned(n, align):
34 |     """Get n bytes of memory with alignment align."""
35 |     a = np.empty(n + (align - 1), dtype=np.float32)
36 |     data_align = a.ctypes.data % align
37 |     offset = 0 if data_align == 0 else (align - data_align)
38 |     return a[offset : offset + n]
39 | 
40 | 
41 | def benchmark(method):
42 |     n = 1024
43 |     x_old = np.random.randn(n*n).reshape((n, n)).astype(dtype=np.float32)
44 |     x = empty_aligned(n*n, 32).reshape((n, n))
45 |     x[:] = x_old
46 |     x = x @ x.T
47 | 
48 |     x0 = np.random.randn(n).reshape((n, 1)).astype(dtype=np.float32)
49 | 
50 |     start_time = time.time()
51 |     times = []
52 | 
53 |     for i in range(1):
54 |         if method == 'gesdd':
55 |             result = linalg.svd(x)
56 |         elif method == 'gesvd':
57 |             result = linalg.svd(x, lapack_driver='gesvd')
58 |         elif method == 'eigh':
59 |             result = linalg.eigh(x)
60 |         elif method == 'inv':
61 |             result = linalg.inv(x)
62 |         elif method == 'inv2':
63 |             result = linalg.inv(x, overwrite_a=True)
64 |         elif method == 'linsolve':
65 |             result = linalg.solve(x, x0)
66 |         else:
67 |             assert False
68 |         new_time = time.time()
69 |         elapsed_time = 1000*(new_time - start_time)
70 |         print(f"{elapsed_time:8.2f} {method}")
71 |         start_time = new_time
72 |         times.append(elapsed_time)
73 | 
74 | 
75 | if __name__ == '__main__':
76 |     methods = ['gesdd', 'gesvd', 'eigh', 'inv', 'inv2', 'linsolve']
77 | 
78 |     for method in methods:
79 |         benchmark(method)
80 | 
81 | 
82 | 
83 | 
84 | # Other timings: svd
85 | # n=1000 Times: min: 126.04, median: 132.48
86 | # n=2000 Times: min: 573.03, median: 621.49
87 | # n=4096 Times: min: 5586.02, median: 6032.16
88 | # Other timings: inv
89 | # Times: min: 17.87, median: 23.41, mean: 27.90
90 | 
--------------------------------------------------------------------------------
/autotune/test/bad_sigmas.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yaroslavvb/stuff/a8024ead315aa1b5d6976940b3a062178f0e499d/autotune/test/bad_sigmas.pt
--------------------------------------------------------------------------------
/autotune/test/factored.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yaroslavvb/stuff/a8024ead315aa1b5d6976940b3a062178f0e499d/autotune/test/factored.pt
--------------------------------------------------------------------------------
/bin/tfversion:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os
3 | os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
4 | import tensorflow as tf
5 | version=tf.__version__
6 | print("version: %s"%(version,))
7 | commit
= tf.__git_version__ 8 | print("__git_version__: %s"%(commit,)) 9 | # commit looks like this 10 | # 'v1.0.0-65-g4763edf-dirty' 11 | commit = commit.replace("'","") 12 | if commit.endswith('-dirty'): 13 | dirty = True 14 | commit = commit[:-len('-dirty')] 15 | commit=commit.rsplit('-g', 1)[1] 16 | url = 'https://github.com/tensorflow/tensorflow/commit/'+commit 17 | print("Commit %s" %(url,)) -------------------------------------------------------------------------------- /cluster/.gitignore: -------------------------------------------------------------------------------- 1 | /.DS_Store 2 | -------------------------------------------------------------------------------- /cluster/README.md: -------------------------------------------------------------------------------- 1 | # cluster 2 | train on AWS 3 | -------------------------------------------------------------------------------- /cluster/benchmarks/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaroslavvb/stuff/a8024ead315aa1b5d6976940b3a062178f0e499d/cluster/benchmarks/.DS_Store -------------------------------------------------------------------------------- /cluster/benchmarks/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /cluster/benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Instructions for adding distributed benchmarks to continuous run: 2 | 3 | 1. You can add your benchmark file under 4 | [tensorflow/benchmarks/scripts](https://github.com/tensorflow/benchmarks/tree/master/scripts) directory. The benchmark should accept `task_index`, `job_name`, `ps_hosts` and `worker_hosts` flags. You can copy-paste the following flag definitions: 5 | 6 | ```python 7 | tf.app.flags.DEFINE_integer("task_index", None, "Task index, should be >= 0.") 8 | tf.app.flags.DEFINE_string("job_name", None, "job name: worker or ps") 9 | tf.app.flags.DEFINE_string("ps_hosts", None, "Comma-separated list of hostname:port pairs") 10 | tf.app.flags.DEFINE_string("worker_hosts", None, "Comma-separated list of hostname:port pairs") 11 | ``` 12 | 2. Report benchmark values by calling `store_data_in_json` from your benchmark 13 | code. This function is defined in 14 | [benchmark\_util.py](https://github.com/tensorflow/benchmarks/blob/master/scripts/util/benchmark_util.py). 15 | 3. Create a Dockerfile that sets up dependencies and runs your benchmark. For 16 | example, see [Dockerfile.tf\_cnn\_benchmarks](https://github.com/tensorflow/benchmarks/blob/master/scripts/Dockerfile.tf_cnn_benchmarks). 17 | 4. Add the benchmark to 18 | [benchmark\_configs.yml](https://github.com/tensorflow/benchmarks/blob/master/scripts/benchmark_configs.yml) 19 | * Set `benchmark_name` to a descriptive name for your benchmark and make sure 20 | it is unique. 21 | * Set `worker_count` and `ps_count`. 22 | * Set `docker_file` to the Dockerfile path starting with `benchmarks/` 23 | directory. 24 | * Optionally, you can pass flags to your benchmark by adding `args` list. 25 | 5. Send PR with the changes to annarev. 26 | 27 | Currently running benchmarks: 28 | https://benchmarks-dot-tensorflow-testing.appspot.com/ 29 | 30 | For any questions, please contact annarev@google.com. 
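Putting steps 1 and 2 together, a minimal benchmark skeleton might look like the sketch below. This is illustrative only: it uses the standard TF 1.x `tf.train.ClusterSpec`/`tf.train.Server` API, and the reporting call is elided because the exact `store_data_in_json` signature is defined in benchmark_util.py.

```python
import time

import tensorflow as tf

from util import benchmark_util  # scripts/util/benchmark_util.py

tf.app.flags.DEFINE_integer("task_index", None, "Task index, should be >= 0.")
tf.app.flags.DEFINE_string("job_name", None, "job name: worker or ps")
tf.app.flags.DEFINE_string("ps_hosts", None, "Comma-separated list of hostname:port pairs")
tf.app.flags.DEFINE_string("worker_hosts", None, "Comma-separated list of hostname:port pairs")
FLAGS = tf.app.flags.FLAGS


def main(_):
  cluster = tf.train.ClusterSpec({"ps": FLAGS.ps_hosts.split(","),
                                  "worker": FLAGS.worker_hosts.split(",")})
  server = tf.train.Server(cluster, job_name=FLAGS.job_name,
                           task_index=FLAGS.task_index)
  if FLAGS.job_name == "ps":
    server.join()  # parameter servers serve variables until killed
    return

  with tf.Session(server.target) as sess:
    start = time.time()
    # ... build and sess.run(...) the graph being benchmarked ...
    elapsed = time.time() - start
    # Report {benchmark name: value} via benchmark_util.store_data_in_json;
    # see util/benchmark_util.py for the exact signature.


if __name__ == "__main__":
  tf.app.run()
```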
31 | -------------------------------------------------------------------------------- /cluster/benchmarks/bower_components/d3/.bower.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "d3", 3 | "version": "3.5.5", 4 | "main": "d3.js", 5 | "scripts": [ 6 | "d3.js" 7 | ], 8 | "ignore": [ 9 | ".DS_Store", 10 | ".git", 11 | ".gitignore", 12 | ".npmignore", 13 | ".spmignore", 14 | ".travis.yml", 15 | "Makefile", 16 | "bin", 17 | "component.json", 18 | "composer.json", 19 | "index.js", 20 | "lib", 21 | "node_modules", 22 | "package.json", 23 | "src", 24 | "test" 25 | ], 26 | "homepage": "https://github.com/mbostock-bower/d3-bower", 27 | "_release": "3.5.5", 28 | "_resolution": { 29 | "type": "version", 30 | "tag": "v3.5.5", 31 | "commit": "264ea13e4ed8583b37a91f7640aa22fdee6b2f26" 32 | }, 33 | "_source": "https://github.com/mbostock-bower/d3-bower.git", 34 | "_target": "3.5.5", 35 | "_originalSource": "d3" 36 | } -------------------------------------------------------------------------------- /cluster/benchmarks/bower_components/d3/.gitattributes: -------------------------------------------------------------------------------- 1 | bower.json -diff merge=ours 2 | component.json -diff merge=ours 3 | d3.js -diff merge=ours 4 | d3.min.js -diff merge=ours 5 | package.js -diff merge=ours 6 | -------------------------------------------------------------------------------- /cluster/benchmarks/bower_components/d3/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | **Important:** these GitHub issues are for *bug reports and feature requests only*. Please use [StackOverflow](http://stackoverflow.com/questions/tagged/d3.js) or the [d3-js Google group](https://groups.google.com/d/forum/d3-js) for general help. 4 | 5 | If you’re looking for ways to contribute, please [peruse open issues](https://github.com/mbostock/d3/issues?milestone=&page=1&state=open). The icebox is a good place to find ideas that are not currently in development. If you already have an idea, please check past issues to see whether your idea or a similar one was previously discussed. 6 | 7 | Before submitting a pull request, consider implementing a live example first, say using [bl.ocks.org](http://bl.ocks.org). Real-world use cases go a long way to demonstrating the usefulness of a proposed feature. The more complex a feature’s implementation, the more usefulness it should provide. Share your demo using the #d3js tag on Twitter or by sending it to the [d3-js Google group](https://groups.google.com/d/forum/d3-js). 8 | 9 | If your proposed feature does not involve changing core functionality, consider submitting it instead as a [D3 plugin](https://github.com/d3/d3-plugins). New core features should be for general use, whereas plugins are suitable for more specialized use cases. When in doubt, it’s easier to start with a plugin before “graduating” to core. 10 | 11 | To contribute new documentation or add examples to the gallery, just [edit the Wiki](https://github.com/mbostock/d3/wiki)! 12 | 13 | ## How to Submit a Pull Request 14 | 15 | 1. Click the “Fork” button to create your personal fork of the D3 repository. 16 | 17 | 2. After cloning your fork of the D3 repository in the terminal, run `npm install` to install D3’s dependencies. 18 | 19 | 3. Create a new branch for your new feature. For example: `git checkout -b my-awesome-feature`. 
A dedicated branch for your pull request means you can develop multiple features at the same time, and ensures that your pull request is stable even if you later decide to develop an unrelated feature. 20 | 21 | 4. The `d3.js` and `d3.min.js` files are built from source files in the `src` directory. _Do not edit `d3.js` directly._ Instead, edit the source files, and then run `make` to build the generated files. 22 | 23 | 5. Use `make test` to run tests and verify your changes. If you are adding a new feature, you should add new tests! If you are changing existing functionality, make sure the existing tests run, or update them as appropriate. 24 | 25 | 6. Sign D3’s [Individual Contributor License Agreement](https://docs.google.com/forms/d/1CzjdBKtDuA8WeuFJinadx956xLQ4Xriv7-oDvXnZMaI/viewform). Unless you are submitting a trivial patch (such as fixing a typo), this form is needed to verify that you are able to contribute. 26 | 27 | 7. Submit your pull request, and good luck! 28 | -------------------------------------------------------------------------------- /cluster/benchmarks/bower_components/d3/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010-2015, Michael Bostock 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * The name Michael Bostock may not be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL MICHAEL BOSTOCK BE LIABLE FOR ANY DIRECT, 21 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 22 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 25 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 26 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /cluster/benchmarks/bower_components/d3/README.md: -------------------------------------------------------------------------------- 1 | # Data-Driven Documents 2 | 3 | 4 | 5 | **D3.js** is a JavaScript library for manipulating documents based on data. **D3** helps you bring data to life using HTML, SVG and CSS. D3’s emphasis on web standards gives you the full capabilities of modern browsers without tying yourself to a proprietary framework, combining powerful visualization components and a data-driven approach to DOM manipulation. 6 | 7 | Want to learn more? 
[See the wiki.](https://github.com/mbostock/d3/wiki) 8 | 9 | For examples, [see the gallery](https://github.com/mbostock/d3/wiki/Gallery) and [mbostock’s bl.ocks](http://bl.ocks.org/mbostock). 10 | -------------------------------------------------------------------------------- /cluster/benchmarks/bower_components/d3/bower.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "d3", 3 | "version": "3.5.5", 4 | "main": "d3.js", 5 | "scripts": [ 6 | "d3.js" 7 | ], 8 | "ignore": [ 9 | ".DS_Store", 10 | ".git", 11 | ".gitignore", 12 | ".npmignore", 13 | ".spmignore", 14 | ".travis.yml", 15 | "Makefile", 16 | "bin", 17 | "component.json", 18 | "composer.json", 19 | "index.js", 20 | "lib", 21 | "node_modules", 22 | "package.json", 23 | "src", 24 | "test" 25 | ] 26 | } 27 | -------------------------------------------------------------------------------- /cluster/benchmarks/bower_components/d3/package.js: -------------------------------------------------------------------------------- 1 | // Package metadata for Meteor.js. 2 | 3 | Package.describe({ 4 | name: "d3js:d3", // http://atmospherejs.com/d3js/d3 5 | summary: "D3 (official): A JavaScript visualization library for HTML and SVG.", 6 | version: "3.5.5", 7 | git: "https://github.com/mbostock/d3.git" 8 | }); 9 | 10 | Package.onUse(function(api) { 11 | api.versionsFrom(["METEOR@1.0"]); 12 | api.addFiles("d3.js", "client"); 13 | }); 14 | -------------------------------------------------------------------------------- /cluster/benchmarks/bower_components/plottable/.bower.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "plottable", 3 | "description": "A modular charting library built on D3", 4 | "version": "2.2.0", 5 | "main": [ 6 | "plottable.js", 7 | "plottable.css" 8 | ], 9 | "typescript": { 10 | "definition": "plottable.d.ts" 11 | }, 12 | "license": "MIT", 13 | "ignore": [ 14 | "**/*", 15 | "!bower.json", 16 | "!plottable.js", 17 | "!plottable.css", 18 | "!plottable.min.js", 19 | "!plottable.d.ts" 20 | ], 21 | "keywords": [ 22 | "plottable", 23 | "plottablejs", 24 | "plottable.js", 25 | "d3", 26 | "data viz", 27 | "chart", 28 | "charts", 29 | "reusable charts", 30 | "visualization", 31 | "scatterplot", 32 | "bar chart", 33 | "plot", 34 | "plots" 35 | ], 36 | "dependencies": { 37 | "d3": "3.5.5" 38 | }, 39 | "homepage": "http://plottablejs.org", 40 | "repository": { 41 | "type": "git", 42 | "url": "git://github.com/palantir/plottable.git" 43 | }, 44 | "devDependencies": { 45 | "chai": "2.0.0", 46 | "mocha": "2.2.5", 47 | "jQuery": "2.1.0", 48 | "jquery.simulate": "1.2.0", 49 | "requirejs": "2.1.18", 50 | "sinon": "1.16.1" 51 | }, 52 | "_release": "2.2.0", 53 | "_resolution": { 54 | "type": "version", 55 | "tag": "v2.2.0", 56 | "commit": "e36001d8b6640cd23599905255d61b4ab58a648d" 57 | }, 58 | "_source": "https://github.com/palantir/plottable.git", 59 | "_target": "^2.2.0", 60 | "_originalSource": "plottable", 61 | "_direct": true 62 | } -------------------------------------------------------------------------------- /cluster/benchmarks/bower_components/plottable/bower.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "plottable", 3 | "description": "A modular charting library built on D3", 4 | "version": "2.2.0", 5 | "main": [ 6 | "plottable.js", 7 | "plottable.css" 8 | ], 9 | "typescript": { 10 | "definition": "plottable.d.ts" 11 | }, 12 | "license": "MIT", 13 | "ignore": [ 14 | "**/*", 
15 | "!bower.json", 16 | "!plottable.js", 17 | "!plottable.css", 18 | "!plottable.min.js", 19 | "!plottable.d.ts" 20 | ], 21 | "keywords": [ 22 | "plottable", 23 | "plottablejs", 24 | "plottable.js", 25 | "d3", 26 | "data viz", 27 | "chart", 28 | "charts", 29 | "reusable charts", 30 | "visualization", 31 | "scatterplot", 32 | "bar chart", 33 | "plot", 34 | "plots" 35 | ], 36 | "dependencies": { 37 | "d3": "3.5.5" 38 | }, 39 | "homepage": "http://plottablejs.org", 40 | "repository": { 41 | "type": "git", 42 | "url": "git://github.com/palantir/plottable.git" 43 | }, 44 | "devDependencies": { 45 | "chai": "2.0.0", 46 | "mocha": "2.2.5", 47 | "jQuery": "2.1.0", 48 | "jquery.simulate": "1.2.0", 49 | "requirejs": "2.1.18", 50 | "sinon": "1.16.1" 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /cluster/benchmarks/dashboard_app/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaroslavvb/stuff/a8024ead315aa1b5d6976940b3a062178f0e499d/cluster/benchmarks/dashboard_app/.DS_Store -------------------------------------------------------------------------------- /cluster/benchmarks/dashboard_app/app.yaml: -------------------------------------------------------------------------------- 1 | runtime: python 2 | env: flex 3 | entrypoint: gunicorn -b :$PORT main:app 4 | service: benchmarks 5 | 6 | runtime_config: 7 | python_version: 3 8 | -------------------------------------------------------------------------------- /cluster/benchmarks/dashboard_app/main_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | import json 17 | import main 18 | import unittest 19 | import urllib 20 | 21 | class TestMain(unittest.TestCase): 22 | 23 | def testArgumentInvalidFormat(self): 24 | self.assertEqual('', main.argument_name('')) 25 | self.assertEqual('', main.argument_name('arg=val')) 26 | self.assertEqual('', main.argument_name('-arg=val')) 27 | self.assertEqual('', main.argument_name('--argval')) 28 | self.assertEqual('', main.argument_name('--=val')) 29 | self.assertEqual('', main.argument_name('--=')) 30 | 31 | def testArgumentValidFormat(self): 32 | self.assertEqual('abc', main.argument_name('--abc=123')) 33 | self.assertEqual('a', main.argument_name('--a=123')) 34 | 35 | def testIndexPage(self): 36 | main.app.testing = True 37 | client = main.app.test_client() 38 | 39 | r = client.get('/') 40 | self.assertEqual(200, r.status_code) 41 | self.assertIn('sample_logged_benchmark', r.data.decode('utf-8')) 42 | 43 | def testTestPage_InvalidTest(self): 44 | main.app.testing = True 45 | client = main.app.test_client() 46 | 47 | r = client.get('/test/abc') 48 | self.assertEqual(200, r.status_code) 49 | self.assertIn('No data for benchmark', str(r.data)) 50 | 51 | def testTestPage_SampleTest(self): 52 | main.app.testing = True 53 | client = main.app.test_client() 54 | sample_benchmark_name = '//tensorflow/examples/benchmark:sample_logged_benchmark' 55 | 56 | r = client.get( 57 | '/test/%252F%252Ftensorflow%252Fexamples%252Fbenchmark%253Asample_logged_benchmark') 58 | self.assertEqual(200, r.status_code) 59 | self.assertIn( 60 | 'Performance plots for %s' % sample_benchmark_name, str(r.data)) 61 | 62 | def testFetchBenchmarkData_InvalidTest(self): 63 | main.app.testing = True 64 | client = main.app.test_client() 65 | 66 | r = client.get('/benchmark_data/?test=abc&entry=cde') 67 | self.assertEqual(200, r.status_code) 68 | self.assertEqual(b'[]', r.data) 69 | 70 | def testFetchBenchmarkData_SampleTest(self): 71 | main.app.testing = True 72 | client = main.app.test_client() 73 | 74 | encoded_benchmark_name = ( 75 | '/test/%252F%252Ftensorflow%252Fexamples%252Fbenchmark%253Asample_logged_benchmark') 76 | r = client.get('/benchmark_data/?test=%s&entry=SampleBenchmark.sum_wall_time' % 77 | encoded_benchmark_name) 78 | self.assertEqual(200, r.status_code) 79 | self.assertEqual(b'[]', r.data) 80 | 81 | 82 | if __name__ == '__main__': 83 | unittest.main() 84 | -------------------------------------------------------------------------------- /cluster/benchmarks/dashboard_app/requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==0.12.2 2 | gunicorn==19.7.1 3 | google-cloud 4 | -------------------------------------------------------------------------------- /cluster/benchmarks/dashboard_app/static/css/style.css: -------------------------------------------------------------------------------- 1 | /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ============================================================================== 15 | */ 16 | 17 | body { 18 | font-family: roboto, sans-serif; 19 | } 20 | 21 | h2 { 22 | font-weight: 400; 23 | } 24 | 25 | em { 26 | color: #666666; 27 | font-size: 18px; 28 | font-style: normal; 29 | } 30 | 31 | .outer_div { 32 | max-width: 1000px; 33 | margin: 20px; 34 | } 35 | 36 | table, th, td { 37 | border-collapse: collapse; 38 | border: 1px solid #d9d9d9; 39 | } 40 | 41 | th, td { 42 | padding: 15px; 43 | } 44 | 45 | th { 46 | text-align: left; 47 | font-weight: normal; 48 | } 49 | 50 | ul { 51 | width: 100%; 52 | margin: 0; 53 | padding: 0; 54 | } 55 | 56 | li { 57 | font-size: 14px; 58 | background-color: white; 59 | list-style: none; 60 | border: 1px solid #d9d9d9; 61 | border-radius: 2px; 62 | margin: 10px 0 0 0; 63 | } 64 | 65 | li:hover { 66 | background-color: #eeeeee; 67 | } 68 | 69 | li a { 70 | display: inline-block; 71 | width: 100%; 72 | height: 100%; 73 | color: black; 74 | text-decoration: none; 75 | padding: 8px 8px; 76 | } 77 | 78 | svg { 79 | margin-top: 20px; 80 | } 81 | 82 | #filter_input { 83 | display: block; 84 | width: 100%; 85 | font-size: 14px; 86 | padding: 8px 8px; 87 | border: 1px solid #d9d9d9; 88 | border-radius: 2px; 89 | box-sizing: border-box; 90 | } 91 | 92 | #filter_label, #arguments_label { 93 | color: #666666; 94 | font-size: 16px; 95 | } 96 | 97 | #latest_value_label { 98 | margin-bottom: 20px; 99 | } 100 | 101 | plottable .title-label text{ 102 | font-size: 16px; 103 | font-family: roboto, sans-serif; 104 | } 105 | -------------------------------------------------------------------------------- /cluster/benchmarks/dashboard_app/static/js/benchmark_latency_chart.js: -------------------------------------------------------------------------------- 1 | // Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | /** 16 | * @fileoverview Provides a way to create a benchmark latency chart. 17 | */ 18 | 19 | /** 20 | * Constructor. 21 | * @param {string} svg_element_id svg element to add the chart to. 22 | * @param {string} test_id of the test to plot data for. 23 | * @param {string} entry_id of the specific test entry to plot. 24 | */ 25 | var BenchmarkLatencyChart = function(svg_element, test_id, entry_id) { 26 | this.svg_element = svg_element; 27 | this.test_id = test_id; 28 | this.entry_id = entry_id; 29 | }; 30 | 31 | /** 32 | * Adds data to the given plots. 
33 |  */
34 | BenchmarkLatencyChart.prototype.addData_ = function(plot) {
35 |   const encodedTestId = encodeURIComponent(this.test_id);
36 |   const encodedEntryId = encodeURIComponent(this.entry_id);
37 |   const jsonDataUrl =
38 |       '/benchmark_data/?test=' + encodedTestId + '&entry=' + encodedEntryId;
39 |   d3.json(jsonDataUrl, (data) => {  // arrow function keeps `this` bound to the chart
40 |     let benchmarks = [];
41 |     for (var i = 0; i < data.length; i++) {
42 |       const name = this.entry_id;
43 |       const timestamp = new Date(+data[i]['start'] / 1000);
44 |       const mean_latency = data[i]['timing'];
45 |       benchmarks.push(
46 |           {name: name, timestamp: timestamp,
47 |            mean_latency: +mean_latency});
48 |     }
49 |     plot.addDataset(
50 |         new Plottable.Dataset(benchmarks, {name: 'Forward'}));
51 |   });
52 | };
53 | 
54 | /**
55 |  * Create the chart.
56 |  */
57 | BenchmarkLatencyChart.prototype.makeChart = function() {
58 |   const xScale = new Plottable.Scales.Time();
59 |   const yScaleForward = new Plottable.Scales.Linear();
60 | 
61 |   const plot = new LatencyChart(
62 |       this.entry_id, 'value',
63 |       xScale, yScaleForward);
64 | 
65 |   this.addData_(plot);
66 | 
67 |   const table = new Plottable.Components.Table([[plot.table]]);
68 |   table.renderTo(this.svg_element);
69 | 
70 |   plot.addTooltip();
71 |   new Plottable.Interactions.Click()
72 |       .attachTo(plot.linePlot)
73 |       .onClick(function(p) {
74 |         plot.updateForPosition(p);
75 |       });
76 | };
77 | 
--------------------------------------------------------------------------------
/cluster/benchmarks/dashboard_app/templates/index.html:
--------------------------------------------------------------------------------
[HTML markup lost during extraction; the surviving visible text is:]
Filter
--------------------------------------------------------------------------------
/cluster/benchmarks/dashboard_app/templates/test.html:
--------------------------------------------------------------------------------
[HTML markup lost during extraction; the surviving template text and Jinja expressions are:]
Performance plots for {{ test_id }}
{% if arguments %} Arguments: {{ arguments }} {% endif %}
{% for entry in entries %} Latest value: {{ entry.latest_value }} at {{ latest_time }}. {% endfor %}
--------------------------------------------------------------------------------
/cluster/benchmarks/index.html:
--------------------------------------------------------------------------------
[HTML markup lost during extraction; the surviving visible text is:]
Soumith benchmarks
--------------------------------------------------------------------------------
/cluster/benchmarks/js/csv_benchmark_chart.js:
--------------------------------------------------------------------------------
1 | /**
2 |  * @fileoverview Provides a way to create a mean latency chart based on a
3 |  * csv file with latency data.
4 |  */
5 | 
6 | /**
7 |  * Constructor.
8 |  * @param {string} svg_element_id svg element to add the chart to.
9 |  * @param {string} latency_csv_file File to read input data from. The file
10 |  *     must have lines in the following format:
11 |  *     (Forward|Forward-Backward),timestamp,num_batches,mean,sd
12 |  */
13 | var CsvLatencyChart = function(svg_element_id, latency_csv_file) {
14 |   this.svg_element_id = svg_element_id;
15 |   this.latency_csv_file = latency_csv_file;
16 | };
17 | 
18 | /**
19 |  * Adds data to the given plots.
20 |  */
21 | CsvLatencyChart.prototype.addData_ = function(
22 |     plotForward, plotForwardBackward) {
23 |   d3.text(this.latency_csv_file, function(data) {
24 |     data = d3.csv.parseRows(data);
25 |     const parseDate = d3.time.format('%Y-%m-%d %H:%M:%S').parse;
26 |     let forwardBenchmarks = [];
27 |     let forwardBackwardBenchmarks = [];
28 |     for (var i = 0; i < data.length; i++) {
29 |       const name = data[i][0];
30 |       const timestamp = data[i][1];
31 |       const mean_latency = data[i][3];
32 |       // Timestamp has the format: 2016-08-31 23:38:55.159320
33 |       // However, we can't parse this date format using d3 time
34 |       // functions, so we remove everything after the dot before parsing.
35 |       const dateUpToSeconds = timestamp.split('.')[0]
36 |       if (name == 'Forward') {
37 |         forwardBenchmarks.push(
38 |             {name: name, timestamp: parseDate(dateUpToSeconds),
39 |              mean_latency: +mean_latency});
40 |       } else {
41 |         forwardBackwardBenchmarks.push(
42 |             {name: name, timestamp: parseDate(dateUpToSeconds),
43 |              mean_latency: +mean_latency});
44 |       }
45 |     }
46 |     plotForward.addDataset(
47 |         new Plottable.Dataset(forwardBenchmarks, {name: 'Forward'}));
48 |     plotForwardBackward.addDataset(
49 |         new Plottable.Dataset(
50 |             forwardBackwardBenchmarks, {name: 'Forward-Backward'}));
51 |   });
52 | };
53 | 
54 | /**
55 |  * Create the chart.
56 | */ 57 | CsvLatencyChart.prototype.makeChart = function() { 58 | const xScale = new Plottable.Scales.Time(); 59 | const yScaleForward = new Plottable.Scales.Linear(); 60 | const yScaleForwardBackward = new Plottable.Scales.Linear(); 61 | 62 | const plotForward = new LatencyChart( 63 | 'Forward pass per-batch latency', 'Mean latency (sec)', 64 | xScale, yScaleForward); 65 | const plotForwardBackward = new LatencyChart( 66 | 'Forward-backward pass per-batch latency', 'Mean latency (sec)', 67 | xScale, yScaleForwardBackward); 68 | 69 | this.addData_(plotForward, plotForwardBackward); 70 | 71 | const table = new Plottable.Components.Table([ 72 | [plotForward.table], 73 | [plotForwardBackward.table] 74 | ]); 75 | table.renderTo(this.svg_element_id); 76 | 77 | plotForward.addTooltip(); 78 | plotForwardBackward.addTooltip(); 79 | new Plottable.Interactions.Click() 80 | .attachTo(plotForward.linePlot) 81 | .onClick(function(p) { 82 | plotForward.updateForPosition(p); 83 | plotForwardBackward.updateForPosition(p); 84 | }); 85 | new Plottable.Interactions.Click() 86 | .attachTo(plotForwardBackward.linePlot) 87 | .onClick(function(p) { 88 | plotForward.updateForPosition(p); 89 | plotForwardBackward.updateForPosition(p); 90 | }); 91 | }; 92 | -------------------------------------------------------------------------------- /cluster/benchmarks/scripts/Dockerfile.tf_cnn_benchmarks: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:nightly-gpu 2 | 3 | RUN apt-get update && apt-get install -y python-pip && pip install google-cloud 4 | COPY tf_cnn_benchmarks/ ./tf_cnn_benchmarks/ 5 | RUN touch tf_cnn_benchmarks/__init__.py 6 | RUN mkdir ./util/ 7 | COPY util/ ./util/ 8 | ENTRYPOINT ["python", "-m", "tf_cnn_benchmarks.tf_cnn_benchmarks"] 9 | -------------------------------------------------------------------------------- /cluster/benchmarks/scripts/benchmark_configs.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | # Distributed benchmark configs to run with continuous build. 17 | # For each benchmark, the following properties are supported: 18 | # 19 | # benchmark_name: (required) unique name of the benchmark to run 20 | # args: (optional) argument values to pass to the benchmark. 21 | # env_vars: (optional) environment variables to set for benchmark jobs. 22 | # worker_count: (required) number of worker jobs to run 23 | # ps_count: (required) number of ps jobs to run. 24 | # gpus_per_machine: (optional) number of required gpus per worker 25 | # (currently only supporting <= 1). 26 | # docker_file: (required) docker file to build a docker image for. 27 | # Path to the docker file should be relative to Jenkins build folder. 
28 | #   'benchmarks' github repo will be cloned to 'benchmarks' folder.
29 | #
30 | # Example:
31 | # - benchmark_name: "benchmark_alexnet"
32 | #   args:
33 | #     data_format: "NHWC"
34 | #   worker_count: 1
35 | #   ps_count: 2
36 | #   docker_file: "benchmarks/models/Dockerfile.alexnet_distributed_test"
37 | 
38 | - benchmark_name: "tf_cnn_benchmark_resnet50"
39 |   args:
40 |     data_format: "NHWC"
41 |     model: "resnet50"
42 |     result_storage: "cbuild_benchmark_datastore"
43 |     num_gpus: 8
44 |     local_parameter_device: "cpu"
45 |   worker_count: 2
46 |   ps_count: 2
47 |   gpus_per_machine: 8
48 |   docker_file: "benchmarks/scripts/Dockerfile.tf_cnn_benchmarks"
49 | 
--------------------------------------------------------------------------------
/cluster/benchmarks/scripts/tf_cnn_benchmarks/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yaroslavvb/stuff/a8024ead315aa1b5d6976940b3a062178f0e499d/cluster/benchmarks/scripts/tf_cnn_benchmarks/.DS_Store
--------------------------------------------------------------------------------
/cluster/benchmarks/scripts/tf_cnn_benchmarks/README.md:
--------------------------------------------------------------------------------
1 | # tf_cnn_benchmarks: High performance benchmarks
2 | 
3 | tf_cnn_benchmarks contains implementations of several popular convolutional
4 | models, and is designed to be as fast as possible. tf_cnn_benchmarks supports
5 | running both on a single machine and in distributed mode across multiple
6 | hosts. See the [High-Performance models
7 | guide](https://www.tensorflow.org/performance/performance_models) for more
8 | information.
9 | 
10 | These models utilize many of the strategies in the [TensorFlow Performance
11 | Guide](https://www.tensorflow.org/performance/performance_guide). Benchmark
12 | results can be found [here](https://www.tensorflow.org/performance/benchmarks).
13 | 
14 | These models are designed for performance. For models that have clean and
15 | easy-to-read implementations, see the [TensorFlow Official
16 | Models](https://github.com/tensorflow/models/tree/master/official).
17 | 
18 | ## Getting Started
19 | 
20 | To run ResNet50 on a single GPU with synthetic, undistorted data, run
21 | 
22 | ```
23 | python tf_cnn_benchmarks.py --num_gpus=1 --batch_size=32 --model=resnet50 --variable_update=parameter_server
24 | ```
25 | 
26 | Some important flags are
27 | 
28 | * model: Model to use, e.g. resnet50, inception3, vgg16, and alexnet.
29 | * num_gpus: Number of GPUs to use.
30 | * data_dir: Path to data to process. If not set, synthetic data is used. To
31 |   use Imagenet data use these
32 |   [instructions](https://github.com/tensorflow/models/tree/master/research/inception#getting-started)
33 |   as a starting point.
34 | * batch_size: Batch size for each GPU.
35 | * variable_update: The method for managing variables: parameter_server,
36 |   replicated, distributed_replicated, or independent.
37 | * local_parameter_device: Device to use as parameter server: cpu or gpu.
38 | 
39 | See
40 | [benchmark_cnn.py](https://github.com/tensorflow/benchmarks/blob/master/scripts/tf_cnn_benchmarks/benchmark_cnn.py)
41 | for the full list of flags. The `_DEFAULT_PARAMS` dict in that file contains the
42 | flags.
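For distributed mode, one process is typically launched per cluster role. A sketch of the launch commands follows; the distributed flags are assumed to follow the `job_name`/`ps_hosts`/`worker_hosts`/`task_index` convention described in the benchmarks README earlier in this repo, and the host:port values are placeholders:

```
# Parameter server:
python tf_cnn_benchmarks.py --job_name=ps --task_index=0 \
    --ps_hosts=10.0.0.1:50000 --worker_hosts=10.0.0.2:50001,10.0.0.3:50001

# Worker 0 (repeat with --task_index=1 on the second worker host):
python tf_cnn_benchmarks.py --job_name=worker --task_index=0 \
    --ps_hosts=10.0.0.1:50000 --worker_hosts=10.0.0.2:50001,10.0.0.3:50001 \
    --num_gpus=8 --batch_size=32 --model=resnet50 --variable_update=distributed_replicated
```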
43 | 
-------------------------------------------------------------------------------- /cluster/benchmarks/scripts/tf_cnn_benchmarks/benchmark_storage.py: --------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Provides ways to store benchmark output."""
16 | 
17 | 
18 | def store_benchmark(data, storage_type=None):
19 |   """Store benchmark data.
20 | 
21 |   Args:
22 |     data: Dictionary mapping from string benchmark name to
23 |       numeric benchmark value.
24 |     storage_type: (string) Specifies where to store benchmark
25 |       result. If storage_type is
26 |       'cbuild_benchmark_datastore': store outputs in our continuous
27 |       build datastore. gcloud must be set up in the current environment
28 |       pointing to the project where data will be added.
29 |   """
30 |   if storage_type == 'cbuild_benchmark_datastore':
31 |     try:
32 |       # pylint: disable=g-import-not-at-top
33 |       import cbuild_benchmark_storage
34 |       # pylint: enable=g-import-not-at-top
35 |     except ImportError:
36 |       raise ImportError(
37 |           'Missing cbuild_benchmark_storage.py required for '
38 |           'cbuild_benchmark_datastore option')
39 |     cbuild_benchmark_storage.upload_to_benchmark_datastore(data)
40 |   else:
41 |     assert False, 'unknown storage_type: ' + storage_type
42 | 
-------------------------------------------------------------------------------- /cluster/benchmarks/scripts/tf_cnn_benchmarks/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaroslavvb/stuff/a8024ead315aa1b5d6976940b3a062178f0e499d/cluster/benchmarks/scripts/tf_cnn_benchmarks/models/__init__.py -------------------------------------------------------------------------------- /cluster/benchmarks/scripts/tf_cnn_benchmarks/models/alexnet_model.py: --------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | """Alexnet model configuration.
17 | 
18 | References:
19 |   Krizhevsky, Alex, Ilya Sutskever, and Geoffrey E.
Hinton 20 | ImageNet Classification with Deep Convolutional Neural Networks 21 | Advances in Neural Information Processing Systems. 2012 22 | """ 23 | 24 | import tensorflow as tf 25 | from models import model 26 | 27 | 28 | class AlexnetModel(model.Model): 29 | """Alexnet cnn model.""" 30 | 31 | def __init__(self): 32 | super(AlexnetModel, self).__init__('alexnet', 224 + 3, 512, 0.005) 33 | 34 | def add_inference(self, cnn): 35 | # Note: VALID requires padding the images by 3 in width and height 36 | cnn.conv(64, 11, 11, 4, 4, 'VALID') 37 | cnn.mpool(3, 3, 2, 2) 38 | cnn.conv(192, 5, 5) 39 | cnn.mpool(3, 3, 2, 2) 40 | cnn.conv(384, 3, 3) 41 | cnn.conv(384, 3, 3) 42 | cnn.conv(256, 3, 3) 43 | cnn.mpool(3, 3, 2, 2) 44 | cnn.reshape([-1, 256 * 6 * 6]) 45 | cnn.affine(4096) 46 | cnn.dropout() 47 | cnn.affine(4096) 48 | cnn.dropout() 49 | 50 | 51 | class AlexnetCifar10Model(model.Model): 52 | """Alexnet cnn model for cifar datasets. 53 | 54 | The model architecture follows the one defined in the tensorflow tutorial 55 | model. 56 | 57 | Reference model: tensorflow/models/tutorials/image/cifar10/cifar10.py 58 | Paper: http://www.cs.toronto.edu/~kriz/learning-features-2009-TR.pdf 59 | """ 60 | 61 | def __init__(self): 62 | super(AlexnetCifar10Model, self).__init__('alexnet', 32, 128, 0.1) 63 | 64 | def add_inference(self, cnn): 65 | cnn.conv(64, 5, 5, 1, 1, 'SAME', stddev=5e-2) 66 | cnn.mpool(3, 3, 2, 2, mode='SAME') 67 | cnn.lrn(depth_radius=4, bias=1.0, alpha=0.001 / 9.0, beta=0.75) 68 | cnn.conv(64, 5, 5, 1, 1, 'SAME', bias=0.1, stddev=5e-2) 69 | cnn.lrn(depth_radius=4, bias=1.0, alpha=0.001 / 9.0, beta=0.75) 70 | cnn.mpool(3, 3, 2, 2, mode='SAME') 71 | shape = cnn.top_layer.get_shape().as_list() 72 | flat_dim = shape[1] * shape[2] * shape[3] 73 | cnn.reshape([-1, flat_dim]) 74 | cnn.affine(384, stddev=0.04, bias=0.1) 75 | cnn.affine(192, stddev=0.04, bias=0.1) 76 | 77 | def get_learning_rate(self, global_step, batch_size): 78 | num_examples_per_epoch = 50000 79 | num_epochs_per_decay = 100 80 | decay_steps = int(num_epochs_per_decay * num_examples_per_epoch / 81 | batch_size) 82 | decay_factor = 0.1 83 | return tf.train.exponential_decay( 84 | self.learning_rate, global_step, decay_steps, decay_factor, 85 | staircase=True) 86 | -------------------------------------------------------------------------------- /cluster/benchmarks/scripts/tf_cnn_benchmarks/models/densenet_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Densenet model configuration. 
17 | 18 | References: 19 | "Densely Connected Convolutional Networks": https://arxiv.org/pdf/1608.06993 20 | """ 21 | import numpy as np 22 | from six.moves import xrange # pylint: disable=redefined-builtin 23 | import tensorflow as tf 24 | from models import model as model_lib 25 | 26 | 27 | class DensenetCifar10Model(model_lib.Model): 28 | """Densenet cnn network configuration.""" 29 | 30 | def __init__(self, model, layer_counts, growth_rate): 31 | self.growth_rate = growth_rate 32 | super(DensenetCifar10Model, self).__init__(model, 32, 64, 0.1, 33 | layer_counts=layer_counts) 34 | self.batch_norm_config = {'decay': 0.9, 'epsilon': 1e-5, 'scale': True} 35 | 36 | def dense_block(self, cnn, growth_rate): 37 | input_layer = cnn.top_layer 38 | c = cnn.batch_norm(input_layer, **self.batch_norm_config) 39 | c = tf.nn.relu(c) 40 | c = cnn.conv(growth_rate, 3, 3, 1, 1, stddev=np.sqrt(2.0/9/growth_rate), 41 | activation=None, input_layer=c) 42 | channel_index = 3 if cnn.channel_pos == 'channels_last' else 1 43 | cnn.top_layer = tf.concat([input_layer, c], channel_index) 44 | cnn.top_size += growth_rate 45 | 46 | def transition_layer(self, cnn): 47 | in_size = cnn.top_size 48 | cnn.batch_norm(**self.batch_norm_config) 49 | cnn.top_layer = tf.nn.relu(cnn.top_layer) 50 | cnn.conv(in_size, 1, 1, 1, 1, stddev=np.sqrt(2.0/9/in_size)) 51 | cnn.apool(2, 2, 2, 2) 52 | 53 | def add_inference(self, cnn): 54 | if self.layer_counts is None: 55 | raise ValueError('Layer counts not specified for %s' % self.get_model()) 56 | if self.growth_rate is None: 57 | raise ValueError('Growth rate not specified for %s' % self.get_model()) 58 | 59 | cnn.conv(16, 3, 3, 1, 1, activation=None) 60 | # Block 1 61 | for _ in xrange(self.layer_counts[0]): 62 | self.dense_block(cnn, self.growth_rate) 63 | self.transition_layer(cnn) 64 | # Block 2 65 | for _ in xrange(self.layer_counts[1]): 66 | self.dense_block(cnn, self.growth_rate) 67 | self.transition_layer(cnn) 68 | # Block 3 69 | for _ in xrange(self.layer_counts[2]): 70 | self.dense_block(cnn, self.growth_rate) 71 | cnn.batch_norm(**self.batch_norm_config) 72 | cnn.top_layer = tf.nn.relu(cnn.top_layer) 73 | channel_index = 3 if cnn.channel_pos == 'channels_last' else 1 74 | cnn.top_size = cnn.top_layer.get_shape().as_list()[channel_index] 75 | cnn.spatial_mean() 76 | 77 | def get_learning_rate(self, global_step, batch_size): 78 | num_batches_per_epoch = int(50000 / batch_size) 79 | boundaries = num_batches_per_epoch * np.array([150, 225, 300], 80 | dtype=np.int64) 81 | boundaries = [x for x in boundaries] 82 | values = [0.1, 0.01, 0.001, 0.0001] 83 | return tf.train.piecewise_constant(global_step, boundaries, values) 84 | -------------------------------------------------------------------------------- /cluster/benchmarks/scripts/tf_cnn_benchmarks/models/googlenet_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Googlenet model configuration. 17 | 18 | References: 19 | Szegedy, Christian, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, 20 | Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, and Andrew Rabinovich 21 | Going deeper with convolutions 22 | arXiv preprint arXiv:1409.4842 (2014) 23 | """ 24 | 25 | from models import model 26 | 27 | 28 | class GooglenetModel(model.Model): 29 | 30 | def __init__(self): 31 | super(GooglenetModel, self).__init__('googlenet', 224, 32, 0.005) 32 | 33 | def add_inference(self, cnn): 34 | def inception_v1(cnn, k, l, m, n, p, q): 35 | cols = [[('conv', k, 1, 1)], [('conv', l, 1, 1), ('conv', m, 3, 3)], 36 | [('conv', n, 1, 1), ('conv', p, 5, 5)], 37 | [('mpool', 3, 3, 1, 1, 'SAME'), ('conv', q, 1, 1)]] 38 | cnn.inception_module('incept_v1', cols) 39 | 40 | cnn.conv(64, 7, 7, 2, 2) 41 | cnn.mpool(3, 3, 2, 2, mode='SAME') 42 | cnn.conv(64, 1, 1) 43 | cnn.conv(192, 3, 3) 44 | cnn.mpool(3, 3, 2, 2, mode='SAME') 45 | inception_v1(cnn, 64, 96, 128, 16, 32, 32) 46 | inception_v1(cnn, 128, 128, 192, 32, 96, 64) 47 | cnn.mpool(3, 3, 2, 2, mode='SAME') 48 | inception_v1(cnn, 192, 96, 208, 16, 48, 64) 49 | inception_v1(cnn, 160, 112, 224, 24, 64, 64) 50 | inception_v1(cnn, 128, 128, 256, 24, 64, 64) 51 | inception_v1(cnn, 112, 144, 288, 32, 64, 64) 52 | inception_v1(cnn, 256, 160, 320, 32, 128, 128) 53 | cnn.mpool(3, 3, 2, 2, mode='SAME') 54 | inception_v1(cnn, 256, 160, 320, 32, 128, 128) 55 | inception_v1(cnn, 384, 192, 384, 48, 128, 128) 56 | cnn.apool(7, 7, 1, 1, mode='VALID') 57 | cnn.reshape([-1, 1024]) 58 | -------------------------------------------------------------------------------- /cluster/benchmarks/scripts/tf_cnn_benchmarks/models/lenet_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Lenet model configuration. 
17 | 18 | References: 19 | LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick Haffner 20 | Gradient-based learning applied to document recognition 21 | Proceedings of the IEEE (1998) 22 | """ 23 | 24 | from models import model 25 | 26 | 27 | class Lenet5Model(model.Model): 28 | 29 | def __init__(self): 30 | super(Lenet5Model, self).__init__('lenet5', 28, 32, 0.005) 31 | 32 | def add_inference(self, cnn): 33 | # Note: This matches TF's MNIST tutorial model 34 | cnn.conv(32, 5, 5) 35 | cnn.mpool(2, 2) 36 | cnn.conv(64, 5, 5) 37 | cnn.mpool(2, 2) 38 | cnn.reshape([-1, 64 * 7 * 7]) 39 | cnn.affine(512) 40 | -------------------------------------------------------------------------------- /cluster/benchmarks/scripts/tf_cnn_benchmarks/models/model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Base model configuration for CNN benchmarks.""" 16 | 17 | 18 | class Model(object): 19 | """Base model configuration for CNN benchmarks.""" 20 | 21 | def __init__(self, 22 | model, 23 | image_size, 24 | batch_size, 25 | learning_rate, 26 | layer_counts=None, 27 | fp16_loss_scale=128): 28 | self.model = model 29 | self.image_size = image_size 30 | self.batch_size = batch_size 31 | self.default_batch_size = batch_size 32 | self.learning_rate = learning_rate 33 | self.layer_counts = layer_counts 34 | # TODO(reedwm) Set custom loss scales for each model instead of using the 35 | # default of 128. 36 | self.fp16_loss_scale = fp16_loss_scale 37 | 38 | def get_model(self): 39 | return self.model 40 | 41 | def get_image_size(self): 42 | return self.image_size 43 | 44 | def get_batch_size(self): 45 | return self.batch_size 46 | 47 | def set_batch_size(self, batch_size): 48 | self.batch_size = batch_size 49 | 50 | def get_default_batch_size(self): 51 | return self.default_batch_size 52 | 53 | def get_layer_counts(self): 54 | return self.layer_counts 55 | 56 | def get_fp16_loss_scale(self): 57 | return self.fp16_loss_scale 58 | 59 | def get_learning_rate(self, global_step, batch_size): 60 | del global_step 61 | del batch_size 62 | return self.learning_rate 63 | 64 | def add_inference(self, unused_cnn): 65 | raise ValueError('Must be implemented in derived classes') 66 | -------------------------------------------------------------------------------- /cluster/benchmarks/scripts/tf_cnn_benchmarks/models/overfeat_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Overfeat model configuration. 17 | 18 | References: 19 | OverFeat: Integrated Recognition, Localization and Detection using 20 | Convolutional Networks 21 | Pierre Sermanet, David Eigen, Xiang Zhang, Michael Mathieu, Rob Fergus, 22 | Yann LeCun, 2014 23 | http://arxiv.org/abs/1312.6229 24 | """ 25 | 26 | from models import model 27 | 28 | 29 | class OverfeatModel(model.Model): 30 | 31 | def __init__(self): 32 | super(OverfeatModel, self).__init__('overfeat', 231, 32, 0.005) 33 | 34 | def add_inference(self, cnn): 35 | # Note: VALID requires padding the images by 3 in width and height 36 | cnn.conv(96, 11, 11, 4, 4, mode='VALID') 37 | cnn.mpool(2, 2) 38 | cnn.conv(256, 5, 5, 1, 1, mode='VALID') 39 | cnn.mpool(2, 2) 40 | cnn.conv(512, 3, 3) 41 | cnn.conv(1024, 3, 3) 42 | cnn.conv(1024, 3, 3) 43 | cnn.mpool(2, 2) 44 | cnn.reshape([-1, 1024 * 6 * 6]) 45 | cnn.affine(3072) 46 | cnn.affine(4096) 47 | -------------------------------------------------------------------------------- /cluster/benchmarks/scripts/tf_cnn_benchmarks/models/trivial_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Trivial model configuration.""" 16 | 17 | from models import model 18 | 19 | 20 | class TrivialModel(model.Model): 21 | """Trivial model configuration.""" 22 | 23 | def __init__(self): 24 | super(TrivialModel, self).__init__('trivial', 224 + 3, 32, 0.005) 25 | 26 | def add_inference(self, cnn): 27 | cnn.reshape([-1, 227 * 227 * 3]) 28 | cnn.affine(1) 29 | cnn.affine(4096) 30 | -------------------------------------------------------------------------------- /cluster/benchmarks/scripts/tf_cnn_benchmarks/models/vgg_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Vgg model configuration. 17 | 18 | Includes multiple models: vgg11, vgg16, vgg19, corresponding to 19 | model A, D, and E in Table 1 of [1]. 20 | 21 | References: 22 | [1] Simonyan, Karen, Andrew Zisserman 23 | Very Deep Convolutional Networks for Large-Scale Image Recognition 24 | arXiv:1409.1556 (2014) 25 | """ 26 | 27 | from six.moves import xrange # pylint: disable=redefined-builtin 28 | from models import model 29 | 30 | 31 | def _construct_vgg(cnn, num_conv_layers): 32 | """Build vgg architecture from blocks.""" 33 | assert len(num_conv_layers) == 5 34 | for _ in xrange(num_conv_layers[0]): 35 | cnn.conv(64, 3, 3) 36 | cnn.mpool(2, 2) 37 | for _ in xrange(num_conv_layers[1]): 38 | cnn.conv(128, 3, 3) 39 | cnn.mpool(2, 2) 40 | for _ in xrange(num_conv_layers[2]): 41 | cnn.conv(256, 3, 3) 42 | cnn.mpool(2, 2) 43 | for _ in xrange(num_conv_layers[3]): 44 | cnn.conv(512, 3, 3) 45 | cnn.mpool(2, 2) 46 | for _ in xrange(num_conv_layers[4]): 47 | cnn.conv(512, 3, 3) 48 | cnn.mpool(2, 2) 49 | cnn.reshape([-1, 512 * 7 * 7]) 50 | cnn.affine(4096) 51 | cnn.dropout() 52 | cnn.affine(4096) 53 | cnn.dropout() 54 | 55 | 56 | class Vgg11Model(model.Model): 57 | 58 | def __init__(self): 59 | super(Vgg11Model, self).__init__('vgg11', 224, 64, 0.005) 60 | 61 | def add_inference(self, cnn): 62 | _construct_vgg(cnn, [1, 1, 2, 2, 2]) 63 | 64 | 65 | class Vgg16Model(model.Model): 66 | 67 | def __init__(self): 68 | super(Vgg16Model, self).__init__('vgg16', 224, 64, 0.005) 69 | 70 | def add_inference(self, cnn): 71 | _construct_vgg(cnn, [2, 2, 3, 3, 3]) 72 | 73 | 74 | class Vgg19Model(model.Model): 75 | 76 | def __init__(self): 77 | super(Vgg19Model, self).__init__('vgg19', 224, 64, 0.005) 78 | 79 | def add_inference(self, cnn): 80 | _construct_vgg(cnn, [2, 2, 4, 4, 4]) 81 | -------------------------------------------------------------------------------- /cluster/benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Benchmark script for TensorFlow. 17 | 18 | See the README for more information. 
19 | """ 20 | 21 | from __future__ import print_function 22 | 23 | 24 | import tensorflow as tf 25 | 26 | import benchmark_cnn 27 | import cnn_util 28 | from cnn_util import log_fn 29 | 30 | benchmark_cnn.define_flags() 31 | 32 | 33 | def main(extra_flags): 34 | # extra_flags is a list of command line arguments, excluding those defined 35 | # in tf.flags.FLAGS. extra_flags[0] is always the program name. It is an error 36 | # to supply flags not defined with tf.flags.FLAGS, so we raise an ValueError 37 | # in that case. 38 | assert len(extra_flags) >= 1 39 | if len(extra_flags) > 1: 40 | raise ValueError('Received unknown flags: %s' % extra_flags[1:]) 41 | 42 | params = benchmark_cnn.make_params_from_flags() 43 | benchmark_cnn.setup(params) 44 | bench = benchmark_cnn.BenchmarkCNN(params) 45 | 46 | tfversion = cnn_util.tensorflow_version_tuple() 47 | log_fn('TensorFlow: %i.%i' % (tfversion[0], tfversion[1])) 48 | 49 | bench.print_info() 50 | bench.run() 51 | 52 | 53 | if __name__ == '__main__': 54 | tf.app.run() 55 | -------------------------------------------------------------------------------- /cluster/benchmarks/scripts/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaroslavvb/stuff/a8024ead315aa1b5d6976940b3a062178f0e499d/cluster/benchmarks/scripts/util/__init__.py -------------------------------------------------------------------------------- /cluster/benchmarks/scripts/util/benchmark_util_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ==============================================================================
15 | """Tests for benchmark_util."""
16 | 
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 | 
21 | import datetime
22 | import json
23 | import os
24 | import tempfile
25 | import unittest
26 | 
27 | import benchmark_util
28 | 
29 | 
30 | class BenchmarkUtilTest(unittest.TestCase):
31 | 
32 |   def testStoreDataWithNoEntries(self):
33 |     with tempfile.NamedTemporaryFile() as temp_file:
34 |       timing_entries = []
35 |       benchmark_util.store_data_in_json(
36 |           timing_entries, datetime.date(2017, 1, 1), temp_file.name)
37 |       with open(temp_file.name, 'r') as json_file:
38 |         json_output = json.loads(json_file.read())
39 |         self.assertEqual('TestBenchmark', json_output['name'])
40 |         self.assertEqual(u'1483228800', json_output['startTime'])
41 | 
42 |   def testStoreDataWithEntries(self):
43 |     with tempfile.NamedTemporaryFile() as temp_file:
44 |       timing_entries = [benchmark_util.StatEntry('test', 0.1, 1)]
45 |       benchmark_util.store_data_in_json(
46 |           timing_entries, datetime.date(2017, 1, 1), temp_file.name)
47 | 
48 |       with open(temp_file.name, 'r') as json_file:
49 |         json_output = json.loads(json_file.read())
50 |         self.assertEqual(1, len(json_output['entries']['entry']))
51 |         self.assertEqual('test', json_output['entries']['entry'][0]['name'])
52 |         self.assertEqual(0.1, json_output['entries']['entry'][0]['wallTime'])
53 |         self.assertEqual(u'1', json_output['entries']['entry'][0]['iters'])
54 |         self.assertEqual(u'1483228800', json_output['startTime'])
55 |         self.assertEqual('TestBenchmark', json_output['name'])
56 | 
57 | 
58 | if __name__ == '__main__':
59 |   unittest.main()
60 | 
-------------------------------------------------------------------------------- /cluster/benchmarks/scripts/util/convert_csv_to_json.py: --------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Convert CSV benchmark data to JSON format.
16 | 
17 | CSV benchmark data has the format:
18 | Description,timestamp,num_batches,time mean value,time sd
19 | 
20 | JSON benchmark data is in the format of the TestResults proto
21 | converted to JSON.
22 | https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/util/test_log.proto.
23 | """
24 | import argparse
25 | import csv
26 | from datetime import datetime
27 | 
28 | import benchmark_util
29 | 
30 | 
31 | def get_data_from_csv(csv_reader):
32 |   """Creates a list of StatEntry objects from rows of CSV data.
33 | 
34 |   Input CSV data must be in the format:
35 |   Description,timestamp,num_batches,time mean value,time sd
36 | 
37 |   Args:
38 |     csv_reader: csv.reader instance.
39 | 
40 |   Returns:
41 |     A tuple of datetime timestamp and list of benchmark_util.StatEntry objects.
42 | 43 | Raises: 44 | ValueError: if CSV is invalid. 45 | """ 46 | timestamp = None 47 | stat_entries = [] 48 | 49 | for row in csv_reader: 50 | if len(row) != 5: 51 | raise ValueError('Expected 5 entries per line in the input CSV file, ' 52 | 'but found %d entries.' % len(row)) 53 | if '' in row: 54 | raise ValueError('Found empty entries in row: %s' % row) 55 | 56 | # Set timestamp based on the first line in CSV file. 57 | if timestamp is None: 58 | # Example of time formatting: 2017-06-26 02:59:29.325579 59 | timestamp = datetime.strptime(row[1], "%Y-%m-%d %H:%M:%S.%f") 60 | stat_entries.append( 61 | benchmark_util.StatEntry(row[0], float(row[3]), 1)) 62 | return timestamp, stat_entries 63 | 64 | 65 | def main(): 66 | with open(FLAGS.input_csv_file, 'r') as csvfile: 67 | csv_reader = csv.reader(csvfile) 68 | timestamp, stat_entries = get_data_from_csv(csv_reader) 69 | benchmark_util.store_data_in_json( 70 | stat_entries, timestamp, 71 | output_file=FLAGS.output_json_file, 72 | test_name=FLAGS.test_name) 73 | 74 | 75 | if __name__ == '__main__': 76 | parser = argparse.ArgumentParser() 77 | parser.register( 78 | 'type', 'bool', lambda v: v.lower() in ('true', 't', 'y', 'yes')) 79 | parser.add_argument( 80 | '--test_name', type=str, default=None, required=True, 81 | help='Name of the test.') 82 | parser.add_argument( 83 | '--input_csv_file', type=str, default=None, required=True, 84 | help='Path to the CSV file.') 85 | parser.add_argument( 86 | '--output_json_file', type=str, default=None, required=True, 87 | help='Path to output JSON file.') 88 | FLAGS, _ = parser.parse_known_args() 89 | main() 90 | 91 | -------------------------------------------------------------------------------- /cluster/benchmarks/scripts/util/convert_csv_to_json_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ==============================================================================
15 | """Tests for convert_csv_to_json."""
16 | import csv
17 | import datetime
18 | import unittest
19 | 
20 | import convert_csv_to_json
21 | 
22 | 
23 | class ConvertCsvToJsonTest(unittest.TestCase):
24 | 
25 |   def testSingleEntryCSV(self):
26 |     # Description,timestamp,num_batches,time mean value,time sd
27 |     csv_reader = csv.reader(
28 |         ['abc,2017-06-26 02:59:29.325579,10,2.15,0.1'])
29 |     timestamp, stat_entries = convert_csv_to_json.get_data_from_csv(csv_reader)
30 |     self.assertEqual(
31 |         datetime.datetime(2017, 06, 26, 2, 59, 29, 325579),
32 |         timestamp)
33 |     self.assertEqual(1, len(stat_entries))
34 |     self.assertEqual('abc', stat_entries[0].name)
35 |     self.assertEqual(2.15, stat_entries[0].stat_value)
36 | 
37 |   def testTwoEntryCSV(self):
38 |     # Description,timestamp,num_batches,time mean value,time sd
39 |     csv_reader = csv.reader(
40 |         ['abc,2017-06-26 02:59:35.425579,10,2.15,0.1',
41 |          'def,2017-06-26 02:59:29.325579,10,10.1,0.1'])
42 |     timestamp, stat_entries = convert_csv_to_json.get_data_from_csv(csv_reader)
43 |     self.assertEqual(
44 |         datetime.datetime(2017, 06, 26, 2, 59, 35, 425579),
45 |         timestamp)
46 |     self.assertEqual(2, len(stat_entries))
47 |     self.assertEqual('abc', stat_entries[0].name)
48 |     self.assertEqual(2.15, stat_entries[0].stat_value)
49 |     self.assertEqual('def', stat_entries[1].name)
50 |     self.assertEqual(10.1, stat_entries[1].stat_value)
51 | 
52 |   def testInvalidCSV_LessEntries(self):
53 |     csv_reader = csv.reader(
54 |         ['abc,2017-06-26 02:59:29.325579,10,2.15'])
55 |     with self.assertRaises(ValueError):
56 |       timestamp, stat_entries = convert_csv_to_json.get_data_from_csv(
57 |           csv_reader)
58 | 
59 |   def testInvalidCSV_MoreEntries(self):
60 |     csv_reader = csv.reader(
61 |         ['abc,2017-06-26 02:59:29.325579,10,2.15,0.1,extra_entry'])
62 |     with self.assertRaises(ValueError):
63 |       timestamp, stat_entries = convert_csv_to_json.get_data_from_csv(
64 |           csv_reader)
65 | 
66 |   def testInvalidCSV_EmptyEntry(self):
67 |     csv_reader = csv.reader(
68 |         [',2017-06-26 02:59:29.325579,10,2.15,0.1'])
69 |     with self.assertRaises(ValueError):
70 |       timestamp, stat_entries = convert_csv_to_json.get_data_from_csv(
71 |           csv_reader)
72 | 
73 |   def testInvalidCSV_InvalidDate(self):
74 |     csv_reader = csv.reader(['abc,invaliddate,10,2.15,0.1'])
75 |     with self.assertRaises(ValueError):
76 |       timestamp, stat_entries = convert_csv_to_json.get_data_from_csv(
77 |           csv_reader)
78 | 
79 |   def testInvalidCSV_InvalidValue(self):
80 |     csv_reader = csv.reader(
81 |         ['abc,2017-06-26 02:59:29.325579,10,invalidfloat,0.1'])
82 |     with self.assertRaises(ValueError):
83 |       timestamp, stat_entries = convert_csv_to_json.get_data_from_csv(
84 |           csv_reader)
85 | 
86 | 
87 | if __name__ == '__main__':
88 |   unittest.main()
89 | 
-------------------------------------------------------------------------------- /cluster/benchmarks/soumith_benchmarks.html: --------------------------------------------------------------------------------
[soumith_benchmarks.html: HTML/JS page whose markup was stripped during extraction; the body held four chart containers titled Alexnet, Googlenet, Overfeat, and VGG, plus the scripts that render them]
-------------------------------------------------------------------------------- /cluster/benchmarks/tools/kubectl_util_test.py: --------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Tests for kubectl_util."""
16 | 
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 | 
21 | import mock
22 | import subprocess
23 | import unittest
24 | 
25 | import kubectl_util
26 | 
27 | 
28 | kubectl_util.WAIT_PERIOD_SECONDS = 1
29 | 
30 | 
31 | class KubectlUtilTest(unittest.TestCase):
32 | 
33 |   @mock.patch.object(subprocess, 'check_output')
34 |   @mock.patch.object(subprocess, 'check_call')
35 |   def testCreatePods(self, mock_check_call, mock_check_output):
36 |     mock_check_output.return_value = 'nonempty'
37 |     kubectl_util.CreatePods('test_pod', 'test.yaml')
38 |     mock_check_call.assert_called_once_with(
39 |         ['kubectl', 'create', '--filename=test.yaml'])
40 |     mock_check_output.assert_called_once_with(
41 |         ['kubectl', 'get', 'pods', '-o', 'name', '-a', '-l',
42 |          'name-prefix in (test_pod)'], universal_newlines=True)
43 | 
44 |   @mock.patch.object(subprocess, 'check_output')
45 |   @mock.patch.object(subprocess, 'call')
46 |   def testDeletePods(self, mock_check_call, mock_check_output):
47 |     mock_check_output.return_value = ''
48 |     kubectl_util.DeletePods('test_pod', 'test.yaml')
49 |     mock_check_call.assert_called_once_with(
50 |         ['kubectl', 'delete', '--filename=test.yaml'])
51 |     mock_check_output.assert_called_once_with(
52 |         ['kubectl', 'get', 'pods', '-o', 'name', '-a', '-l',
53 |          'name-prefix in (test_pod)'], universal_newlines=True)
54 | 
55 |   @mock.patch.object(subprocess, 'check_output')
56 |   def testWaitForCompletion(self, mock_check_output):
57 |     # Test success
58 |     mock_check_output.return_value = '\'0,0,\''
59 |     self.assertTrue(kubectl_util.WaitForCompletion('test_pod'))
60 | 
61 |     # Test failure
62 |     mock_check_output.return_value = '\'0,1,\''
63 |     self.assertFalse(kubectl_util.WaitForCompletion('test_pod'))
64 | 
65 |     # Test timeout
66 |     with self.assertRaises(kubectl_util.TimeoutError):
67 |       mock_check_output.return_value = '\'0,,\''
68 |       kubectl_util.WaitForCompletion('test_pod', timeout=5)
69 | 
70 | 
71 | if __name__ == '__main__':
72 |   unittest.main()
73 | 
-------------------------------------------------------------------------------- /cluster/cloud-formation-example/README.md: --------------------------------------------------------------------------------
1 | # TensorFlow
2 | 
3 | 
4 | Create Stack:
5 | ```
6 | aws --region ap-southeast-2 cloudformation create-stack --stack-name tensorflow --template-body file://tensorflow.yaml --parameters ParameterKey=KeyName,ParameterValue=[KeyName]
7 | ```
8 | 
9 | Update Stack:
10 | ```
11 | aws --region ap-southeast-2 cloudformation update-stack --stack-name tensorflow --template-body file://tensorflow.yaml --parameters ParameterKey=KeyName,ParameterValue=[KeyName] 12 | ``` 13 | 14 | Delete Stack: 15 | ``` 16 | aws --region ap-southeast-2 cloudformation delete-stack --stack-name tensorflow 17 | ``` 18 | 19 | Describe Stack: 20 | ``` 21 | aws --region ap-southeast-2 cloudformation describe-stacks --stack-name tensorflow 22 | ``` 23 | 24 | # Create DNS zone distributed.tensorflow. 25 | bash -x zone.sh create distributed.tensorflow. ap-southeast-2 vpc-9e314bfa 26 | # Launch cluster with CloudFormation 27 | aws --region ap-southeast-2 cloudformation create-stack --stack-name tensorflow --template-body file://tensorflow.yaml --parameters ParameterKey=KeyName,ParameterValue=ytang ParameterKey=SubnetId,ParameterValue=subnet-8eaba9ea ParameterKey=VPC,ParameterValue=vpc-9e314bfa 28 | # Destroy cluster with CloudFormation 29 | aws --region ap-southeast-2 cloudformation delete-stack --stack-name tensorflow 30 | # Delete DNS zone distributed.tensorflow. 31 | bash -x zone.sh delete distributed.tensorflow. ap-southeast-2 vpc-9e314bfa 32 | -------------------------------------------------------------------------------- /cluster/cloud-formation-example/iam.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: TensorFlow IamInstanceProfile CloudFormation 3 | Parameters: 4 | InstanceProfileName: 5 | Type: String 6 | Default: TensorFlowCloudFormation 7 | Resources: 8 | Role: 9 | Type: AWS::IAM::Role 10 | Properties: 11 | AssumeRolePolicyDocument: 12 | Version: 2012-10-17 13 | Statement: 14 | Effect: Allow 15 | Principal: 16 | Service: 17 | - ec2.amazonaws.com 18 | Action: 19 | - sts:AssumeRole 20 | Policies: 21 | Type: AWS::IAM::Policy 22 | Properties: 23 | PolicyDocument: 24 | Version: 2012-10-17 25 | Statement: 26 | - 27 | Effect: "Allow" 28 | Action: 29 | - "ec2:AssociateAddress" 30 | - "ec2:DisassociateAddress" 31 | Resource: "*" 32 | PolicyName: !Join [ "-", [ !Ref "AWS::StackName", "Policies" ] ] 33 | Roles: 34 | - !Ref Role 35 | InstanceProfile: 36 | Type: AWS::IAM::InstanceProfile 37 | Properties: 38 | Roles: 39 | - !Ref Role 40 | InstanceProfileName: !Ref InstanceProfileName 41 | -------------------------------------------------------------------------------- /cluster/cloud-formation-example/zone.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | set -e 3 | 4 | option=$1 5 | Name=$2 6 | Region=$3 7 | VPC=$4 8 | 9 | if [[ "$option" == "create" ]]; then 10 | aws --region $Region route53 create-hosted-zone --name $Name --vpc VPCRegion=$Region,VPCId=$VPC --caller-reference $Name.$(date "+%F-%T") 11 | exit 0 12 | elif [[ "$option" == "delete" ]]; then 13 | HostedZoneId=$(aws --region $Region route53 list-hosted-zones --query "HostedZones[?Name == '$Name'].Id" --output text | sed 's/\/hostedzone\///g') 14 | if [[ ! 
-z $HostedZoneId ]]; then
15 |     aws --region $Region route53 list-resource-record-sets \
16 |       --hosted-zone-id $HostedZoneId |
17 |     jq -c '.ResourceRecordSets[]' |
18 |     while read -r resourcerecordset; do
19 |       read -r name type <<<$(echo $(jq -r '.Name,.Type' <<<"$resourcerecordset"))
20 |       if [ $type != "NS" -a $type != "SOA" ]; then
21 |         aws --region $Region route53 change-resource-record-sets \
22 |           --hosted-zone-id $HostedZoneId \
23 |           --change-batch '{"Changes":[{"Action":"DELETE","ResourceRecordSet": '"$resourcerecordset"' }]}' \
24 |           --output text --query 'ChangeInfo.Id'
25 |       fi
26 |     done
27 |     aws --region $Region route53 delete-hosted-zone --id $HostedZoneId
28 |   fi
29 |   exit 0
30 | else
31 |   exit 1
32 | fi
33 | 
-------------------------------------------------------------------------------- /cluster/connect: -------------------------------------------------------------------------------- 1 | connect.py -------------------------------------------------------------------------------- /cluster/connect.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | 
4 | Script to connect to the most recent instance containing given fragment:
5 | Usage:
6 |   connect
7 |     -- connects to most recently launched instance
8 |   connect i3
9 |     -- connects to most recently launched instance containing i3 in instance id
10 | 
11 | 
12 | Debugging/exploring:
13 | 
14 | python
15 | from pprint import pprint
16 | import boto3
17 | ec2 = boto3.client('ec2')
18 | response = ec2.describe_instances()
19 | reservation=response['Reservations'][0]
20 | instance = reservation['Instances'][0]
21 | pprint(instance)
22 | """
23 | 
24 | # todo: allow to do ls, show tags
25 | # todo: handle KeyError: 'PublicIpAddress'
26 | 
27 | import boto3
28 | import time
29 | import sys
30 | import os
31 | from datetime import datetime
32 | from operator import itemgetter
33 | 
34 | 
35 | def toseconds(dt):
36 |   # to invert:
37 |   # import pytz
38 |   # utc = pytz.UTC
39 |   # utc.localize(datetime.fromtimestamp(seconds))
40 |   return time.mktime(dt.utctimetuple())
41 | 
42 | def main():
43 |   fragment = ''
44 |   if len(sys.argv)>1:
45 |     fragment = sys.argv[1]
46 | 
47 |   ec2 = boto3.client('ec2')
48 |   response = ec2.describe_instances()
49 | 
50 |   instance_list = []
51 |   for reservation in response['Reservations']:
52 |     for instance in reservation['Instances']:
53 |       instance_list.append((toseconds(instance['LaunchTime']), instance))
54 | 
55 |   import pytz
56 |   from tzlocal import get_localzone  # $ pip install tzlocal
57 | 
58 |   sorted_instance_list = sorted(instance_list, key=itemgetter(0))
59 |   cmd = ''
60 |   for (ts, instance) in reversed(sorted_instance_list):
61 |     if fragment in instance['InstanceId']:
62 | 
63 |       localtime = instance['LaunchTime'].astimezone(get_localzone())
64 |       keyname = instance.get('KeyName','none')
65 |       print("Connecting to %s launched at %s with key %s" % (instance['InstanceId'], localtime, keyname))
66 |       cmd = "ssh -i $HOME/Dropbox/yaroslav.pem -o StrictHostKeyChecking=no ubuntu@"+instance['PublicIpAddress']
67 |       break
68 |   if not cmd:
69 |     print("no instance id contains fragment '%s'"%(fragment,))
70 |   else:
71 |     print(cmd)
72 |     os.system(cmd)
73 | 
74 | 
75 | 
76 | if __name__=='__main__':
77 |   main()
78 | 
-------------------------------------------------------------------------------- /cluster/delete_placement_groups.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # delete all placement groups
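# (Usage sketch, an editorial assumption rather than part of the original
# script: run `python delete_placement_groups.py` with AWS credentials
# configured for boto3; it deletes every placement group in the default
# region and prints the HTTP status code of each delete call.)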
4 | 
5 | import boto3
6 | 
7 | # {'PlacementGroups': [{'GroupName': 'gpu12',
8 | #                       'State': 'available',
9 | #                       'Strategy': 'cluster'},
10 | #   {'GroupName': 'gpu6', 'State': 'available', 'Strategy': 'cluster'},
11 | #   {'GroupName': 'gpu10', 'State': 'available', 'Strategy': 'cluster'},
12 | #   {'GroupName': 'gpu4', 'State': 'available', 'Strategy': 'cluster'},
13 | #   {'GroupName': 'cnn2', 'State': 'available', 'Strategy': 'cluster'},
14 | #   {'GroupName': 'gpu5', 'State': 'available', 'Strategy': 'cluster'},
15 | #   {'GroupName': 'gpu3', 'State': 'available', 'Strategy': 'cluster'},
16 | #   {'GroupName': 'tf', 'State': 'available', 'Strategy': 'cluster'},
17 | #   {'GroupName': 'gpu7', 'State': 'available', 'Strategy': 'cluster'},
18 | #   {'GroupName': 'gpu11', 'State': 'available', 'Strategy': 'cluster'},
19 | #   {'GroupName': 'gpu8', 'State': 'available', 'Strategy': 'cluster'},
20 | #   {'GroupName': 'gpu9', 'State': 'available', 'Strategy': 'cluster'},
21 | #   {'GroupName': 'cnn', 'State': 'available', 'Strategy': 'cluster'}],
22 | #  'ResponseMetadata': {'HTTPHeaders': {'content-type': 'text/xml;charset=UTF-8',
23 | #                                       'date': 'Tue, 28 Nov 2017 18:52:18 GMT',
24 | #                                       'server': 'AmazonEC2',
25 | #                                       'transfer-encoding': 'chunked',
26 | #                                       'vary': 'Accept-Encoding'},
27 | #                      'HTTPStatusCode': 200,
28 | #                      'RequestId': '3d7adfe7-1109-413d-9aab-2f0aeafef968',
29 | #                      'RetryAttempts': 0}}
30 | 
31 | 
32 | ec2 = boto3.client('ec2')
33 | 
34 | result=ec2.describe_placement_groups()
35 | #print(result)
36 | for entry in result["PlacementGroups"]:
37 |   name = entry.get('GroupName', '---')
38 |   try:
39 |     print("Deleting "+name)
40 |     response = ec2.delete_placement_group(GroupName=name)
41 |     print("Response was %d" %(response['ResponseMetadata']['HTTPStatusCode']))
42 |   except Exception as e:
43 |     print("Failed with %s"%(e,))
44 | 
-------------------------------------------------------------------------------- /cluster/fill_efs.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import numpy as np
4 | import math
5 | import argparse
6 | 
7 | parser = argparse.ArgumentParser(description='script to fill EFS with data')
8 | 
9 | parser.add_argument('--gb', type=int, default=100, metavar='N',
10 |                     help='how many GBs to dump')
11 | parser.add_argument('--chunk_gb', type=int, default=1, metavar='N',
12 |                     help='size of each chunk in GBs')
13 | parser.add_argument('--fn', type=str, default="fill", metavar='N',
14 |                     help='filename')
15 | args = parser.parse_args()
16 | 
17 | def main():
18 |   chunk_size = int(args.chunk_gb*1e9)  # np.random.bytes requires an int length
19 |   current_size = 0
20 | 
21 |   file_counter = 0
22 |   max_file_counter = int(math.ceil(args.gb/args.chunk_gb))
23 |   while current_size < args.gb*1e9:
24 |     fn = args.fn+"-%05d-of-%05d"%(file_counter, max_file_counter)
25 |     file_counter+=1
26 |     with open(fn, 'wb') as out:
27 |       out.write(np.random.bytes(chunk_size))
28 |     print("Wrote %5.1f GBs"%(current_size/1e9))
29 |     current_size+=chunk_size
30 | 
31 | if __name__=='__main__':
32 |   main()
33 | 
-------------------------------------------------------------------------------- /cluster/imagenet64/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3 2 | paramiko 3 | pyyaml 4 | tensorflow-gpu==1.4 5 | -------------------------------------------------------------------------------- /cluster/instance_info.py: --------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | 
3 | import boto3
4 | 
5 | """
6 | A tool for retrieving basic information from the running EC2 instances.
7 | """
8 | 
9 | # Connect to EC2
10 | ec2 = boto3.resource('ec2')
11 | 
12 | # Get information for all running instances
13 | running_instances = ec2.instances.filter(Filters=[{
14 |     'Name': 'instance-state-name',
15 |     'Values': ['running']}])
16 | 
17 | ec2info = defaultdict()
18 | for instance in running_instances:
19 |   for tag in instance.tags or []:
20 |     if 'Name' in tag['Key']:
21 |       name = tag['Value']
22 |   if name != 'tf':
23 |     continue
24 |   # Add instance info to a dictionary
25 |   ec2info[instance.id] = {
26 |       'Name': name,
27 |       'Type': instance.instance_type,
28 |       'State': instance.state['Name'],
29 |       'Private IP': instance.private_ip_address,
30 |       'Public IP': instance.public_ip_address,
31 |       'Launch Time': instance.launch_time
32 |   }
33 | 
34 | attributes = ['Name', 'Type', 'State', 'Private IP', 'Public IP', 'Launch Time']
35 | for instance_id, instance in ec2info.items():
36 |   for key in attributes:
37 |     print("{0}: {1}".format(key, instance[key]))
38 |   print("------")
39 | 
40 | 
-------------------------------------------------------------------------------- /cluster/launch_simple_tf.py: --------------------------------------------------------------------------------
1 | # simple example of launching tensorflow job
2 | 
3 | import time
4 | import tensorflow as tf
5 | 
6 | flags = tf.flags
7 | flags.DEFINE_string("role", "launcher", "either launcher or worker")
8 | flags.DEFINE_integer("data_mb", 128, "size of vector in MBs")
9 | flags.DEFINE_integer("iters_per_step", 10, "number of additions per step")
10 | flags.DEFINE_string("cluster", "aws", "where to run (aws or local)")
11 | FLAGS = flags.FLAGS
12 | 
13 | 
14 | def main():
15 |   if FLAGS.role == "launcher":
16 |     launcher()
17 |   elif FLAGS.role == "worker":
18 |     worker()
19 |   else:
20 |     assert False, "Unknown role "+FLAGS.role
21 | 
22 | 
23 | def launcher(do_local=False):
24 |   if FLAGS.cluster == 'local':
25 |     import tmux
26 |     job = tmux.tf_job('myjob', 1)
27 |   elif FLAGS.cluster == 'aws':
28 |     import aws
29 |     job = aws.tf_job('myjob', 1)
30 |   else:
31 |     assert False, "Unknown cluster "+FLAGS.cluster
32 | 
33 |   task = job.tasks[0]
34 |   task.upload(__file__)  # copies current script onto machine
35 |   setup_cmd = ("source ~/.bashrc && export PATH=~/anaconda3/bin:$PATH && "
36 |                "source activate tf")
37 |   task.run("%s && python %s --role=worker" % (setup_cmd, __file__,))
38 | 
39 |   print("To see the output: tail -f %s" %(task.last_stdout))
40 |   print("To interact with the task, do "+task.connect_instructions)
41 | 
42 | 
43 | def worker():
44 |   """Worker script that runs on AWS machine. Adds vectors of ones forever,
45 |   prints MB/s."""
46 | 
47 |   def session_config():
48 |     optimizer_options = tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)
49 |     config = tf.ConfigProto(
50 |         graph_options=tf.GraphOptions(optimizer_options=optimizer_options))
51 |     config.operation_timeout_in_ms = 10*1000  # abort after 10 seconds
52 |     return config
53 | 
54 |   params_size = 250*1000*FLAGS.data_mb  # 1MB is 250k floats
55 |   dtype=tf.float32
56 |   val = tf.ones((), dtype=dtype)
57 |   vals = tf.fill([params_size], val)
58 |   params = tf.Variable(vals)
59 |   update = params.assign_add(vals)
60 | 
61 |   sess = tf.Session(config=session_config())
62 |   sess.run(params.initializer)
63 | 
64 |   while True:
65 |     start_time = time.perf_counter()
66 |     for i in range(FLAGS.iters_per_step):
67 |       sess.run(update.op)
68 | 
69 |     elapsed_time = time.perf_counter() - start_time
70 |     rate = float(FLAGS.iters_per_step)*FLAGS.data_mb/elapsed_time
71 |     print('%.2f MB/s'%(rate,))
72 | 
73 | 
74 | if __name__=='__main__':
75 |   main()
76 | 
-------------------------------------------------------------------------------- /cluster/local_distributed_benchmark.py: --------------------------------------------------------------------------------
1 | """Benchmark tensorflow distributed by adding vector of ones on worker2
2 | to variable on worker1 as fast as possible.
3 | On a 2014 MacBook with TensorFlow 0.10 this shows:
4 | Local rate: 2175.28 MB per second
5 | Distributed rate: 107.13 MB per second
6 | """
7 | 
8 | import subprocess
9 | import tensorflow as tf
10 | import time
11 | import sys
12 | 
13 | flags = tf.flags
14 | flags.DEFINE_integer("iters", 10, "Maximum number of additions")
15 | flags.DEFINE_integer("data_mb", 100, "size of vector in MBs")
16 | flags.DEFINE_string("port1", "12224", "port of worker1")
17 | flags.DEFINE_string("port2", "12225", "port of worker2")
18 | flags.DEFINE_string("task", "", "internal use")
19 | FLAGS = flags.FLAGS
20 | 
21 | # setup local cluster from flags
22 | host = "127.0.0.1:"
23 | cluster = {"worker": [host+FLAGS.port1, host+FLAGS.port2]}
24 | clusterspec = tf.train.ClusterSpec(cluster).as_cluster_def()
25 | 
26 | def default_config():
27 |   optimizer_options = tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)
28 |   config = tf.ConfigProto(
29 |       graph_options=tf.GraphOptions(optimizer_options=optimizer_options))
30 |   config.log_device_placement = False
31 |   config.allow_soft_placement = False
32 |   return config
33 | 
34 | def create_graph(device1, device2):
35 |   """Create graph that keeps variable on device1 and
36 |   vector of ones/addition op on device2"""
37 | 
38 |   tf.reset_default_graph()
39 |   dtype=tf.int32
40 |   params_size = 250*1000*FLAGS.data_mb  # 1MB is 250k integers
41 | 
42 |   with tf.device(device1):
43 |     params = tf.get_variable("params", [params_size], dtype,
44 |                              initializer=tf.zeros_initializer)
45 |   with tf.device(device2):
46 |     # constant node gets placed on device1 because of simple_placer
47 |     # update = tf.constant(1, shape=[params_size], dtype=dtype)
48 |     update = tf.get_variable("update", [params_size], dtype,
49 |                              initializer=tf.ones_initializer)
50 |     add_op = params.assign_add(update)
51 | 
52 |   init_op = tf.initialize_all_variables()
53 |   return init_op, add_op
54 | 
55 | def run_benchmark(sess, init_op, add_op):
56 |   """Returns MB/s rate of addition."""
57 | 
58 |   sess.run(init_op)
59 |   sess.run(add_op.op)  # warm-up
60 |   start_time = time.time()
61 |   for i in range(FLAGS.iters):
62 |     # change to add_op.op to make faster
63 |     sess.run(add_op)
64 |   elapsed_time = time.time() - start_time
65 |   return float(FLAGS.iters)*FLAGS.data_mb/elapsed_time
66 | 
67 | 
68 | def run_benchmark_local():
69 |   ops = create_graph(None, None)
70 |   sess = tf.Session(config=default_config())
71 |   return run_benchmark(sess, *ops)
72 | 
73 | 
74 | def run_benchmark_distributed():
75 |   ops = create_graph("/job:worker/task:0", "/job:worker/task:1")
76 | 
77 |   # launch distributed service
78 |   def runcmd(cmd): subprocess.Popen(cmd, shell=True, stderr=subprocess.STDOUT)
79 |   runcmd("python %s --task=0"%(sys.argv[0]))
80 |   runcmd("python %s --task=1"%(sys.argv[0]))
81 |   time.sleep(1)
82 | 
83 |   sess = tf.Session("grpc://"+host+FLAGS.port1, config=default_config())
84 |   return run_benchmark(sess, *ops)
85 | 
86 | if __name__=='__main__':
87 |   if not FLAGS.task:
88 | 
89 |     rate1 = run_benchmark_local()
90 |     rate2 = run_benchmark_distributed()
91 | 
92 |     print("Adding data in %d MB chunks" %(FLAGS.data_mb))
93 |     print("Local rate: %.2f MB per second" %(rate1,))
94 |     print("Distributed rate: %.2f MB per second" %(rate2,))
95 | 
96 |   else:  # Launch TensorFlow server
97 |     server = tf.train.Server(clusterspec, config=default_config(),
98 |                              job_name="worker",
99 |                              task_index=int(FLAGS.task))
100 |     server.join()
101 | 
-------------------------------------------------------------------------------- /cluster/myutil.py: --------------------------------------------------------------------------------
1 | from pprint import pprint as pp
2 | import yaml
3 | #import util
4 | import boto3
5 | from collections import OrderedDict
6 | import time
7 | 
8 | class timeit:
9 |   """Context manager that measures the time spent in the block in seconds
10 |   and prints it."""
11 | 
12 |   def __init__(self, tag=""):
13 |     self.tag = tag
14 | 
15 |   def __enter__(self):
16 |     self.start = time.perf_counter()
17 |     return self
18 | 
19 |   def __exit__(self, *args):
20 |     self.end = time.perf_counter()
21 |     interval_sec = (self.end - self.start)
22 |     print("%s took %.2f seconds"%(self.tag, interval_sec))
23 | 
24 | def get_instance_ip_map():
25 |   """Return instance_id->private_ip map for all running instances."""
26 | 
27 |   ec2 = boto3.resource('ec2')
28 | 
29 |   # Get information for all running instances
30 |   running_instances = ec2.instances.filter(Filters=[{
31 |       'Name': 'instance-state-name',
32 |       'Values': ['running']}])
33 | 
34 |   ec2info = OrderedDict()
35 |   for instance in running_instances:
36 |     name = ''
37 |     for tag in instance.tags or []:
38 |       if 'Name' in tag['Key']:
39 |         name = tag['Value']
40 |     ec2info[instance.id] = instance.private_ip_address
41 | 
42 |   return ec2info
43 | 
-------------------------------------------------------------------------------- /cluster/terminate_instances.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | 
4 | Script to kill all instances matching given prefix.
5 | 6 | Usage: 7 | 8 | ./terminate_instances.py gpu # terminates all instances matching "gpu*" 9 | """ 10 | 11 | import boto3 12 | import time 13 | import sys 14 | import os 15 | 16 | LIMIT_TO_KEY = 'yaroslav' # only touch instances launched with this key, 17 | # set to '' to remove restriction 18 | 19 | def main(): 20 | prefix = sys.argv[1] 21 | 22 | ec2 = boto3.client('ec2') 23 | response = ec2.describe_instances() 24 | 25 | def get_name(instance_response): 26 | names = [entry['Value'] for entry in instance_response.get('Tags',[]) if 27 | entry['Key']=='Name'] 28 | if not names: 29 | names = [''] 30 | assert len(names)==1 31 | return names[0] 32 | 33 | instance_list = [] 34 | for reservation in response['Reservations']: 35 | for instance_response in reservation['Instances']: 36 | instance_list.append((get_name(instance_response), 37 | instance_response)) 38 | 39 | instances_to_kill = [] 40 | for (name, instance_response) in instance_list: 41 | if not name.startswith(prefix): 42 | continue 43 | key = instance_response.get('KeyName', '') 44 | if LIMIT_TO_KEY and LIMIT_TO_KEY != key: 45 | print("instance %s matches but key %s doesn't match desired key %s, " 46 | "skipping" %(name, key, LIMIT_TO_KEY)) 47 | continue 48 | state = instance_response['State']['Name'] 49 | if state == 'terminated': 50 | continue 51 | instances_to_kill.append((instance_response['InstanceId'], 52 | name, 53 | instance_response['AmiLaunchIndex'], 54 | state)) 55 | 56 | for (instance_id, name, task_id, state) in instances_to_kill: 57 | print("%s:%s %s"%(name, task_id, state)) 58 | 59 | answer = input("%d instances found, terminate? (Y/n) " % ( 60 | len(instances_to_kill))) 61 | if not answer: 62 | answer = "y" 63 | if answer.lower() == "y": 64 | instance_ids = [record[0] for record in instances_to_kill] 65 | response = ec2.terminate_instances(InstanceIds=instance_ids) 66 | print("Terminating, got response: %s" % (response,)) 67 | else: 68 | print("Didn't get y, doing nothing") 69 | 70 | 71 | if __name__=='__main__': 72 | main() 73 | -------------------------------------------------------------------------------- /cluster/test_aws.py: -------------------------------------------------------------------------------- 1 | # simple example of launching tensorflow job 2 | 3 | import aws 4 | import os 5 | import sys 6 | import time 7 | import tensorflow as tf 8 | import boto3 9 | 10 | flags = tf.flags 11 | flags.DEFINE_string("role", "launcher", "either launcher or worker") 12 | flags.DEFINE_integer("data_mb", 128, "size of vector in MBs") 13 | flags.DEFINE_integer("iters_per_step", 10, "number of additions per step") 14 | FLAGS = flags.FLAGS 15 | 16 | module_path=os.path.dirname(os.path.abspath(__file__)) 17 | sys.path.insert(0, module_path+'/tf-tools/benchmark/runner') 18 | import cluster_aws as toby_aws 19 | 20 | 21 | def test_new_job(): 22 | name = "testjob" 23 | instances = toby_aws.LookupAwsInstances(instance_tag=name) 24 | assert not instances, "Instances already exist, kill them first" 25 | 26 | job = aws.tf_job(name, 2) 27 | instances = toby_aws.LookupAwsInstances(instance_tag=name) 28 | assert len(instances) == 2 29 | 30 | def test_terminate_job(): 31 | aws.terminate_job("testjob") 32 | 33 | 34 | def test_reuse_job(): 35 | name = "testjob" 36 | job = aws.tf_job(name, 2) 37 | 38 | def test_send_file(): 39 | name = "testjob" 40 | job = aws.tf_job(name, 4) 41 | job.wait_until_ready() 42 | task0 = job.tasks[0] 43 | secret_word = "testfile3" 44 | os.system("echo '%s' > upload_test.txt"%(secret_word,)) 45 | 
task0.upload('upload_test.txt') 46 | stdout,stderr = task0.run_sync("cat upload_test.txt") 47 | print(stdout) # => testfile3 48 | assert stdout.strip() == secret_word 49 | 50 | def test_upload_directory(): 51 | pass 52 | 53 | def test_stream_output(): 54 | name = "testjob" 55 | job = aws.tf_job(name, 4) 56 | job.wait_until_ready() 57 | task = job.tasks[0] 58 | task.run('cd Dropbox && ls') 59 | time.sleep(0.5) # async ... todo: expose thread and join instead of sleep? 60 | os.system('cat '+task.last_stdout) 61 | 62 | 63 | def main(): 64 | # test_terminate_job() 65 | # test_new_job() 66 | # test_reuse_job() 67 | # test_send_file() 68 | test_stream_output() 69 | 70 | if __name__=='__main__': 71 | main() 72 | -------------------------------------------------------------------------------- /cluster/tf-tools/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .DS_Store 3 | -------------------------------------------------------------------------------- /cluster/tf-tools/benchmark/multi_gpu/advanced_tweaks_compare.sh: -------------------------------------------------------------------------------- 1 | # Showing NCHW vs NHWC, NCCL and parameter server GPU vs CPU 2 | _NUM_GPUS=1,2,8 3 | LOG_FOLDER=advanced_tests 4 | 5 | # PS GPU vs. CPU NHWC 6 | ./test_runner.sh --model resnet50 --num_batches 100 --batch_size 32 --gpus ${_NUM_GPUS} --ps_server gpu --variable_update send_recv --data_format NHWC --log_folder_prefix ${LOG_FOLDER} --framework tensorflow 7 | ./test_runner.sh --model resnet50 --num_batches 100 --batch_size 32 --gpus ${_NUM_GPUS} --ps_server cpu --variable_update send_recv --data_format NHWC --log_folder_prefix ${LOG_FOLDER} --framework tensorflow 8 | 9 | # NCHW vs NHWC (GPU PS) 10 | ./test_runner.sh --model resnet50 --num_batches 100 --batch_size 32 --gpus ${_NUM_GPUS} --ps_server gpu --data_format NCHW --variable_update send_recv --log_folder_prefix ${LOG_FOLDER} --framework tensorflow 11 | ./test_runner.sh --model resnet50 --num_batches 100 --batch_size 32 --gpus ${_NUM_GPUS} --ps_server gpu --data_format NHWC --variable_update send_recv --log_folder_prefix ${LOG_FOLDER} --framework tensorflow 12 | 13 | # Add NCCL to NCHW (GPU PS) 14 | ./test_runner.sh --model resnet50 --num_batches 100 --batch_size 32 --gpus ${_NUM_GPUS} --ps_server gpu --data_format NCHW --variable_update replicated --log_folder_prefix ${LOG_FOLDER} --framework tensorflow 15 | 16 | 17 | -------------------------------------------------------------------------------- /cluster/tf-tools/benchmark/multi_gpu/image_classification_bench_tests.sh: -------------------------------------------------------------------------------- 1 | # Runs tests for an 8 GPU server 2 | _NUM_GPUS=1,2,4,8 3 | # Inception v3 4 | ./test_runner.sh --model inception3 --num_batches 100 --batch_size 32 --gpus ${_NUM_GPUS} --framework tensorflow 5 | ./test_runner.sh --model inception3 --num_batches 4 --batch_size 32 --gpus ${_NUM_GPUS} --framework mxnet 6 | 7 | 8 | # Resnet-50 9 | ./test_runner.sh --model resnet50 --num_batches 100 --batch_size 32 --gpus ${_NUM_GPUS} --framework tensorflow 10 | ./test_runner.sh --model resnet50 --num_batches 4 --batch_size 32 --gpus ${_NUM_GPUS} --framework mxnet 11 | 12 | 13 | # Resnet-152 14 | ./test_runner.sh --model resnet152 --num_batches 100 --batch_size 32 --gpus ${_NUM_GPUS} --framework tensorflow 15 | ./test_runner.sh --model resnet152 --num_batches 4 --batch_size 32 --gpus ${_NUM_GPUS} --framework mxnet 16 | 17 | 18 | # AlexNet (OWT) 19 | # AlexNet 
script is broken on MXNet. 20 | #./test_runner.sh --model alexnet --num_batches 4 --batch_size 512 --gpus ${_NUM_GPUS} --framework mxnet 21 | ./test_runner.sh --model alexnet --num_batches 100 --batch_size 512 --gpus ${_NUM_GPUS} --framework tensorflow 22 | ./test_runner.sh --model alexnet --num_batches 100 --batch_size 128 --gpus ${_NUM_GPUS} --framework tensorflow 23 | -------------------------------------------------------------------------------- /cluster/tf-tools/benchmark/multi_gpu/stats_monitor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # Get all nvidia-smi data worth having 5 | # There is no historical data so calling this after a run 6 | # when the GPU may no longer be throttled is of no value. 7 | 8 | KEEP_LOOP=true 9 | LOG_FULL_PATH="./monitor_log.txt" 10 | LOG_SUMMARY_FULL_PATH="./log_summary.txt" 11 | 12 | 13 | while [[ $# -gt 0 ]]; do 14 | key="$1" 15 | # echo $key 16 | # echo $2 17 | case $key in 18 | --log_full_path) 19 | LOG_FULL_PATH="$2" # location to log raw monitoring logs, e.g. nvidia-smi 20 | shift 21 | ;; 22 | --log_summary_full_path) 23 | LOG_SUMMARY_FULL_PATH="$2" # location to write the summary, e.g. max GPUs throttled 24 | shift 25 | ;; 26 | *) 27 | echo "Unknown flag: $key" 28 | ;; 29 | esac 30 | shift # past argument or value 31 | done 32 | 33 | 34 | 35 | 36 | MAX_SLOWDOWN_GPUS=0 37 | # Handle CTRL-C or other term signal, log the max number of GPUs that showed 38 | # errors; for now that means "HW Slowdown: Active" 39 | function summarizeCleanup { 40 | echo "Max GPUs throttled: ${MAX_SLOWDOWN_GPUS}" 41 | echo "Max GPUs throttled: ${MAX_SLOWDOWN_GPUS}" >> $LOG_SUMMARY_FULL_PATH 42 | exit; 43 | } 44 | 45 | # catch being asked to end 46 | trap summarizeCleanup SIGINT SIGTERM 47 | 48 | # Log nvidia-smi data forever (until killed externally) and track when HW Slowdown: Active occurs, 49 | # which indicates overheating and a likely lower clock. 50 | while [ "$KEEP_LOOP" = "true" ]; do 51 | 52 | RESULT=$(nvidia-smi -q -d UTILIZATION,CLOCK,PERFORMANCE | tee -a ${LOG_FULL_PATH} | \ 53 | grep -E 'HW Slowdown' | awk '!/Not Active/ {count++} END{print count}') 54 | 55 | # Handle result being blank. There is likely a better way to do this with awk above. 56 | if [ "$RESULT" = "" ]; then 57 | RESULT=0 58 | fi 59 | 60 | if [ "$RESULT" -gt "$MAX_SLOWDOWN_GPUS" ]; then 61 | MAX_SLOWDOWN_GPUS=$RESULT 62 | echo "$MAX_SLOWDOWN_GPUS GPU(s) with slowdown" 63 | fi 64 | 65 | 66 | # 10 seconds seems to be reasonable. 67 | sleep 10 68 | 69 | done 70 | 71 | -------------------------------------------------------------------------------- /cluster/tf-tools/benchmark/multi_gpu/unit_test_stats_monitor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./monitor_nvidia.sh --log_full_path ./full_log.txt --log_summary_full_path ./log_summary.txt & 4 | 5 | NVIDIA_MONITOR=$! 6 | 7 | echo "Log monitor pid ${NVIDIA_MONITOR}" 8 | 9 | sleep 12 10 | 11 | kill $NVIDIA_MONITOR 12 | wait $NVIDIA_MONITOR 13 | echo "Success: ${NVIDIA_MONITOR} is no longer running" 14 | 15 | # put this in any script to hard kill monitor_nvidia 16 | echo "Test killing with pgrep" 17 | ./monitor_nvidia.sh --log_full_path ./full_log --log_summary_full_path ./log_summary & 18 | NVIDIA_MONITOR_2=$!
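# ($! above captures the PID of the most recently backgrounded command; the sleep below gives the monitor time to write a few samples before pgrep kills it)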
19 | 20 | sleep 12 21 | 22 | echo "Log monitor pid ${NVIDIA_MONITOR_2}" 23 | echo "kill with pgrep" 24 | pgrep "monitor_nvidia" | xargs kill 25 | 26 | echo "Wait until dead" 27 | wait $NVIDIA_MONITOR_2 28 | echo "Process is dead: Test successful" 29 | 30 | -------------------------------------------------------------------------------- /cluster/tf-tools/benchmark/runner/configs/aws/multi_server.yaml: -------------------------------------------------------------------------------- 1 | # Run config 2 | cloud_type: aws 3 | 4 | tf_url: tensorflow-gpu 5 | 6 | # Shared with AWS and GCE 7 | instance_tag: tf-monster 8 | instance_type: p2.8xlarge 9 | instance_force_reuse: False 10 | instance_ami: ami-xxxxxxx 11 | instance_count: 8 12 | #instance_on_finish: stop 13 | 14 | # As of May 2017 this config matches what was published on tf.org. 15 | # For batch-size 32, 4 ps_servers is the right setting for 8 workers 16 | run_configs: 17 | - name: distributed 18 | workers: 8 19 | ps_servers: 8 20 | gpus: 8 21 | models: ['resnet50'] 22 | ps_server: gpu 23 | data_format: NCHW 24 | variable_update: distributed_replicated 25 | log_folder: results 26 | framework: tensorflow 27 | num_batches: 100 28 | batch_size: 64 29 | repeat: 5 30 | cross_replica_sync: True 31 | optimizer: sgd 32 | 33 | 34 | #### 35 | # Full run 32 GPUs down to 1 GPU with ps_servers tuned for resnet50 36 | ####### 37 | 38 | - name: distributed 39 | workers: 4 40 | ps_servers: 4 41 | 42 | - name: distributed 43 | workers: 2 44 | ps_servers: 2 45 | 46 | - name: distributed 47 | workers: 1 48 | ps_servers: 1 49 | 50 | - name: distributed 51 | workers: '0' 52 | ps_servers: '0' 53 | gpus: 1 54 | -------------------------------------------------------------------------------- /cluster/tf-tools/benchmark/runner/configs/aws/yaroslav.yaml: -------------------------------------------------------------------------------- 1 | # Run config 2 | cloud_type: aws 3 | 4 | tf_url: tensorflow-gpu 5 | 6 | instance_tag: yaroslav 7 | instance_type: p2.xlarge 8 | instance_force_reuse: False 9 | instance_ami: ami-60df1418 10 | instance_count: 8 11 | #instance_on_finish: stop 12 | 13 | run_configs: 14 | - name: distributed 15 | workers: 1 16 | ps_servers: 1 17 | gpus: 1 18 | models: ['resnet50'] 19 | ps_server: gpu 20 | data_format: NCHW 21 | variable_update: distributed_replicated 22 | log_folder: results 23 | framework: tensorflow 24 | num_batches: 100 25 | batch_size: 64 26 | repeat: 1 27 | cross_replica_sync: True 28 | optimizer: sgd 29 | 30 | -------------------------------------------------------------------------------- /cluster/tf-tools/benchmark/runner/instance_info.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | import boto3 4 | 5 | """ 6 | A tool for retrieving basic information from the running EC2 instances. 
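Only instances whose Name tag is 'tf' are reported; each one's type, state, private/public IPs, and launch time are printed.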
7 | """ 8 | 9 | # Connect to EC2 10 | ec2 = boto3.resource('ec2') 11 | 12 | # Get information for all running instances 13 | running_instances = ec2.instances.filter(Filters=[{ 14 | 'Name': 'instance-state-name', 15 | 'Values': ['running']}]) 16 | 17 | ec2info = defaultdict() 18 | for instance in running_instances: 19 | for tag in instance.tags or []: 20 | if 'Name' in tag['Key']: 21 | name = tag['Value'] 22 | if name != 'tf': 23 | continue 24 | # Add instance info to a dictionary 25 | ec2info[instance.id] = { 26 | 'Name': name, 27 | 'Type': instance.instance_type, 28 | 'State': instance.state['Name'], 29 | 'Private IP': instance.private_ip_address, 30 | 'Public IP': instance.public_ip_address, 31 | 'Launch Time': instance.launch_time 32 | } 33 | 34 | attributes = ['Name', 'Type', 'State', 'Private IP', 'Public IP', 'Launch Time'] 35 | for instance_id, instance in ec2info.items(): 36 | for key in attributes: 37 | print("{0}: {1}".format(key, instance[key])) 38 | print("------") 39 | 40 | -------------------------------------------------------------------------------- /cluster/tf-tools/benchmark/runner/test_cluster_aws.py: -------------------------------------------------------------------------------- 1 | from command_builder import * 2 | from pprint import pprint as pp 3 | import yaml 4 | import cluster_aws 5 | 6 | from collections import OrderedDict 7 | import time 8 | 9 | AMI='ami-60df1418' # cuda 8 10 | AMI='ami-9ddb0fe5' # boyd base 11 | KEY_NAME='yaroslav' 12 | KEY_FILE=os.environ['HOME']+'/d/yaroslav.pem' 13 | SECURITY_GROUP='open' 14 | #INSTANCE_TYPE='g3.16xlarge' 15 | INSTANCE_TYPE='p2.8xlarge' 16 | TAG='tf' 17 | 18 | global_timeit_dict = OrderedDict() 19 | class timeit: 20 | """Context manager that measures time spent in the block in seconds and 21 | prints it.""" 22 | 23 | def __init__(self, tag=""): 24 | self.tag = tag 25 | 26 | def __enter__(self): 27 | self.start = time.perf_counter() 28 | return self 29 | 30 | def __exit__(self, *args): 31 | self.end = time.perf_counter() 32 | interval_sec = (self.end - self.start) 33 | print("%s took %.2f seconds"%(self.tag, interval_sec)) 34 | 35 | def test_two_machine(): 36 | pass 37 | 38 | 39 | def main(): 40 | FIRST_TIME = False 41 | 42 | if FIRST_TIME: 43 | with timeit('create_instances'): 44 | instances = cluster_aws.CreateAwsInstances(num_instances=2, 45 | image_id=AMI, 46 | key_name=KEY_NAME, 47 | ssh_key=KEY_FILE, 48 | security_group=SECURITY_GROUP, 49 | instance_tag=TAG, 50 | placement_group='', 51 | instance_type=INSTANCE_TYPE) 52 | else: 53 | instances = cluster_aws.LookupAwsInstances(instance_tag=TAG, 54 | ssh_key=KEY_FILE) 55 | # Exception connecting to host via ssh (could be a timeout): 56 | 57 | 58 | 59 | with timeit('connect'): 60 | instance = instances[0] 61 | instance.WaitUntilReady() 62 | 63 | 64 | def line_extractor(line): 65 | return True 66 | 67 | instance.ExecuteCommandAndStreamOutput('mkdir 43', 68 | stdout_file='/tmp/output') 69 | instance.ExecuteCommandAndStreamOutput('ls', stdout_file='/tmp/output') 70 | 71 | import pdb; pdb.set_trace() 72 | 73 | 74 | if __name__=='__main__': 75 | main() 76 | -------------------------------------------------------------------------------- /cluster/tf-tools/benchmark/runner/test_command_builder.py: -------------------------------------------------------------------------------- 1 | from command_builder import * 2 | from pprint import pprint as pp 3 | import yaml 4 | 5 | def main(): 6 | 7 | 8 | with open('configs/aws/yaroslav.yaml') as stream: 9 | config_yaml = 
yaml.load(stream) 10 | 11 | configs = LoadYamlRunConfig(config_yaml, 1) 12 | # pp(configs) 13 | 14 | config = configs[0] 15 | 16 | worker_hosts = ['1','2'] 17 | worker_hosts_str = ','.join(worker_hosts) 18 | ps_hosts = ['a','b'] 19 | ps_hosts_str = ','.join(ps_hosts) 20 | for i,worker in enumerate(worker_hosts): 21 | print(BuildDistributedCommandWorker(config, worker_hosts_str, ps_hosts_str, i)) 22 | 23 | for i,worker in enumerate(ps_hosts): 24 | print(BuildDistributedCommandPS(config, worker_hosts_str, ps_hosts_str, i)) 25 | 26 | 27 | 28 | 29 | 30 | if __name__=='__main__': 31 | main() 32 | -------------------------------------------------------------------------------- /cluster/upload_test.txt: -------------------------------------------------------------------------------- 1 | testfile3 2 | -------------------------------------------------------------------------------- /conditional_backprop.py: -------------------------------------------------------------------------------- 1 | # Example of conditionally enabling backprop based on a variable. 2 | # variable "switches" determines which entries of "y" will be backpropagated 3 | # through. 4 | # 5 | # IE, switches.assign([1,0]) enables backprop through first value but not 6 | # second. 7 | # 8 | # Running it you should see following on stdout: 9 | # Value 2.0, gradient 2.0 10 | # Value 2.0, gradient 0.0 11 | # Value 2.0, gradient 1.0 12 | 13 | import tensorflow as tf 14 | 15 | def conditional_backprop(do_backprop, tensor): 16 | do_backprop = tf.Print(do_backprop, [do_backprop], "switch query") 17 | t = tf.cond(tf.cast(do_backprop, tf.bool), 18 | lambda: tf.Print(tensor, [0], 19 | "backprop enabled for "+tensor.op.name), 20 | lambda: tf.zeros_like(tensor)) 21 | y = t + tf.stop_gradient(tensor - t) 22 | return y 23 | 24 | x = tf.ones((), name="x") 25 | y0 = tf.add(x, 0, name="y0") 26 | y1 = tf.add(x, 0, name="y1") 27 | 28 | switches = tf.Variable(tf.ones((2))) 29 | doit = tf.constant(True) 30 | yy0 = conditional_backprop(switches[0], y0) 31 | yy1 = conditional_backprop(switches[1], y1) 32 | y = tf.stack([yy0, yy1], name="y") 33 | 34 | z = tf.reduce_sum(y) 35 | 36 | grad = tf.gradients(z, [x])[0] 37 | 38 | sess = tf.Session() 39 | sess.run(tf.global_variables_initializer()) 40 | print("Value %.1f, gradient %.1f"%tuple(sess.run([z, grad]))) 41 | 42 | sess.run(switches.assign([0,0])) 43 | print("Value %.1f, gradient %.1f"%tuple(sess.run([z, grad]))) 44 | 45 | sess.run(switches.assign([1,0])) 46 | print("Value %.1f, gradient %.1f"%tuple(sess.run([z, grad]))) 47 | -------------------------------------------------------------------------------- /configure_tf.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/expect -d 2 | # Helper script that uses expect to automatically go through all configure 3 | # steps using the defaults for all options except 4 | # XLA: y 5 | # CUDA: y 6 | # compute capability: 3.5,5.0,6.0,6.1 7 | spawn ./configure 8 | expect "Please specify the location of python*" 9 | send "\r" 10 | expect "Please specify optimization flags to use during compilation when bazel option*" 11 | send "\r" 12 | expect "Do you wish to use jemalloc*" 13 | send "\r" 14 | expect "Do you wish to build TensorFlow with Google Cloud Platform*" 15 | send "\r" 16 | expect "Do you wish to build TensorFlow with Hadoop File System support*" 17 | send "\r" 18 | expect "Do you wish to build TensorFlow with the XLA*" 19 | send "y\r" 20 | expect "Please input the desired Python library*" 21 | send "\r" 22 | 
expect "Do you wish to build TensorFlow with OpenCL*" 23 | send "\r" 24 | expect "Do you wish to build TensorFlow with CUDA*" 25 | send "y\r" 26 | expect "Please specify which gcc should*" 27 | send "\r" 28 | expect "Please specify the CUDA SDK version you want to use*" 29 | send "\r" 30 | expect "Please specify the location where CUDA toolkit*" 31 | send "\r" 32 | expect "Please specify the Cudnn version*" 33 | send "\r" 34 | expect "Please specify the location where cuDNN" 35 | send "\r" 36 | expect "lease specify a list of comma-separated Cuda compute" 37 | send "3.5,5.2,6.0,6.1\r" 38 | set timeout 120 39 | expect eof 40 | -------------------------------------------------------------------------------- /configure_tf_cpu.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/expect -d 2 | # Helper script that uses expect to automatically go through all configure 3 | # steps using the defaults for all options except 4 | # XLA: y 5 | # CUDA: n 6 | spawn ./configure 7 | expect "Please specify the location of python*" 8 | send "\r" 9 | expect "Please specify optimization flags to use during compilation when bazel option*" 10 | send "\r" 11 | expect "Do you wish to use jemalloc*" 12 | send "\r" 13 | expect "Do you wish to build TensorFlow with Google Cloud Platform*" 14 | send "\r" 15 | expect "Do you wish to build TensorFlow with Hadoop File System support*" 16 | send "\r" 17 | expect "Do you wish to build TensorFlow with the XLA*" 18 | send "y\r" 19 | expect "Please input the desired Python library*" 20 | send "\r" 21 | expect "Do you wish to build TensorFlow with OpenCL*" 22 | send "\r" 23 | expect "Do you wish to build TensorFlow with CUDA*" 24 | send "\r" 25 | set timeout 120 26 | expect eof 27 | -------------------------------------------------------------------------------- /danjar_peek.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.python.client import timeline 3 | 4 | 5 | class Queue(tf.FIFOQueue): 6 | 7 | def __init__(self, capacity): 8 | s = () 9 | d = tf.int32 10 | super().__init__(capacity - 1, [d], [s]) 11 | self._first = tf.get_variable(name="var1", 12 | initializer=tf.ones_initializer(), 13 | shape=s, dtype=d, use_resource=False) 14 | self._size = tf.get_variable(name="size", shape=(), 15 | initializer=tf.zeros_initializer(), 16 | dtype=tf.int32, use_resource=False) 17 | 18 | def peek(self): 19 | return self._first.read_value() 20 | 21 | def enqueue(self, element): 22 | super_ = super() 23 | def first(): 24 | assigns = [self._first.assign(element)] 25 | with tf.control_dependencies(assigns): 26 | return tf.constant(0) 27 | 28 | def other(): 29 | with tf.control_dependencies([super_.enqueue(element)]): 30 | return tf.constant(0) 31 | 32 | with tf.control_dependencies([self._size.assign_add(1)]): 33 | dummy = tf.cond(tf.equal(self._size, 0), first, other) 34 | return tf.identity(dummy) 35 | 36 | 37 | queue = Queue(10) 38 | queue_peek = queue.peek() 39 | print("Peek op is "+str(queue_peek)) 40 | 41 | queue_init = queue.enqueue(tf.constant(-2)) 42 | 43 | 44 | print(tf.get_default_graph().as_graph_def()) 45 | for i in range(20): 46 | sess = tf.Session() 47 | sess.run(tf.global_variables_initializer()) 48 | sess.run(queue_init) 49 | print("queue size", sess.run(queue.size())) 50 | sess.run(queue.close()) 51 | 52 | # print("Printing queue") 53 | # while True: 54 | # print(sess.run(queue.dequeue())) 55 | 56 | run_options = 
tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) 57 | run_options.output_partition_graphs = True 58 | run_metadata = tf.RunMetadata() 59 | #import pdb; pdb.set_trace() 60 | # queue_peek, 61 | result = sess.run(queue_peek, run_metadata=run_metadata, 62 | options=run_options) 63 | 64 | tl = timeline.Timeline(run_metadata.step_stats) 65 | ctf = tl.generate_chrome_trace_format() 66 | with open('timeline-%d.json'%(i,), 'w') as f: 67 | f.write(ctf) 68 | with open('stepstats-%d.json'%(i,), 'w') as f: 69 | f.write(str(run_metadata)) 70 | 71 | print(result, end=' ') 72 | 73 | # Expected: 1 1 1 1 1 1 1 1 1 1 74 | # Actual: 0 1 0 0 1 1 0 0 0 1 75 | -------------------------------------------------------------------------------- /distributed/README.md: -------------------------------------------------------------------------------- 1 | TF distributed tools 2 | -------------------------------------------------------------------------------- /double_memory_bug.py: -------------------------------------------------------------------------------- 1 | # Troubleshooting 2 | # https://github.com/tensorflow/tensorflow/issues/13433#issuecomment-351722017 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | def sessrun(*args, **kwargs): 8 | """Helper to do sess.run and save run_metadata""" 9 | global sess, run_metadata 10 | 11 | run_metadata = tf.RunMetadata() 12 | 13 | kwargs['options'] = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) 14 | kwargs['run_metadata'] = run_metadata 15 | result = sess.run(*args, **kwargs) 16 | first_entry = args[0] 17 | # have to do this because sess.run(tensor) is same as sess.run([tensor]) 18 | if isinstance(first_entry, list): 19 | if len(first_entry) == 0 and len(args) == 1: 20 | return None 21 | first_entry = first_entry[0] 22 | 23 | import urllib.request 24 | response = urllib.request.urlopen("https://raw.githubusercontent.com/yaroslavvb/chain_constant_memory/master/mem_util.py") 25 | open("mem_util.py", "wb").write(response.read()) 26 | 27 | import mem_util 28 | 29 | 30 | dtype = tf.float32 31 | dtype_size = 4 # bytes 32 | #shape = (1000,1000*1000) 33 | shape = (100, 1000*1000) 34 | total_size = np.prod(shape)*dtype_size 35 | print("Variable with %.1f GB" %(total_size/1e9,)) 36 | w = tf.Variable(tf.random_uniform(shape,dtype=dtype),dtype=dtype) 37 | sess = tf.Session() 38 | sessrun(tf.global_variables_initializer()) 39 | print(sess.run(w[0,0])) 40 | 41 | mem_util.print_memory_timeline(run_metadata) 42 | -------------------------------------------------------------------------------- /dynamic_stitch_gpu.py: -------------------------------------------------------------------------------- 1 | # from https://github.com/tensorflow/tensorflow/issues/7251 2 | import os 3 | os.environ["CUDA_VISIBLE_DEVICES"]="0" 4 | 5 | import tensorflow as tf 6 | from tensorflow.python.client.timeline import Timeline 7 | 8 | with tf.device("/gpu:0"): 9 | x = tf.ones(100, name="x") 10 | idxs = tf.range(100) 11 | 12 | for i in range(10): 13 | y = tf.identity(x, name="identity-"+str(i)) 14 | x = tf.dynamic_stitch([idxs, idxs], [x, y], name="stitch-"+str(i)) 15 | 16 | config = tf.ConfigProto(graph_options=tf.GraphOptions(optimizer_options=tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0))) 17 | sess = tf.InteractiveSession(config=config) 18 | metadata = tf.RunMetadata() 19 | sess.run(x, options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE, 20 | output_partition_graphs=True), 21 | run_metadata=metadata) 22 | 23 | timeline = Timeline(metadata.step_stats) 24 | with 
open("dynamic_stitch_gpu_profile.json", "w") as f: 25 | f.write(timeline.generate_chrome_trace_format()) 26 | with open("dynamic_stitch_gpu_profile.pbtxt", "w") as f: 27 | f.write(str(metadata)) 28 | -------------------------------------------------------------------------------- /eager_lbfgs/common_gd.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | parser = argparse.ArgumentParser(description='PyTorch MNIST Example') 3 | 4 | parser.add_argument('--batch-size', type=int, default=60000, metavar='N', 5 | help='input batch size for training') 6 | parser.add_argument('--iters', type=int, default=100, metavar='N', 7 | help='number of iterations to run for (default: 20)') 8 | parser.add_argument('--lr', type=float, default=1.0, metavar='LR', 9 | help='learning rate (default: 1.0)') 10 | parser.add_argument('--no-cuda', action='store_true', default=False, 11 | help='disables CUDA training') 12 | parser.add_argument('--seed', type=int, default=1, metavar='S', 13 | help='random seed (default: 1)') 14 | parser.add_argument('--hidden-size', type=int, default=196, metavar='H', 15 | help='hidden size') 16 | parser.add_argument('--visible-size', type=int, default=784, metavar='V', 17 | help='visible-size') 18 | parser.add_argument('--gd', action='store_true', default=False, 19 | help='force run of gradient descent instead of lbfgs') 20 | parser.add_argument('--history', type=int, default=100, metavar='V', 21 | help='history buffer for lbfgs') 22 | args = parser.parse_args() 23 | -------------------------------------------------------------------------------- /eager_lbfgs/data/short_batch.csv: -------------------------------------------------------------------------------- 1 | 100 2 | 200 3 | 300 4 | -------------------------------------------------------------------------------- /eager_lbfgs/data/short_eager_batch.csv: -------------------------------------------------------------------------------- 1 | 10 2 | 100 3 | -------------------------------------------------------------------------------- /eager_lbfgs/data/short_eager_loss.csv: -------------------------------------------------------------------------------- 1 | 1.125071197748184204e-03 2 | 1.720546046271920204e-03 3 | 2.242934657260775566e-03 4 | -------------------------------------------------------------------------------- /eager_lbfgs/data/short_eager_time.csv: -------------------------------------------------------------------------------- 1 | 9.806975307874381542e-01 2 | 9.339727419428527355e-01 3 | 9.292591358534991741e-01 4 | -------------------------------------------------------------------------------- /eager_lbfgs/data/short_pytorch_loss.csv: -------------------------------------------------------------------------------- 1 | 1.125177601352334023e-03 2 | 1.720896689221262932e-03 3 | 2.242802875116467476e-03 4 | -------------------------------------------------------------------------------- /eager_lbfgs/data/short_pytorch_time.csv: -------------------------------------------------------------------------------- 1 | 2.150501497089862823e-01 2 | 2.058924520388245583e-01 3 | 1.908177738077938557e-01 4 | -------------------------------------------------------------------------------- /eager_lbfgs/pytorch_lbfgs.py: -------------------------------------------------------------------------------- 1 | import util as u 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | from torch.autograd import Variable 8 | import 
numpy as np 9 | 10 | # todo: make images global 11 | 12 | step = 0 13 | final_loss = None 14 | 15 | def benchmark(batch_size, iters, seed=1, cuda=True, history=100, verbose=False): 16 | global step, final_loss 17 | 18 | step = 0 19 | final_loss = None 20 | 21 | torch.manual_seed(seed) 22 | np.random.seed(seed) 23 | if cuda: 24 | torch.cuda.manual_seed(seed) 25 | 26 | visible_size = 28*28 27 | hidden_size = 196 28 | 29 | images = torch.Tensor(u.get_mnist_images(batch_size).T) 30 | images = images[:batch_size] 31 | if cuda: 32 | images = images.cuda() 33 | data = Variable(images) 34 | 35 | class Net(nn.Module): 36 | def __init__(self): 37 | super(Net, self).__init__() 38 | self.encoder = nn.Parameter(torch.rand(visible_size, hidden_size)) 39 | 40 | def forward(self, input): 41 | x = input.view(-1, visible_size) 42 | x = torch.sigmoid(torch.mm(x, self.encoder)) 43 | x = torch.sigmoid(torch.mm(x, torch.transpose(self.encoder, 0, 1))) 44 | return x.view_as(input) 45 | 46 | # initialize model and weights 47 | model = Net() 48 | model.encoder.data = torch.Tensor(u.ng_init(visible_size, 49 | hidden_size)) 50 | if cuda: 51 | model.cuda() 52 | 53 | model.train() 54 | optimizer = optim.LBFGS(model.parameters(), max_iter=iters, history_size=history, lr=1.0) 55 | 56 | times = [] 57 | def closure(): 58 | global step, final_loss 59 | optimizer.zero_grad() 60 | output = model(data) 61 | loss = F.mse_loss(output, data) 62 | if verbose: 63 | loss0 = loss.data[0] 64 | times.append(u.last_time()) 65 | print("Step %3d loss %6.5f msec %6.3f"%(step, loss0, u.last_time())) 66 | step+=1 67 | if step == iters: 68 | final_loss = loss.data[0] 69 | loss.backward() 70 | u.record_time() 71 | return loss 72 | 73 | optimizer.step(closure) 74 | 75 | output = model(data) 76 | loss = F.mse_loss(output, data) 77 | loss0 = loss.data[0] 78 | 79 | if verbose: 80 | u.summarize_time() 81 | 82 | # print(times) 83 | s = ','.join(["%f"%(n,) for n in times[2:]]) 84 | print('{', s,'}') 85 | 86 | return final_loss 87 | 88 | 89 | 90 | def main(): 91 | import common_gd 92 | args = common_gd.args 93 | args.cuda = not args.no_cuda and torch.cuda.is_available() 94 | 95 | print(benchmark(batch_size=args.batch_size, iters=args.iters, seed=args.seed, cuda = args.cuda, history=args.history, verbose=True)) 96 | 97 | if __name__=='__main__': 98 | main() 99 | -------------------------------------------------------------------------------- /eager_lbfgs/run_experiment.py: -------------------------------------------------------------------------------- 1 | # compare timing for variety of batch-sizes 2 | # TODO: make PyTorch not run out of memory 3 | 4 | import tensorflow as tf 5 | import eager_lbfgs 6 | import pytorch_lbfgs 7 | import numpy as np 8 | import util as u 9 | 10 | import time 11 | import sys 12 | import os 13 | 14 | def run_experiment(iters, name): 15 | 16 | #batch_sizes = [1, 10, 100, 1000, 10000, 60000] 17 | batch_sizes = [100, 200, 300] 18 | 19 | eager_stats = [] 20 | pytorch_stats = [] 21 | 22 | def benchmark(f): 23 | # do whole run once for pre-warming 24 | f() 25 | import gc; gc.collect() 26 | start_time = time.perf_counter() 27 | final_loss = f() 28 | elapsed_time = time.perf_counter() - start_time 29 | return final_loss, elapsed_time 30 | 31 | for batch_size in batch_sizes: 32 | def eager_run(): 33 | return eager_lbfgs.benchmark(batch_size=batch_size, iters=iters) 34 | eager_stats.append(benchmark(eager_run)) 35 | def pytorch_run(): 36 | return pytorch_lbfgs.benchmark(batch_size=batch_size, iters=iters) 37 | 
pytorch_stats.append(benchmark(pytorch_run)) 38 | 39 | print(eager_stats) 40 | print(pytorch_stats) 41 | # pytorch_losses 42 | # pytorch_times 43 | # pytorch_sizes 44 | 45 | eager_stats = np.array(eager_stats) 46 | pytorch_stats = np.array(pytorch_stats) 47 | u.dump(batch_sizes, name+"_batch.csv") 48 | 49 | u.dump(eager_stats[:,0], name+"_eager_loss.csv") 50 | u.dump(eager_stats[:,1], name+"_eager_time.csv") 51 | 52 | u.dump(pytorch_stats[:,0], name+"_pytorch_loss.csv") 53 | u.dump(pytorch_stats[:,1], name+"_pytorch_time.csv") 54 | 55 | 56 | if __name__=='__main__': 57 | if len(sys.argv)<2: 58 | print("Running short comparison") 59 | run_experiment(51, "short") 60 | else: 61 | print("Running long comparison") 62 | run_experiment(101, "long") 63 | 64 | -------------------------------------------------------------------------------- /enqueue_many_test.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import numpy as np 3 | os.environ["CUDA_VISIBLE_DEVICES"]="" 4 | import tensorflow as tf 5 | 6 | def create_session(): 7 | config = tf.ConfigProto(log_device_placement=False) 8 | config.operation_timeout_in_ms=5000 # terminate on long hangs 9 | config.gpu_options.per_process_gpu_memory_fraction=0.3 # don't hog all vRAM 10 | sess = tf.InteractiveSession("", config=config) 11 | return sess 12 | 13 | import time 14 | import threading 15 | import os 16 | os.environ['PYTHONUNBUFFERED'] = 'True' 17 | 18 | 19 | from google.protobuf.internal import api_implementation 20 | assert api_implementation._default_implementation_type == 'cpp' 21 | 22 | 23 | from tensorflow.python.client import timeline 24 | 25 | tf.reset_default_graph() 26 | 27 | reverse = False 28 | if len(sys.argv)>1: 29 | assert sys.argv[1] == 'reverse' 30 | reverse = True 31 | 32 | n = 10**6 33 | dtype = tf.int32 34 | queue = tf.FIFOQueue(capacity=2*n, dtypes=[dtype], shapes=[()]) 35 | zeros = tf.Variable(tf.zeros((n), name="0", dtype=dtype)) 36 | ones = tf.Variable(tf.ones((n), name="1", dtype=dtype)) 37 | enqueue_zeros = queue.enqueue_many(zeros, name="zeros") 38 | enqueue_ones = queue.enqueue_many(ones, name="ones") 39 | sess = create_session() 40 | sess.run(tf.global_variables_initializer()) 41 | 42 | start_time0 = time.time() 43 | run_metadatas = [] 44 | def run_op(op): 45 | start_time = time.time() 46 | print("%10.2f ms: starting op %s\n" % ((start_time-start_time0)*1000, op.name), flush=True, end='') 47 | 48 | options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) 49 | run_metadata = tf.RunMetadata() 50 | sess.run(op, options=options, run_metadata=run_metadata) 51 | end_time = time.time() 52 | print("%10.2f ms: ending op %s\n" % ((end_time-start_time0)*1000, op.name), flush=True, end='') 53 | run_metadatas.append(run_metadata) 54 | 55 | 56 | 57 | threads = [threading.Thread(group=None, target=run_op, args=(op,)) for op in (enqueue_zeros, enqueue_ones)] 58 | if reverse: 59 | threads.reverse() 60 | 61 | for t in threads: 62 | t.start() 63 | 64 | # wait for threads to finish 65 | for t in threads: 66 | t.join() 67 | 68 | # generate merged timeline 69 | merged_metadata = tf.RunMetadata() 70 | for run_metadata in run_metadatas: 71 | merged_metadata.MergeFrom(run_metadata) 72 | 73 | tl = timeline.Timeline(merged_metadata.step_stats) 74 | ctf = tl.generate_chrome_trace_format() 75 | with open(sys.argv[0]+'_%s_timeline.json'%(reverse), 'w') as f: 76 | f.write(ctf) 77 | 78 | assert sess.run(queue.size()) == 2*n 79 | result = sess.run(queue.dequeue_many(2*n)) 80 | padding = 
np.array([0]) 81 | 82 | diffs = np.concatenate([padding, result])-np.concatenate([result, padding]) 83 | print("Interleaving detected: %s" % (abs(diffs).sum()>2)) 84 | -------------------------------------------------------------------------------- /enqueue_many_test_singlerun.py: -------------------------------------------------------------------------------- 1 | # Test multiple enqueue many in single .run call 2 | import os, sys 3 | import numpy as np 4 | os.environ["CUDA_VISIBLE_DEVICES"]="" 5 | import tensorflow as tf 6 | 7 | def create_session(): 8 | config = tf.ConfigProto(log_device_placement=False) 9 | config.operation_timeout_in_ms=5000 # terminate on long hangs 10 | config.gpu_options.per_process_gpu_memory_fraction=0.3 # don't hog all vRAM 11 | sess = tf.InteractiveSession("", config=config) 12 | return sess 13 | 14 | import time 15 | import threading 16 | import os 17 | os.environ['PYTHONUNBUFFERED'] = 'True' 18 | 19 | 20 | from google.protobuf.internal import api_implementation 21 | assert api_implementation._default_implementation_type == 'cpp' 22 | 23 | 24 | from tensorflow.python.client import timeline 25 | tf.reset_default_graph() 26 | 27 | reverse = False 28 | if len(sys.argv)>1: 29 | assert sys.argv[1] == 'reverse' 30 | reverse = True 31 | 32 | n = 10**6 33 | dtype = tf.int32 34 | queue = tf.FIFOQueue(capacity=2*n, dtypes=[dtype], shapes=[()]) 35 | zeros = tf.Variable(tf.zeros((n), name="0", dtype=dtype)) 36 | ones = tf.Variable(tf.ones((n), name="1", dtype=dtype)) 37 | enqueue_zeros = queue.enqueue_many(zeros, name="zeros") 38 | enqueue_ones = queue.enqueue_many(ones, name="ones") 39 | sess = create_session() 40 | sess.run(tf.global_variables_initializer()) 41 | 42 | op = tf.group(enqueue_zeros, enqueue_ones) 43 | 44 | start_time0 = time.time() 45 | run_metadatas = [] 46 | def run_op(op): 47 | start_time = time.time() 48 | print("%10.2f ms: starting op %s\n" % ((start_time-start_time0)*1000, op.name), flush=True, end='') 49 | 50 | options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) 51 | run_metadata = tf.RunMetadata() 52 | sess.run(op, options=options, run_metadata=run_metadata) 53 | end_time = time.time() 54 | print("%10.2f ms: ending op %s\n" % ((end_time-start_time0)*1000, op.name), flush=True, end='') 55 | run_metadatas.append(run_metadata) 56 | 57 | 58 | 59 | threads = [threading.Thread(group=None, target=run_op, args=(op,))] 60 | 61 | for t in threads: 62 | t.start() 63 | 64 | # wait for threads to finish 65 | for t in threads: 66 | t.join() 67 | 68 | # generate merged timeline 69 | merged_metadata = tf.RunMetadata() 70 | for run_metadata in run_metadatas: 71 | merged_metadata.MergeFrom(run_metadata) 72 | 73 | tl = timeline.Timeline(merged_metadata.step_stats) 74 | ctf = tl.generate_chrome_trace_format() 75 | with open(sys.argv[0]+'_timeline.json', 'w') as f: 76 | f.write(ctf) 77 | 78 | assert sess.run(queue.size()) == 2*n 79 | result = sess.run(queue.dequeue_many(2*n)) 80 | padding = np.array([0]) 81 | 82 | diffs = np.concatenate([padding, result])-np.concatenate([result, padding]) 83 | print("Interleaving detected: %s" % (abs(diffs).sum()>2)) 84 | -------------------------------------------------------------------------------- /ericyue-slowreader/benchmark-batch.py: -------------------------------------------------------------------------------- 1 | # measure the speed at which batches can be made 2 | # Only 14k 3 | # range queue 1996115, batch queue 3885, 6455.13 per second 4 | # range d 404771, batch d 3229 5 | # range queue 1988698, batch queue 
11302, 14735.80 per second 6 | # range d -7417, batch d 7417 7 | # range queue 1981384, batch queue 18616, 14620.57 per second 8 | # range d -7314, batch d 7314 9 | # range queue 1974016, batch queue 25984, 14662.89 per second 10 | 11 | import tensorflow as tf 12 | import time 13 | 14 | 15 | steps_to_validate = 200 16 | epoch_number = 2 17 | thread_number = 2 18 | batch_size = 100 19 | 20 | capacity = 2*10**6 21 | # don't use too high of limit, 10**9 hangs (overflows to negative in TF?) 22 | a_queue = tf.train.range_input_producer(limit=10**3, num_epochs=2000, 23 | capacity=capacity, shuffle=False) 24 | 25 | # manually run the queue runner for a bit 26 | config = tf.ConfigProto(log_device_placement=False) 27 | config.operation_timeout_in_ms=5000 # terminate on long hangs 28 | sess = tf.InteractiveSession("", config=config) 29 | sess.run(tf.global_variables_initializer()) 30 | sess.run(tf.local_variables_initializer()) 31 | 32 | 33 | a_queue_qr = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)[0] 34 | for i in range(1000): 35 | sess.run(a_queue_qr.enqueue_ops) 36 | 37 | 38 | # check the size 39 | range_size_node = "input_producer/fraction_of_2000000_full/fraction_of_2000000_full_Size:0" 40 | 41 | # size gives raw size rather than number of batches 42 | batch_size_node = "batch/fifo_queue_Size:0" 43 | 44 | print("range size is ", sess.run(range_size_node)) 45 | 46 | # now create batch and run it manually 47 | # use size of 2 or get TypeError: 'Tensor' object is not iterable. 48 | # (possibly singleton list get auto-packed into a single Tensor) 49 | [b, _] = tf.train.batch([a_queue.dequeue()]*2, batch_size=batch_size, 50 | capacity=capacity) 51 | 52 | 53 | tf.train.start_queue_runners() 54 | start_time = time.time() 55 | old_range_size, old_batch_size = (0, 0) 56 | while True: 57 | new_range_size, new_batch_size = sess.run([range_size_node, batch_size_node]) 58 | 59 | new_time = time.time() 60 | rate = (new_batch_size-old_batch_size)/(new_time-start_time) 61 | print("range queue %d, batch queue %d, %.2f per second"%(new_range_size, 62 | new_batch_size, 63 | rate)) 64 | print("range d %d, batch d %d" %(new_range_size - old_range_size, 65 | new_batch_size - old_batch_size)) 66 | start_time = time.time() 67 | old_range_size, old_batch_size = new_range_size, new_batch_size 68 | time.sleep(0.5) 69 | 70 | 71 | 72 | def let_queue_repopulate(size_tensor, min_elements=100000, sleep_delay=0.5): 73 | """Wait until queue has enough elements.""" 74 | size2 = "input_producer/fraction_of_2000000_full/fraction_of_2000000_full_Size:0" 75 | while sess.run(size_tensor) < min_elements: 76 | print("Size1: %d, size2: %d" %tuple(sess.run([size_tensor, size2]))) 77 | time.sleep(sleep_delay) 78 | 79 | step = 0 80 | start_time = time.time() 81 | while True: 82 | step+=1 83 | let_queue_repopulate(size_tensor=batch_size_node) 84 | sess.run(b.op) 85 | if step % steps_to_validate == 0: 86 | end_time = time.time() 87 | sec = (end_time - start_time) 88 | print("[{}] time[{:6.2f}] step[{:10d}] speed[{:6d}]".format( 89 | str(end_time).split(".")[0],sec, step, 90 | int((steps_to_validate*batch_size)/sec) 91 | )) 92 | start_time = end_time 93 | -------------------------------------------------------------------------------- /ericyue-slowreader/benchmark-reader.py: -------------------------------------------------------------------------------- 1 | # [1484609202] time[ 0.01] step[ 20] speed[360350] 2 | # [1484609202] time[ 0.00] step[ 40] speed[1129322] 3 | # [1484609202] time[ 0.00] step[ 60] speed[546168] 4 | # [1484609202] 
time[ 0.00] step[ 80] speed[709696] 5 | # [1484609202] time[ 0.00] step[ 100] speed[1112399] 6 | # [1484609202] time[ 0.00] step[ 120] speed[1506033] 7 | 8 | import tensorflow as tf 9 | import time 10 | 11 | filename_queue = tf.train.string_input_producer(["./data.zlib"], 12 | shuffle=False, 13 | seed = int(time.time())) 14 | 15 | reader = tf.TFRecordReader(options = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.ZLIB)) 16 | _, serialized_example = reader.read(filename_queue) 17 | 18 | reader = tf.TFRecordReader(options = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.ZLIB)) 19 | _, serialized_example = reader.read(filename_queue) 20 | 21 | sess = tf.InteractiveSession() 22 | tf.train.start_queue_runners() 23 | 24 | batch_size = 100 25 | steps_to_validate = 20 26 | 27 | step = 0 28 | start_time = time.time() 29 | while True: 30 | step+=1 31 | sess.run(serialized_example.op) 32 | if step % steps_to_validate == 0: 33 | end_time = time.time() 34 | sec = (end_time - start_time) 35 | print("[{}] time[{:6.2f}] step[{:10d}] speed[{:6d}]".format( 36 | str(end_time).split(".")[0],sec, step, 37 | int((steps_to_validate*batch_size)/sec) 38 | )) 39 | start_time = end_time 40 | -------------------------------------------------------------------------------- /ericyue-slowreader/benchmark-synthetic-batch.py: -------------------------------------------------------------------------------- 1 | # [1484611992] time[ 0.00] step[ 420] speed[613695] 2 | # [1484611992] time[ 0.00] step[ 440] speed[501141] 3 | # [1484611992] time[ 0.01] step[ 460] speed[351428] 4 | # [1484611992] time[ 0.00] step[ 480] speed[450032] 5 | # [1484611993] time[ 0.14] step[ 500] speed[ 14419] 6 | # [1484611993] time[ 0.15] step[ 520] speed[ 13662] 7 | # [1484611993] time[ 0.14] step[ 540] speed[ 13960] 8 | # [1484611993] time[ 0.15] step[ 560] speed[ 13069] 9 | 10 | import tensorflow as tf 11 | import time 12 | 13 | 14 | steps_to_validate = 200 15 | epoch_number = 2 16 | thread_number = 2 17 | batch_size = 100 18 | 19 | capacity = 2*10**6 20 | # don't use too high of limit, 10**9 hangs (overflows to negative in TF?) 21 | a_queue = tf.train.range_input_producer(limit=10**3, capacity=capacity) 22 | 23 | # use size of 2 or get TypeError: 'Tensor' object is not iterable. 
24 | # (possibly singleton list get auto-packed into a single Tensor) 25 | [b, _] = tf.train.batch([a_queue.dequeue()]*2, batch_size=100, 26 | capacity=capacity) 27 | 28 | 29 | config = tf.ConfigProto(log_device_placement=True) 30 | config.operation_timeout_in_ms=5000 # terminate on long hangs 31 | sess = tf.InteractiveSession("", config=config) 32 | 33 | tf.train.start_queue_runners() 34 | 35 | def let_queue_repopulate(size_tensor, min_elements=100000, sleep_delay=0.5): 36 | """Wait until queue has enough elements.""" 37 | size2 = "input_producer/fraction_of_2000000_full/fraction_of_2000000_full_Size:0" 38 | while sess.run(size_tensor) < min_elements: 39 | print("Size1: %d, size2: %d" %tuple(sess.run([size_tensor, size2]))) 40 | time.sleep(sleep_delay) 41 | 42 | step = 0 43 | start_time = time.time() 44 | while True: 45 | step+=1 46 | let_queue_repopulate(size_tensor="batch/fifo_queue_Size:0") 47 | sess.run(b.op) 48 | if step % steps_to_validate == 0: 49 | end_time = time.time() 50 | sec = (end_time - start_time) 51 | print("[{}] time[{:6.2f}] step[{:10d}] speed[{:6d}]".format( 52 | str(end_time).split(".")[0],sec, step, 53 | int((steps_to_validate*batch_size)/sec) 54 | )) 55 | start_time = end_time 56 | -------------------------------------------------------------------------------- /ericyue-slowreader/benchmark-synthetic.py: -------------------------------------------------------------------------------- 1 | # [1484615767] time[ 0.31] step[ 2000] speed[652222] 2 | # [1484615767] time[ 0.31] step[ 4000] speed[654197] 3 | # [1484615768] time[ 0.30] step[ 6000] speed[661347] 4 | # [1484615768] time[ 0.30] step[ 8000] speed[662600] 5 | # 6 | # with_dequeu_many = False 7 | # [1484614505] time[ 0.97] step[ 2000] speed[205131] 8 | # [1484614506] time[ 0.96] step[ 4000] speed[208224] 9 | # [1484614507] time[ 0.96] step[ 6000] speed[208984] 10 | # [1484614508] time[ 0.95] step[ 8000] speed[209907] 11 | 12 | import tensorflow as tf 13 | import time 14 | 15 | # try benchmarking 16 | steps_to_validate = 2000 17 | epoch_number = 2 18 | thread_number = 2 19 | batch_size = 100 20 | use_dequeue_many = True 21 | 22 | 23 | # don't use too high of limit, 10**9 hangs (overflows to negative in TF?) 
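# (range_input_producer returns a FIFOQueue fed by a background QueueRunner that repeatedly enqueues 0..limit-1; capacity only bounds how far that producer can run ahead of the consumer)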
24 | a_queue = tf.train.range_input_producer(limit=10**3, capacity=1000, shuffle=False) 25 | #a_queue = tf.train.string_input_producer(["hello"]) 26 | 27 | 28 | # use an op that guarantees batch_size dequeues 29 | if use_dequeue_many: 30 | a_batch = a_queue.dequeue_many(n=batch_size) 31 | a_batch_op = a_batch.op 32 | else: 33 | # otherwise just do batch_size dequeue ops 34 | a = a_queue.dequeue() 35 | a_batch = [a+i for i in range(batch_size)] 36 | a_batch_op = tf.group(*a_batch) 37 | 38 | config = tf.ConfigProto(log_device_placement=False) 39 | config.operation_timeout_in_ms=5000 # terminate on long hangs 40 | sess = tf.InteractiveSession("", config=config) 41 | 42 | tf.train.start_queue_runners() 43 | 44 | step = 0 45 | start_time = time.time() 46 | while True: 47 | step+=1 48 | sess.run(a_batch_op) 49 | if step % steps_to_validate == 0: 50 | end_time = time.time() 51 | sec = (end_time - start_time) 52 | print("[{}] time[{:6.2f}] step[{:10d}] speed[{:6d}]".format( 53 | str(end_time).split(".")[0],sec, step, 54 | int((steps_to_validate*batch_size)/sec) 55 | )) 56 | start_time = end_time 57 | -------------------------------------------------------------------------------- /ericyue-slowreader/data.zlib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaroslavvb/stuff/a8024ead315aa1b5d6976940b3a062178f0e499d/ericyue-slowreader/data.zlib -------------------------------------------------------------------------------- /ericyue-slowreader/profile-batch.py: -------------------------------------------------------------------------------- 1 | # script for getting cpu profile of queue runners 2 | # 3 | # sudo apt-get install google-perftools 4 | # LD_PRELOAD has to be set in a forked script, otherwise shell will 5 | # overwrite the profile file 6 | 7 | import os, sys, subprocess 8 | 9 | my_env = os.environ.copy() 10 | my_env["LD_PRELOAD"]="/usr/lib/libtcmalloc_and_profiler.so.4" 11 | my_env["CPUPROFILE"]="/tmp/profile-yue/profile" 12 | 13 | args = ["python", "benchmark-batch-noqueuerunners.py"] 14 | proc = subprocess.Popen(args, stderr=subprocess.STDOUT, env=my_env) 15 | print("Done") 16 | 17 | -------------------------------------------------------------------------------- /example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaroslavvb/stuff/a8024ead315aa1b5d6976940b3a062178f0e499d/example.png -------------------------------------------------------------------------------- /free_gpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Parse nvidia-smi for pids and kill all GPU users 3 | # Tested on nvidia-smi 370.23 4 | import os, re, sys, subprocess 5 | import pwd 6 | 7 | from collections import defaultdict 8 | 9 | def tokenize(cmd): 10 | if isinstance(cmd, list): 11 | return cmd 12 | if isinstance(cmd, bytes): 13 | cmd = cmd.decode("ascii") 14 | if isinstance(cmd, str): 15 | cmd = cmd.split(None) 16 | return cmd 17 | 18 | 19 | def run_command(cmd): 20 | """Run command, return output as string.""" 21 | 22 | output = subprocess.Popen(cmd, stdout=subprocess.PIPE, 23 | shell=True).communicate()[0] 24 | return output.decode("ascii") 25 | 26 | 27 | def run_shell(cmd): 28 | """Runs shell command, returns list of outputted lines 29 | with newlines stripped.""" 30 | 31 | cmd = tokenize(cmd) 32 | p = subprocess.Popen(cmd, stdout=subprocess.PIPE, 33 | stderr=subprocess.STDOUT) 34 | (stdout, stderr) = 
p.communicate() 35 | stdout = stdout.decode("ascii") # turn into string to make Python3 happy 36 | lines = stdout.split('\n') 37 | stripped_lines = [] 38 | for l in lines: 39 | stripped_line = l.strip() 40 | if l: 41 | stripped_lines.append(stripped_line) 42 | return stripped_lines 43 | 44 | 45 | def run_shell_background(cmd_orig): 46 | """Runs shell command in background, returns pid.""" 47 | 48 | cmd = tokenize(cmd_orig) 49 | p = subprocess.Popen(cmd, close_fds=True) 50 | print("[%d] %s " % (p.pid, cmd_orig)) 51 | return p.pid 52 | 53 | def get_pid_gpu_map(): 54 | """Returns map of GPU id to memory allocated on that GPU.""" 55 | 56 | output = run_command("nvidia-smi") 57 | gpu_output = output[output.find("GPU Memory"):] 58 | # lines of the form 59 | # | 0 8734 C python 11705MiB | 60 | regex = re.compile(r"[|]\s+?(?P<gpu_id>\d+)\D+?(?P<pid>\d+).+[ ]" 61 | "(?P<mem>\d+)MiB") 62 | rows = gpu_output.split("\n") 63 | pids = [] 64 | pid_gpu_map = defaultdict(list) 65 | for row in gpu_output.split("\n"): 66 | m = regex.search(row) 67 | if not m: 68 | continue 69 | pid = int(m.group("pid")) 70 | gpu_id = int(m.group("gpu_id")) 71 | print("pid %s using gpu %s"%(pid, gpu_id)) 72 | pid_gpu_map[pid].append(gpu_id) 73 | return pid_gpu_map 74 | 75 | def kill_pids(pids_to_kill): 76 | pids = [] 77 | for pid_to_kill in pids_to_kill: 78 | pid = run_shell_background("sudo kill -9 "+str(pid_to_kill)) 79 | pids.append(pid) 80 | return pids 81 | 82 | 83 | def owner(pid): 84 | '''Return username of UID of process pid''' 85 | UID = 1 86 | EUID = 2 87 | for ln in open('/proc/%d/status' % pid): 88 | if ln.startswith('Uid:'): 89 | uid = int(ln.split()[UID]) 90 | return pwd.getpwuid(uid).pw_name 91 | 92 | if __name__ == '__main__': 93 | pid_gpu_map = get_pid_gpu_map() 94 | print("%10s %10s %s" %("pid", "username", "gpu")) 95 | for pid in pid_gpu_map: 96 | print("%10s %10s %s" %(pid, owner(pid), pid_gpu_map[pid])) 97 | answer = input("kill these? 
(Y/n) ") 98 | if not answer: 99 | answer = "y" 100 | if answer.lower() == "y": 101 | pids = kill_pids(pid_gpu_map.keys()) 102 | else: 103 | print("Didn't get y, doing nothing") 104 | -------------------------------------------------------------------------------- /github_pyfunc_slowness.py: -------------------------------------------------------------------------------- 1 | # Example of py_func slowing down future computations 2 | # On Mac 3 | # time 1 0.007195033016614616 4 | # time 2 0.0070790809113532305 5 | # time 3 0.008019614033401012 6 | # 7 | # On Xeon V3: 8 | # time 1 0.011401358991861343 9 | # time 2 0.011637557297945023 10 | # time 3 0.012380894273519516 11 | # 12 | # On Mac without MKL installed: 13 | # time 1 0.011707969009876251 14 | # time 2 0.011970046092756093 15 | # time 3 0.011933871079236269 16 | 17 | import numpy as np 18 | import scipy 19 | import scipy.linalg 20 | import tensorflow as tf 21 | import timeit 22 | sess = tf.Session() 23 | a = np.random.random((300, 300)) 24 | a = a.dot(a.T) 25 | best_time = np.inf 26 | for i in range(10): 27 | s = timeit.default_timer() 28 | scipy.linalg.eigh(a) 29 | e = timeit.default_timer() 30 | if e - s < best_time: 31 | best_time = e - s 32 | print("time 1", best_time) 33 | 34 | np.linalg.svd(np.random.randn(2, 300)) 35 | 36 | best_time = np.inf 37 | for i in range(10): 38 | s = timeit.default_timer() 39 | scipy.linalg.eigh(a) 40 | e = timeit.default_timer() 41 | if e - s < best_time: 42 | best_time = e - s 43 | print("time 2", best_time) 44 | 45 | ret = tf.py_func(np.linalg.svd, [np.random.randn(2, 300)], [tf.float64, tf.float64, tf.float64]) 46 | sess.run(ret) 47 | 48 | best_time = np.inf 49 | for i in range(10): 50 | s = timeit.default_timer() 51 | scipy.linalg.eigh(a) 52 | e = timeit.default_timer() 53 | if e - s < best_time: 54 | best_time = e - s 55 | print("time 3", best_time) 56 | -------------------------------------------------------------------------------- /gpu_oom.py: -------------------------------------------------------------------------------- 1 | # Example of catching GPU OOM error 2 | # http://stackoverflow.com/questions/41942538/tensorflow-gpu-memory-error-try-except-not-catching-the-error 3 | 4 | import tensorflow as tf 5 | 6 | try: 7 | with tf.device("gpu:0"): 8 | a = tf.Variable(tf.ones((10000, 10000))) 9 | sess = tf.Session() 10 | sess.run(tf.initialize_all_variables()) 11 | except: 12 | print("Caught error") 13 | import pdb; pdb.set_trace() 14 | -------------------------------------------------------------------------------- /gpu_svd_bench.py: -------------------------------------------------------------------------------- 1 | linalg-benchmark/benchmark.py -------------------------------------------------------------------------------- /graphvis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaroslavvb/stuff/a8024ead315aa1b5d6976940b3a062178f0e499d/graphvis.png -------------------------------------------------------------------------------- /input_benchmarks/convert_to_records.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Converts MNIST data to TFRecords file format with Example protos.""" 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os 22 | import tensorflow as tf 23 | from tensorflow.contrib.learn.python.learn.datasets import mnist 24 | 25 | 26 | SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/' 27 | 28 | TRAIN_IMAGES = 'train-images-idx3-ubyte.gz' # MNIST filenames 29 | TRAIN_LABELS = 'train-labels-idx1-ubyte.gz' 30 | TEST_IMAGES = 't10k-images-idx3-ubyte.gz' 31 | TEST_LABELS = 't10k-labels-idx1-ubyte.gz' 32 | 33 | 34 | tf.app.flags.DEFINE_string('directory', '/tmp/data', 35 | 'Directory to download data files and write the ' 36 | 'converted result') 37 | tf.app.flags.DEFINE_integer('validation_size', 5000, 38 | 'Number of examples to separate from the training ' 39 | 'data for the validation set.') 40 | FLAGS = tf.app.flags.FLAGS 41 | 42 | 43 | def _int64_feature(value): 44 | return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) 45 | 46 | 47 | def _bytes_feature(value): 48 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 49 | 50 | 51 | def convert_to(data_set, name): 52 | images = data_set.images 53 | labels = data_set.labels 54 | num_examples = data_set.num_examples 55 | 56 | if images.shape[0] != num_examples: 57 | raise ValueError('Images size %d does not match label size %d.' % 58 | (images.shape[0], num_examples)) 59 | rows = images.shape[1] 60 | cols = images.shape[2] 61 | depth = images.shape[3] 62 | 63 | filename = os.path.join(FLAGS.directory, name + '.tfrecords') 64 | print('Writing', filename) 65 | writer = tf.python_io.TFRecordWriter(filename) 66 | for index in range(num_examples): 67 | image_raw = images[index].tostring() 68 | example = tf.train.Example(features=tf.train.Features(feature={ 69 | 'height': _int64_feature(rows), 70 | 'width': _int64_feature(cols), 71 | 'depth': _int64_feature(depth), 72 | 'label': _int64_feature(int(labels[index])), 73 | 'image_raw': _bytes_feature(image_raw)})) 74 | writer.write(example.SerializeToString()) 75 | writer.close() 76 | 77 | 78 | def main(argv): 79 | # Get the data. 80 | data_sets = mnist.read_data_sets(FLAGS.directory, 81 | dtype=tf.uint8, 82 | reshape=False) 83 | 84 | # Convert to Examples and write the result to TFRecords. 
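# (For reference, grounded in convert_to() above: each serialized Example stores int64 features 'height', 'width', 'depth' and 'label', plus the raw uint8 image bytes under the key 'image_raw'.)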
85 | convert_to(data_sets.train, 'train') 86 | convert_to(data_sets.validation, 'validation') 87 | convert_to(data_sets.test, 'test') 88 | 89 | 90 | if __name__ == '__main__': 91 | tf.app.run() 92 | -------------------------------------------------------------------------------- /jupyter-version.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaroslavvb/stuff/a8024ead315aa1b5d6976940b3a062178f0e499d/jupyter-version.png -------------------------------------------------------------------------------- /khatri_rao_benchmark.py: -------------------------------------------------------------------------------- 1 | # 0.6 - 0.8 for 10x10 khatri rao 2 | # After improvement: 0.02 seconds 3 | import tensorflow as tf 4 | import util as u 5 | import time 6 | import os 7 | import sys 8 | 9 | 10 | def benchmark_construct(dims, iters, dtype): 11 | A = tf.ones((dims, dims), dtype=dtype) 12 | B = tf.ones((dims, dims), dtype=dtype) 13 | prods = [] 14 | time0 = time.time() 15 | for i in range(iters): 16 | prods.append(u.khatri_rao(A,B)) 17 | elapsed = time.time() - time0 18 | print("Constructed %d x %d kr %d times in %.2f seconds"%(A.shape[0], B.shape[0], iters, elapsed)) 19 | 20 | def benchmark_execute(dims, iters, dtype): 21 | A = tf.random_uniform((dims, dims), dtype=dtype) 22 | B = tf.random_uniform((dims, dims), dtype=dtype) 23 | prods = [] 24 | for i in range(iters): 25 | prods.append(u.khatri_rao(A,B)) 26 | 27 | sess = tf.Session() 28 | elapsed_times = [] 29 | u.reset_time() 30 | for i in range(10): 31 | time0 = time.time() 32 | sess.run(tf.group(*prods)) 33 | elapsed_times.append(time.time()-time0) 34 | u.record_time() 35 | u.summarize_time() 36 | 37 | 38 | if __name__ == '__main__': 39 | dims = 10 40 | iters = 10 41 | dtype = tf.float32 42 | benchmark_construct(dims, iters, dtype) 43 | benchmark_execute(dims, iters, dtype) 44 | 45 | -------------------------------------------------------------------------------- /lazy_dog.py: -------------------------------------------------------------------------------- 1 | # Overfit GPT model to "the quick brown fox" 2 | # 3 | # 906.45 -- the a , " he said . " i 'm not 4 | # 310.08 -- the i - " " i 'm not going to 5 | # 134.41 -- the i - " " i 'm not a child 6 | # 30.41 -- the i - " " i 'm not going to 7 | # 8.07 -- the quick , " he said , " i 'm not 8 | # 3.61 -- the quick quick quick steps , and then the quick quick 9 | # 2.15 -- the quick quick quick jumps over the low fence jumps over 10 | # 1.41 -- the quick fox jumps over the lazy dog jumps over the 11 | # 1.13 -- the quick fox jumps over the lazy dog jumps over the 12 | # 1.05 -- the quick quick brown fox jumps over the lazy dog jumps 13 | # 1.02 -- the quick brown fox jumps over the lazy dog jumps over 14 | # 1.01 -- the quick jumps over the lazy dog jumps over the lazy 15 | # 1.02 -- the quick brown fox jumps over the lazy dog jumps over 16 | # 1.13 -- the quick brown fox jumps over the lazy dog jumps over 17 | # 1.02 -- the quick brown fox jumps over the lazy dog jumps over 18 | # 1.00 -- the quick brown fox jumps over the lazy dog jumps over 19 | # 1.01 -- the quick brown fox jumps over the lazy dog jumps over 20 | # 1.00 -- the quick brown fox jumps over the lazy dog jumps over 21 | # 1.00 -- the quick brown fox jumps over the lazy dog jumps over 22 | # 1.00 -- the quick brown fox jumps over the lazy dog jumps over 23 | 24 | 25 | import math 26 | import torch 27 | from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel 28 | 29 | 30 | def argmax(t): 31 | return int(torch.argmax(t).detach().numpy()) 32 | 33 | def decode(start_tokens, length=10): 34 | result = [] 35 | context = torch.ones(1, 0, dtype=torch.long) 36 | for start_token in start_tokens: 37 | new_token = torch.full((1, 1), start_token, dtype=torch.long) 38 | context = torch.cat((context, new_token), dim=1) 39 | result.append(tokenizer.convert_ids_to_tokens([start_token])[0]) 40 | 41 | with torch.no_grad(): 42 | for i in range(length): 43 | logits = model(context) # 1 x seq_len x vocab_size 44 | predicted_id = argmax(logits[0,-1]) 45 | predicted_word = tokenizer.convert_ids_to_tokens([predicted_id])[0] 46 | # strip GPT's BPE end-of-word marker from the predicted token 47 | if predicted_word.endswith('</w>'): 48 | predicted_word = predicted_word[:-len('</w>')] 49 | result.append(predicted_word) 50 | 51 | predicted_id_batch = torch.tensor([[predicted_id]]) 52 | context = torch.cat((context, predicted_id_batch), dim=1) 53 | 54 | result = ' '.join(result) 55 | result = result.replace('\n', ' ') 56 | return result 57 | 58 | 59 | def main(): 60 | global tokenizer, model 61 | 62 | train_dataset = 'the quick brown fox jumps over the lazy dog' 63 | tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') 64 | tokenized = [tokenizer.tokenize(train_dataset)] 65 | 66 | # [[481, 2279, 2507, 8573, 11670, 715, 481, 8447, 2585]] 67 | encoded = [tokenizer.convert_tokens_to_ids(t) for t in tokenized] 68 | model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt') 69 | 70 | optimizer = torch.optim.SGD(model.parameters(), lr = 0.001, momentum=0.9) 71 | 72 | 73 | batch = torch.tensor(encoded) 74 | 75 | start_words = ['the'] 76 | start_tokens = [tokenizer.convert_tokens_to_ids(w) for w in start_words] 77 | 78 | for i in range(20): 79 | loss = model(input_ids=batch, lm_labels=batch) 80 | perplexity = math.exp(loss.item()) 81 | print('%5.2f -- %s'%(perplexity, decode(start_tokens))) 82 | 83 | loss.backward() 84 | optimizer.step() 85 | optimizer.zero_grad() 86 | 87 | 88 | if __name__=='__main__': 89 | main() 90 | 91 | -------------------------------------------------------------------------------- /linalg-benchmark/bad_matrix.py: 
-------------------------------------------------------------------------------- 1 | from scipy import linalg # for svd 2 | import urllib.request 3 | import numpy as np 4 | 5 | url="https://storage.googleapis.com/tensorflow-community-wheels/svd_in" 6 | response = urllib.request.urlopen(url) 7 | body = response.read() 8 | print("Read %d bytes"%(len(body),)) 9 | assert len(body) == 15366400 10 | open("svd_in", "wb").write(body) 11 | 12 | dtype = np.float32 13 | matrix0 = np.genfromtxt('svd_in', 14 | delimiter= ",").astype(dtype) 15 | assert matrix0.shape == (784, 784) 16 | u, s, v = linalg.svd(matrix0) 17 | print("matrix0 any NaNs: %s"% (np.isnan(matrix0).any(),)) 18 | print("u has NaNs: %s"% (np.isnan(u).any(),)) 19 | -------------------------------------------------------------------------------- /linalg-benchmark/environment.yml: -------------------------------------------------------------------------------- 1 | name: benchmark 2 | channels: 3 | - anaconda 4 | - pytorch 5 | dependencies: 6 | - python=3.6 7 | - mkl 8 | - pytorch 9 | - scipy 10 | - numpy 11 | - pip: 12 | - tensorflow-gpu 13 | -------------------------------------------------------------------------------- /linalg-benchmark/get_cores_per_socket.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Simple script to parse cpuinfo and generate command to limit to a single physical socket""" 3 | 4 | import re 5 | socket_re = re.compile(".*?processor.*?(?P<cpu>\d+).*?physical id.*?(?P<socket>\d+).*?power", flags=re.S) 6 | from collections import defaultdict 7 | socket_dict = defaultdict(list) 8 | for cpu, socket in socket_re.findall(open('/proc/cpuinfo').read()): 9 | socket_dict[socket].append(cpu) 10 | 11 | 12 | for socket,cpus in socket_dict.items(): 13 | print('to set to socket', socket) 14 | print('export GOMP_CPU_AFFINITY=%s'%(','.join(cpus))) 15 | -------------------------------------------------------------------------------- /linalg-benchmark/launch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Run linalg benchmark on AWS 3 | 4 | import argparse 5 | import ncluster 6 | ncluster.set_backend('aws') 7 | 8 | import threading 9 | 10 | parser = argparse.ArgumentParser(description='launch') 11 | parser.add_argument('--instances', default='p3.16xlarge, c5.18xlarge, c5.9xlarge, m5.24xlarge, i3.metal, g3.16xlarge') 12 | parser.add_argument('--image', default="Deep Learning AMI (Amazon Linux) Version 15.0") 13 | parser.add_argument('--N', default='') 14 | parser.add_argument('--short', action='store_true', help='short version of benchmark') 15 | args = parser.parse_args() 16 | 17 | results = {} 18 | def launch(instance): 19 | """Run benchmark on given instance type.""" 20 | task = ncluster.make_task('benchmark-'+instance, instance_type=instance, image_name=args.image) 21 | task.upload('benchmark.py') 22 | task.run('source activate tensorflow_p36') 23 | task.run('pip install torch') 24 | task.run('export CUDA_VISIBLE_DEVICES=0') 25 | if args.N: 26 | task.run(f'export LINALG_BENCHMARK_N={args.N}') 27 | if args.short: 28 | task.run('export LINALG_BENCHMARK_SHORT=1') 29 | 30 | stdout, stderr = task.run_with_output('python benchmark.py') 31 | print('='*80) 32 | print(instance) 33 | print(stdout) 34 | 35 | 36 | def main(): 37 | # launch one thread per instance type so the benchmarks run concurrently 38 | threads = [] 39 | for instance in args.instances.split(','): 40 | instance = instance.strip() 41 | thread = threading.Thread(target=launch, args=[instance]) 42 | 
thread.start() 43 | threads.append(thread) 44 | for thread in threads: 45 | thread.join() 46 | 47 | 48 | 49 | if __name__=='__main__': 50 | main() 51 | -------------------------------------------------------------------------------- /linalg-benchmark/launch_tensorflow_svd_crash.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Run crashing TensorFlow SVD example 3 | 4 | import ncluster 5 | ncluster.set_backend('aws') 6 | 7 | import argparse 8 | parser = argparse.ArgumentParser(description='launch') 9 | parser.add_argument('--instance', default='c5.9xlarge') 10 | parser.add_argument('--image', default="Deep Learning AMI (Amazon Linux) Version 13.0") 11 | args = parser.parse_args() 12 | 13 | def main(): 14 | task = ncluster.make_task(instance_type=args.instance, 15 | image_name=args.image) 16 | task.run('source activate tensorflow_p36') 17 | task.upload('tensorflow_svd_crash.py') 18 | stdout, stderr = task.run_with_output('python tensorflow_svd_crash.py') 19 | print(stdout, stderr) 20 | 21 | if __name__=='__main__': 22 | main() 23 | -------------------------------------------------------------------------------- /linalg-benchmark/requirements.txt: -------------------------------------------------------------------------------- 1 | # mkl is conda only 2 | numpy 3 | scipy 4 | tensorflow-gpu 5 | torch 6 | -------------------------------------------------------------------------------- /linearize/linearize_test.py: -------------------------------------------------------------------------------- 1 | import linearize 2 | 3 | import os, sys, time 4 | import inspect 5 | import numpy as np 6 | import tensorflow as tf 7 | import pdb 8 | import math 9 | import toposort 10 | 11 | from tensorflow.python.ops import gen_random_ops 12 | 13 | def create_session(): 14 | config = tf.ConfigProto(log_device_placement=False, graph_options=tf.GraphOptions(optimizer_options=tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0))) 15 | return tf.InteractiveSession(config=config) 16 | 17 | def setup_env(): 18 | """Sets up test environment.""" 19 | 20 | # download memory_util if needed 21 | memory_util_url = "https://raw.githubusercontent.com/yaroslavvb/memory_util/master/memory_util.py" 22 | if os.path.exists('memory_util.py'): 23 | size = len(open('memory_util.py').read()) 24 | else: 25 | size = 0 26 | 27 | if size != 13636: 28 | print("Size changed or 0, redownloading memory_util.py") 29 | import urllib.request 30 | response = urllib.request.urlopen(memory_util_url) 31 | open("memory_util.py", "wb").write(response.read()) 32 | 33 | 34 | def make_caterpillar_graph(length=5, node_mbs=1): 35 | """Length is number of concats.""" 36 | 37 | n = node_mbs * 250000 38 | n2 = int(math.sqrt(n)) 39 | dtype = tf.float32 40 | 41 | def make_leaf(i): 42 | name = "leaf"+str(i) 43 | val = gen_random_ops._random_uniform((n2, n2), dtype, name=name) 44 | return val 45 | 46 | def make_merge(a, b, i): 47 | name = "merge"+str(i) 48 | merge_node = tf.matmul(a, b, name=name) 49 | # nonlinear_node = tf.tanh(merge_node, name="tanh"+str(i)) 50 | #nonlinear_node = tf.identity(merge_node, name="tanh"+str(i)) 51 | return merge_node 52 | 53 | leaf0 = make_leaf(0) 54 | node0 = tf.identity(leaf0, name="merge0") 55 | node = node0 56 | nodes = [node] 57 | 58 | for i in range(1, length+1): 59 | leaf = make_leaf(i) 60 | node = make_merge(node, leaf, i) 61 | nodes.append(node) 62 | return nodes 63 | 64 | def test_print(): 65 | """Should print: 66 | leaf1 -> merge1 67 | leaf0 -> merge0 68 | 
merge1 -> merge2 69 | merge0 -> merge1 70 | leaf2 -> merge2 71 | leaf0/shape -> leaf0 72 | leaf1/shape -> leaf1 73 | leaf2/shape -> leaf2 74 | """ 75 | 76 | nodes = make_caterpillar_graph(length=2) 77 | linearize.print_tf_graph(linearize.get_graph()) 78 | 79 | 80 | def test_toposort(): 81 | nodes = make_caterpillar_graph(length=2) 82 | graph = linearize.get_graph() 83 | print(list(toposort.toposort(graph))) 84 | 85 | 86 | def test_linearize(): 87 | nodes = make_caterpillar_graph(5) 88 | linearize.linearize() 89 | 90 | sess = create_session() 91 | 92 | import memory_util 93 | memory_util.vlog(1) 94 | with memory_util.capture_stderr() as stderr: 95 | sess.run(nodes[-1].op) 96 | memory_util.print_memory_timeline(stderr, ignore_less_than_bytes=1000) 97 | 98 | if __name__=='__main__': 99 | setup_env() 100 | import memory_util 101 | memory_util.vlog(1) 102 | 103 | # sess = create_session() 104 | #nodes = make_caterpillar_graph() 105 | # test_print() 106 | # linearize.print_tf_graph(linearize.get_graph()) 107 | # print(tf.get_default_graph().as_graph_def()) 108 | # test_toposort() 109 | test_linearize() 110 | sys.exit() 111 | # with memory_util.capture_stderr() as stderr: 112 | # print(sess.run(nodes[-1][0,0])) 113 | print(len(stderr.getvalue())) 114 | memory_util.print_memory_timeline(stderr, ignore_less_than_bytes=1000) 115 | -------------------------------------------------------------------------------- /matmul_benchmark.py: -------------------------------------------------------------------------------- 1 | # On Titan X (Pascal) 2 | # 8192 x 8192 matmul took: 0.10 sec, 11304.59 G ops/sec 3 | # http://stackoverflow.com/questions/41804380/testing-gpu-with-tensorflow-matrix-multiplication 4 | 5 | import os 6 | import sys 7 | import tensorflow as tf 8 | import time 9 | 10 | n = 8192 11 | dtype = tf.float32 12 | with tf.device("/gpu:0"): 13 | matrix1 = tf.Variable(tf.ones((n, n), dtype=dtype)) 14 | matrix2 = tf.Variable(tf.ones((n, n), dtype=dtype)) 15 | product = tf.matmul(matrix1, matrix2) 16 | 17 | 18 | # avoid optimizing away redundant nodes 19 | config = tf.ConfigProto(graph_options=tf.GraphOptions(optimizer_options=tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0))) 20 | sess = tf.Session(config=config) 21 | 22 | sess.run(tf.global_variables_initializer()) 23 | iters = 10 24 | 25 | # pre-warming 26 | sess.run(product.op) 27 | 28 | start = time.time() 29 | for i in range(iters): 30 | sess.run(product.op) 31 | end = time.time() 32 | ops = n**3 + (n-1)*n**2 # n^2*(n-1) additions, n^3 multiplications 33 | elapsed = (end - start) 34 | rate = iters*ops/elapsed/10**9 35 | print('\n %d x %d matmul took: %.2f sec, %.2f G ops/sec' % (n, n, 36 | elapsed/iters, 37 | rate,)) 38 | -------------------------------------------------------------------------------- /matmul_times/1080-float16.csv: -------------------------------------------------------------------------------- 1 | 1,0.0000000034 2 | 1,0.0000000036 3 | 1,0.0000000039 4 | 1,0.0000000035 5 | 1,0.0000000043 6 | 1,0.0000000046 7 | 1,0.0000000050 8 | 1,0.0000000054 9 | 2,0.0000000584 10 | 2,0.0000000644 11 | 2,0.0000000674 12 | 2,0.0000000687 13 | 2,0.0000000752 14 | 3,0.0000002873 15 | 3,0.0000002522 16 | 3,0.0000002471 17 | 4,0.0000007086 18 | 4,0.0000007393 19 | 4,0.0000007178 20 | 5,0.0000013838 21 | 5,0.0000015370 22 | 6,0.0000026987 23 | 6,0.0000022227 24 | 7,0.0000036381 25 | 8,0.0000054416 26 | 8,0.0000053568 27 | 9,0.0000082184 28 | 10,0.0000110903 29 | 11,0.0000138653 30 | 12,0.0000189915 31 | 13,0.0000241109 32 | 14,0.0000293803 33 | 
16,0.0000457074 34 | 17,0.0000509526 35 | 19,0.0000716285 36 | 20,0.0000863740 37 | 22,0.0001204977 38 | 24,0.0001568045 39 | 26,0.0001843983 40 | 29,0.0002785588 41 | 32,0.0003894144 42 | 34,0.0004765664 43 | 38,0.0006730307 44 | 41,0.0008188075 45 | 45,0.0010837874 46 | 49,0.0014305645 47 | 53,0.0022087100 48 | 58,0.0026427016 49 | 64,0.0031762071 50 | 69,0.0033630118 51 | 76,0.0040770393 52 | 82,0.0044708883 53 | 90,0.0062248578 54 | 98,0.0092337182 55 | 107,0.0139221632 56 | 117,0.0178044032 57 | 128,0.0224213489 58 | 139,0.0299333631 59 | 152,0.0379965451 60 | 165,0.0496895280 61 | 181,0.0672505017 62 | 197,0.0831603483 63 | 215,0.1132126430 64 | 234,0.1287379171 65 | 256,0.1819730888 66 | 279,0.2485196740 67 | 304,0.3036680806 68 | 331,0.3879232771 69 | 362,0.4661769607 70 | 394,0.5644570767 71 | 430,0.7777496690 72 | 469,0.9742523210 73 | 512,1.2651589030 74 | 558,1.4602956948 75 | 608,1.8217860513 76 | 663,1.9921930853 77 | 724,2.2364226876 78 | 789,2.1433891063 79 | 861,2.7090783898 80 | 939,2.9531931908 81 | 1024,3.5877896025 82 | 1116,3.6779722946 83 | 1217,4.3078682686 84 | 1327,4.6144299678 85 | 1448,5.0839816350 86 | 1579,5.3015632066 87 | 1722,5.6268885114 88 | 1878,5.8209107716 89 | 2048,6.2596829924 90 | 2233,5.9811348181 91 | 2435,5.6518923737 92 | 2655,7.0360807776 93 | 2896,7.2112054057 94 | 3158,8.0330287265 95 | 3444,8.2290337210 96 | 3756,8.0293896669 97 | 4096,8.5125871285 98 | 4466,8.5370020832 99 | 4870,7.7743161522 100 | 5311,8.3979650843 101 | 5792,8.4688923679 102 | 6316,8.4772457343 103 | 6888,8.6937689402 104 | 7512,8.6359499182 105 | 8192,8.8169536875 106 | 8933,8.6668573644 107 | 9741,8.3756722311 108 | 10623,8.7015778540 109 | 11585,8.5536109363 110 | 12633,8.6088656910 111 | 13777,8.5709830209 112 | -------------------------------------------------------------------------------- /matmul_times/1080-float32.csv: -------------------------------------------------------------------------------- 1 | 1,0.0000000045 2 | 1,0.0000000035 3 | 1,0.0000000043 4 | 1,0.0000000034 5 | 1,0.0000000043 6 | 1,0.0000000049 7 | 1,0.0000000050 8 | 1,0.0000000056 9 | 2,0.0000000609 10 | 2,0.0000000714 11 | 2,0.0000000697 12 | 2,0.0000000659 13 | 2,0.0000000714 14 | 3,0.0000003000 15 | 3,0.0000003220 16 | 3,0.0000002666 17 | 4,0.0000006627 18 | 4,0.0000007880 19 | 4,0.0000008079 20 | 5,0.0000015459 21 | 5,0.0000012673 22 | 6,0.0000023556 23 | 6,0.0000025808 24 | 7,0.0000039604 25 | 8,0.0000058466 26 | 8,0.0000060740 27 | 9,0.0000092089 28 | 10,0.0000118796 29 | 11,0.0000163369 30 | 12,0.0000240841 31 | 13,0.0000299625 32 | 14,0.0000387380 33 | 16,0.0000520994 34 | 17,0.0000523390 35 | 19,0.0000795539 36 | 20,0.0000982683 37 | 22,0.0001214796 38 | 24,0.0001636920 39 | 26,0.0002092548 40 | 29,0.0002703317 41 | 32,0.0003729074 42 | 34,0.0004653672 43 | 38,0.0006168099 44 | 41,0.0008476423 45 | 45,0.0010729708 46 | 49,0.0014324164 47 | 53,0.0017386005 48 | 58,0.0028449365 49 | 64,0.0031177020 50 | 69,0.0038735519 51 | 76,0.0062974793 52 | 82,0.0064873469 53 | 90,0.0098341099 54 | 98,0.0116686863 55 | 107,0.0157851631 56 | 117,0.0202608931 57 | 128,0.0292374706 58 | 139,0.0367351869 59 | 152,0.0526993296 60 | 165,0.0644459866 61 | 181,0.0702350321 62 | 197,0.0879495323 63 | 215,0.1242069765 64 | 234,0.1359054587 65 | 256,0.1859243927 66 | 279,0.2025442452 67 | 304,0.2457022567 68 | 331,0.3658846812 69 | 362,0.4848338147 70 | 394,0.5143358856 71 | 430,0.7882646333 72 | 469,0.8928990397 73 | 512,1.1656835586 74 | 558,1.3195744775 75 | 608,1.6413668097 76 | 663,1.8156536501 77 | 
724,2.2290742492 78 | 789,2.6347425512 79 | 861,2.7744974657 80 | 939,3.3441175965 81 | 1024,3.7699864220 82 | 1116,4.2669058057 83 | 1217,4.2239304343 84 | 1327,4.2999952491 85 | 1448,5.0765567637 86 | 1579,4.8243019385 87 | 1722,5.5287772808 88 | 1878,5.8710045088 89 | 2048,6.2494979996 90 | 2233,6.0713500257 91 | 2435,5.6761027623 92 | 2655,6.5709195721 93 | 2896,7.4728911051 94 | 3158,7.9778022427 95 | 3444,8.1693656027 96 | 3756,7.9928773714 97 | 4096,8.5269678381 98 | 4466,8.3420676045 99 | 4870,7.4717510687 100 | 5311,8.1053401717 101 | 5792,8.1765165436 102 | 6316,8.2193813665 103 | 6888,8.2207526766 104 | 7512,8.2530108115 105 | 8192,8.6345897045 106 | 8933,8.3026164362 107 | 9741,7.9344509507 108 | -------------------------------------------------------------------------------- /matmul_times/g3-float16.csv: -------------------------------------------------------------------------------- 1 | 1,0.0000000054 2 | 1,0.0000000051 3 | 1,0.0000000065 4 | 1,0.0000000044 5 | 1,0.0000000048 6 | 1,0.0000000045 7 | 1,0.0000000042 8 | 1,0.0000000048 9 | 2,0.0000000598 10 | 2,0.0000000522 11 | 2,0.0000000602 12 | 2,0.0000000592 13 | 2,0.0000000569 14 | 3,0.0000002437 15 | 3,0.0000002260 16 | 3,0.0000002362 17 | 4,0.0000005616 18 | 4,0.0000005722 19 | 4,0.0000005844 20 | 5,0.0000012099 21 | 5,0.0000011887 22 | 6,0.0000018102 23 | 6,0.0000016259 24 | 7,0.0000028216 25 | 8,0.0000040517 26 | 8,0.0000043483 27 | 9,0.0000062360 28 | 10,0.0000095128 29 | 11,0.0000114769 30 | 12,0.0000153840 31 | 13,0.0000180906 32 | 14,0.0000243247 33 | 16,0.0000400715 34 | 17,0.0000377767 35 | 19,0.0000514943 36 | 20,0.0000683860 37 | 22,0.0000865112 38 | 24,0.0001051213 39 | 26,0.0001426751 40 | 29,0.0001913256 41 | 32,0.0002875058 42 | 34,0.0003051285 43 | 38,0.0004346936 44 | 41,0.0005442804 45 | 45,0.0007014911 46 | 49,0.0010251019 47 | 53,0.0012280063 48 | 58,0.0016934492 49 | 64,0.0020862005 50 | 69,0.0027503521 51 | 76,0.0031500031 52 | 82,0.0035116997 53 | 90,0.0046931443 54 | 98,0.0081784715 55 | 107,0.0083872862 56 | 117,0.0115189820 57 | 128,0.0190747343 58 | 139,0.0185188080 59 | 152,0.0249379696 60 | 165,0.0328195935 61 | 181,0.0387405693 62 | 197,0.0508121822 63 | 215,0.0742819229 64 | 234,0.0777197828 65 | 256,0.1125915606 66 | 279,0.1304913186 67 | 304,0.1583694332 68 | 331,0.1888342444 69 | 362,0.2387355070 70 | 394,0.2557542415 71 | 430,0.3165042259 72 | 469,0.4105311260 73 | 512,0.5243249028 74 | 558,0.5828114282 75 | 608,0.6143411728 76 | 663,0.6469807857 77 | 724,0.9105040736 78 | 789,0.9376811718 79 | 861,1.0619575012 80 | 939,1.2155576995 81 | 1024,1.4132319602 82 | 1116,1.4260724553 83 | 1217,1.4432986598 84 | 1327,1.4319563294 85 | 1448,1.4471410970 86 | 1579,1.5607382246 87 | 1722,1.6237686216 88 | 1878,1.9481190133 89 | 2048,2.3587374197 90 | 2233,2.4338772716 91 | 2435,2.5985527843 92 | 2655,3.1787637172 93 | 2896,3.4698303371 94 | 3158,3.8602679332 95 | 3444,4.0042434553 96 | 3756,3.8255689503 97 | 4096,4.0256793455 98 | 4466,4.0370775306 99 | 4870,3.8937734305 100 | 5311,3.9990913681 101 | 5792,3.9777389470 102 | 6316,4.0011357263 103 | 6888,4.0787678870 104 | 7512,4.0671405648 105 | 8192,4.1312549545 106 | 8933,4.0862087134 107 | 9741,4.0105048138 108 | 10623,4.1034223568 109 | 11585,4.0652498675 110 | 12633,4.0691972371 111 | -------------------------------------------------------------------------------- /matmul_times/g3-float32.csv: -------------------------------------------------------------------------------- 1 | 1,0.0000000073 2 | 1,0.0000000048 3 | 1,0.0000000054 4 | 
1,0.0000000052 5 | 1,0.0000000055 6 | 1,0.0000000052 7 | 1,0.0000000051 8 | 1,0.0000000048 9 | 2,0.0000000586 10 | 2,0.0000000613 11 | 2,0.0000000606 12 | 2,0.0000000604 13 | 2,0.0000000590 14 | 3,0.0000002199 15 | 3,0.0000002166 16 | 3,0.0000002331 17 | 4,0.0000005445 18 | 4,0.0000005953 19 | 4,0.0000005509 20 | 5,0.0000011035 21 | 5,0.0000010665 22 | 6,0.0000018245 23 | 6,0.0000020743 24 | 7,0.0000030141 25 | 8,0.0000048806 26 | 8,0.0000041701 27 | 9,0.0000067732 28 | 10,0.0000091322 29 | 11,0.0000126449 30 | 12,0.0000167117 31 | 13,0.0000194493 32 | 14,0.0000258426 33 | 16,0.0000528046 34 | 17,0.0000458252 35 | 19,0.0000690545 36 | 20,0.0000746672 37 | 22,0.0001094539 38 | 24,0.0001388742 39 | 26,0.0001667570 40 | 29,0.0002413090 41 | 32,0.0003086639 42 | 34,0.0003747490 43 | 38,0.0005190560 44 | 41,0.0005871084 45 | 45,0.0009346655 46 | 49,0.0010798767 47 | 53,0.0014085849 48 | 58,0.0022631995 49 | 64,0.0025011876 50 | 69,0.0032449305 51 | 76,0.0034306438 52 | 82,0.0041311552 53 | 90,0.0051279059 54 | 98,0.0074588679 55 | 107,0.0098178931 56 | 117,0.0124193482 57 | 128,0.0160830886 58 | 139,0.0211496424 59 | 152,0.0247961973 60 | 165,0.0338157899 61 | 181,0.0430062583 62 | 197,0.0542269884 63 | 215,0.0718169422 64 | 234,0.0988467147 65 | 256,0.1196925419 66 | 279,0.1455098916 67 | 304,0.1972818843 68 | 331,0.1772180971 69 | 362,0.2636590135 70 | 394,0.2754185967 71 | 430,0.3726414457 72 | 469,0.3835439101 73 | 512,0.6403236611 74 | 558,0.6271523764 75 | 608,0.6723487806 76 | 663,0.6716123262 77 | 724,0.8588339883 78 | 789,0.9945220783 79 | 861,1.1051620527 80 | 939,1.2015233788 81 | 1024,1.3994641071 82 | 1116,1.3509447411 83 | 1217,1.4638138307 84 | 1327,1.4528602068 85 | 1448,1.4676407356 86 | 1579,1.5869783426 87 | 1722,1.6656387593 88 | 1878,1.9194971156 89 | 2048,2.3645307724 90 | 2233,2.4380628450 91 | 2435,2.6216720574 92 | 2655,3.1950040147 93 | 2896,3.4910323127 94 | 3158,3.8479634522 95 | 3444,3.9335505780 96 | 3756,3.8047738191 97 | 4096,3.9912365038 98 | 4466,3.9395745634 99 | 4870,3.7676875717 100 | 5311,3.8672090765 101 | 5792,3.8548877523 102 | 6316,3.8473078459 103 | 6888,3.9200867856 104 | 7512,3.8976030316 105 | 8192,3.9648233326 106 | 8933,3.9030829731 107 | -------------------------------------------------------------------------------- /matmul_times/nvidia-p3-float16.csv: -------------------------------------------------------------------------------- 1 | 1,0.0000000052 2 | 1,0.0000000041 3 | 1,0.0000000046 4 | 1,0.0000000049 5 | 1,0.0000000042 6 | 1,0.0000000041 7 | 1,0.0000000046 8 | 1,0.0000000055 9 | 2,0.0000000554 10 | 2,0.0000000637 11 | 2,0.0000000545 12 | 2,0.0000000532 13 | 2,0.0000000507 14 | 3,0.0000002001 15 | 3,0.0000002108 16 | 3,0.0000002082 17 | 4,0.0000004921 18 | 4,0.0000004808 19 | 4,0.0000005280 20 | 5,0.0000010680 21 | 5,0.0000010738 22 | 6,0.0000017084 23 | 6,0.0000017649 24 | 7,0.0000027460 25 | 8,0.0000041874 26 | 8,0.0000042690 27 | 9,0.0000069834 28 | 10,0.0000086023 29 | 11,0.0000106921 30 | 12,0.0000133234 31 | 13,0.0000179914 32 | 14,0.0000246968 33 | 16,0.0000337639 34 | 17,0.0000395959 35 | 19,0.0000656157 36 | 20,0.0000653406 37 | 22,0.0000929182 38 | 24,0.0001354284 39 | 26,0.0001817980 40 | 29,0.0002643254 41 | 32,0.0002998829 42 | 34,0.0003157878 43 | 38,0.0004163927 44 | 41,0.0007430828 45 | 45,0.0007193491 46 | 49,0.0012699617 47 | 53,0.0014592136 48 | 58,0.0019013776 49 | 64,0.0028069479 50 | 69,0.0026474907 51 | 76,0.0037250988 52 | 82,0.0044074782 53 | 90,0.0076624283 54 | 98,0.0099851778 55 | 107,0.0103070034 56 | 
117,0.0138518935 57 | 128,0.0231608708 58 | 139,0.0216869965 59 | 152,0.0316261520 60 | 165,0.0363883348 61 | 181,0.0517542283 62 | 197,0.0623640412 63 | 215,0.0814378555 64 | 234,0.1148254428 65 | 256,0.1336743332 66 | 279,0.2045750757 67 | 304,0.2282127190 68 | 331,0.3014729031 69 | 362,0.3264810883 70 | 394,0.4374849399 71 | 430,0.6657965652 72 | 469,0.6715217545 73 | 512,1.1204232802 74 | 558,0.9380739764 75 | 608,1.8426026193 76 | 663,1.4936602735 77 | 724,2.0298141173 78 | 789,2.4388995886 79 | 861,2.8479440678 80 | 939,3.5406921338 81 | 1024,8.3014007112 82 | 1116,4.9172887222 83 | 1217,4.8020404036 84 | 1327,6.9848150236 85 | 1448,14.6789663308 86 | 1579,7.2316324211 87 | 1722,8.7278280040 88 | 1878,10.5813931247 89 | 2048,27.3441403434 90 | 2233,12.1147870981 91 | 2435,12.0358759694 92 | 2655,12.1954933158 93 | 2896,46.6051363046 94 | 3158,13.2800244576 95 | 3444,13.8267628176 96 | 3756,13.7995925164 97 | 4096,69.9589102386 98 | 4466,14.3317932735 99 | 4870,14.1166294853 100 | 5311,14.9101748975 101 | 5792,71.8273441365 102 | 6316,14.8697839164 103 | 6888,75.0021194804 104 | 7512,76.0847702634 105 | 8192,87.2323633474 106 | 8933,15.2443599021 107 | 9741,15.0255254543 108 | 10623,15.4011254535 109 | 11585,15.3233762417 110 | 12633,15.4141927233 111 | 13777,15.4542546400 112 | 15024,51.3086154117 113 | 16384,54.8225731495 114 | 17866,15.4363718738 115 | 19483,15.4177083800 116 | -------------------------------------------------------------------------------- /matmul_times/nvidia-p3-float32.csv: -------------------------------------------------------------------------------- 1 | 1,0.0000000064 2 | 1,0.0000000049 3 | 1,0.0000000051 4 | 1,0.0000000050 5 | 1,0.0000000052 6 | 1,0.0000000042 7 | 1,0.0000000046 8 | 1,0.0000000045 9 | 2,0.0000000510 10 | 2,0.0000000485 11 | 2,0.0000000479 12 | 2,0.0000000548 13 | 2,0.0000000471 14 | 3,0.0000002029 15 | 3,0.0000002306 16 | 3,0.0000001912 17 | 4,0.0000004770 18 | 4,0.0000004901 19 | 4,0.0000005156 20 | 5,0.0000009719 21 | 5,0.0000009661 22 | 6,0.0000017499 23 | 6,0.0000017244 24 | 7,0.0000031790 25 | 8,0.0000038894 26 | 8,0.0000038526 27 | 9,0.0000059240 28 | 10,0.0000106741 29 | 11,0.0000110602 30 | 12,0.0000139035 31 | 13,0.0000181564 32 | 14,0.0000236016 33 | 16,0.0000321635 34 | 17,0.0000413539 35 | 19,0.0000546057 36 | 20,0.0000616567 37 | 22,0.0001286502 38 | 24,0.0001390447 39 | 26,0.0001460748 40 | 29,0.0001951062 41 | 32,0.0002772766 42 | 34,0.0003086856 43 | 38,0.0004384349 44 | 41,0.0005533988 45 | 45,0.0007149317 46 | 49,0.0009276363 47 | 53,0.0013331269 48 | 58,0.0018695030 49 | 64,0.0022682527 50 | 69,0.0031001839 51 | 76,0.0035845403 52 | 82,0.0045948409 53 | 90,0.0074623255 54 | 98,0.0074607993 55 | 107,0.0097530265 56 | 117,0.0138274526 57 | 128,0.0186669288 58 | 139,0.0286824569 59 | 152,0.0268727477 60 | 165,0.0355386730 61 | 181,0.0473796592 62 | 197,0.0659762906 63 | 215,0.0864406612 64 | 234,0.0885813776 65 | 256,0.1523531400 66 | 279,0.1636012400 67 | 304,0.2584961682 68 | 331,0.2811011999 69 | 362,0.3519482168 70 | 394,0.4218602630 71 | 430,0.5220794553 72 | 469,0.5562290865 73 | 512,0.9788097312 74 | 558,0.8579025595 75 | 608,1.2380418081 76 | 663,1.5557990302 77 | 724,2.1672233818 78 | 789,2.5753768218 79 | 861,2.2584686231 80 | 939,3.0033014996 81 | 1024,3.8812342065 82 | 1116,4.1678976639 83 | 1217,4.4648922758 84 | 1327,5.3470564838 85 | 1448,6.3065089240 86 | 1579,6.8292859679 87 | 1722,7.5663780618 88 | 1878,8.2142042796 89 | 2048,8.9502136919 90 | 2233,10.0775378689 91 | 2435,9.4611509618 92 | 
2655,10.4873276759 93 | 2896,11.0233645833 94 | 3158,11.7347679907 95 | 3444,11.8439367523 96 | 3756,11.8604344909 97 | 4096,13.3133777524 98 | 4466,14.6612454411 99 | 4870,14.1017926913 100 | 5311,15.0166186211 101 | 5792,14.5565515674 102 | 6316,14.8739420297 103 | 6888,14.7140872238 104 | 7512,15.1461848986 105 | 8192,15.2734148530 106 | 8933,15.2323380953 107 | 9741,14.9980861870 108 | 10623,15.3962168307 109 | 11585,15.3180348029 110 | 12633,15.4027308920 111 | 13777,15.4373195183 112 | -------------------------------------------------------------------------------- /matmul_times/p2-float16.csv: -------------------------------------------------------------------------------- 1 | 1,0.0000000067 2 | 1,0.0000000072 3 | 1,0.0000000069 4 | 1,0.0000000069 5 | 1,0.0000000072 6 | 1,0.0000000072 7 | 1,0.0000000072 8 | 1,0.0000000069 9 | 2,0.0000000827 10 | 2,0.0000000814 11 | 2,0.0000000822 12 | 2,0.0000000868 13 | 2,0.0000000825 14 | 3,0.0000002982 15 | 3,0.0000002975 16 | 3,0.0000003082 17 | 4,0.0000007697 18 | 4,0.0000007342 19 | 4,0.0000007368 20 | 5,0.0000014666 21 | 5,0.0000015306 22 | 6,0.0000026006 23 | 6,0.0000027080 24 | 7,0.0000041761 25 | 8,0.0000063052 26 | 8,0.0000066057 27 | 9,0.0000089852 28 | 10,0.0000134527 29 | 11,0.0000164623 30 | 12,0.0000217909 31 | 13,0.0000285714 32 | 14,0.0000345181 33 | 16,0.0000524895 34 | 17,0.0000587160 35 | 19,0.0000842484 36 | 20,0.0000992910 37 | 22,0.0001303871 38 | 24,0.0001693906 39 | 26,0.0002135477 40 | 29,0.0002919552 41 | 32,0.0003960148 42 | 34,0.0004729200 43 | 38,0.0006482468 44 | 41,0.0008219702 45 | 45,0.0010681646 46 | 49,0.0013798360 47 | 53,0.0017229605 48 | 58,0.0022514637 49 | 64,0.0030171800 50 | 69,0.0037498750 51 | 76,0.0050644601 52 | 82,0.0060690625 53 | 90,0.0078262982 54 | 98,0.0099887460 55 | 107,0.0128728047 56 | 117,0.0163691077 57 | 128,0.0214060202 58 | 139,0.0268555938 59 | 152,0.0346719627 60 | 165,0.0429776979 61 | 181,0.0556467780 62 | 197,0.0694491538 63 | 215,0.0887498153 64 | 234,0.1106947986 65 | 256,0.1407321169 66 | 279,0.1764994256 67 | 304,0.2221952034 68 | 331,0.2682980740 69 | 362,0.3377952514 70 | 394,0.4052823128 71 | 430,0.5053214475 72 | 469,0.6015845170 73 | 512,0.7474339226 74 | 558,0.8127210424 75 | 608,0.9427789145 76 | 663,0.8139654736 77 | 724,0.9727766023 78 | 789,1.1184685269 79 | 861,1.2510366606 80 | 939,1.1916444116 81 | 1024,1.3306496804 82 | 1116,1.2824753318 83 | 1217,1.3783316294 84 | 1327,1.4173754597 85 | 1448,1.4487053520 86 | 1579,1.4887979924 87 | 1722,1.5300201882 88 | 1878,1.5429387891 89 | 2048,1.6414710251 90 | 2233,1.6066053016 91 | 2435,1.5576891764 92 | 2655,1.6047121464 93 | 2896,1.5857469506 94 | 3158,1.6055466919 95 | 3444,1.6241025204 96 | 3756,1.6241688208 97 | 4096,1.6478262379 98 | 4466,1.6268917333 99 | 4870,1.6020022079 100 | 5311,1.5891106124 101 | 5792,1.5323465126 102 | 6316,1.4899392695 103 | 6888,1.4858769476 104 | 7512,1.4761556421 105 | 8192,1.5168820074 106 | 8933,1.4726656810 107 | 9741,1.4652164240 108 | 10623,1.4744597313 109 | 11585,1.4591969299 110 | 12633,1.4463543210 111 | 13777,1.4481929981 112 | 15024,1.4724910144 113 | 16384,1.4893490224 114 | -------------------------------------------------------------------------------- /matmul_times/p2-float32.csv: -------------------------------------------------------------------------------- 1 | 1,0.0000000062 2 | 1,0.0000000068 3 | 1,0.0000000065 4 | 1,0.0000000064 5 | 1,0.0000000065 6 | 1,0.0000000064 7 | 1,0.0000000065 8 | 1,0.0000000065 9 | 2,0.0000000753 10 | 2,0.0000000749 11 | 2,0.0000000752 12 | 
2,0.0000000716 13 | 2,0.0000000746 14 | 3,0.0000002802 15 | 3,0.0000002800 16 | 3,0.0000002791 17 | 4,0.0000006948 18 | 4,0.0000006977 19 | 4,0.0000006917 20 | 5,0.0000013994 21 | 5,0.0000013979 22 | 6,0.0000024555 23 | 6,0.0000023712 24 | 7,0.0000037908 25 | 8,0.0000057672 26 | 8,0.0000059771 27 | 9,0.0000087234 28 | 10,0.0000117764 29 | 11,0.0000151661 30 | 12,0.0000202780 31 | 13,0.0000260046 32 | 14,0.0000323737 33 | 16,0.0000465056 34 | 17,0.0000536368 35 | 19,0.0000743973 36 | 20,0.0000867669 37 | 22,0.0001151412 38 | 24,0.0001512901 39 | 26,0.0001919364 40 | 29,0.0002657910 41 | 32,0.0003578811 42 | 34,0.0004240251 43 | 38,0.0005969277 44 | 41,0.0007429814 45 | 45,0.0009892200 46 | 49,0.0012556719 47 | 53,0.0016050206 48 | 58,0.0021076201 49 | 64,0.0028424239 50 | 69,0.0034972628 51 | 76,0.0046591594 52 | 82,0.0058126913 53 | 90,0.0079179318 54 | 98,0.0098811798 55 | 107,0.0127129541 56 | 117,0.0175084783 57 | 128,0.0224006348 58 | 139,0.0278861869 59 | 152,0.0368665318 60 | 165,0.0469158064 61 | 181,0.0609087915 62 | 197,0.0734990603 63 | 215,0.0931482866 64 | 234,0.1193392140 65 | 256,0.1542684148 66 | 279,0.1888398340 67 | 304,0.2380925611 68 | 331,0.2674025149 69 | 362,0.3328934757 70 | 394,0.3741520061 71 | 430,0.4676547051 72 | 469,0.5209803082 73 | 512,0.6465247933 74 | 558,0.7790807773 75 | 608,0.8420655397 76 | 663,1.0007404240 77 | 724,1.0769312388 78 | 789,1.2099749766 79 | 861,1.2660685126 80 | 939,1.2971475261 81 | 1024,1.5355688801 82 | 1116,1.5382477777 83 | 1217,1.5904825608 84 | 1327,1.6077547562 85 | 1448,1.7429501426 86 | 1579,1.8778230197 87 | 1722,1.8987380664 88 | 1878,1.8820152242 89 | 2048,1.9533219189 90 | 2233,2.3653780610 91 | 2435,2.3538986067 92 | 2655,3.0718712072 93 | 2896,3.2225252910 94 | 3158,3.1766847222 95 | 3444,3.2306806382 96 | 3756,3.2422227956 97 | 4096,3.2994986714 98 | 4466,3.3113207461 99 | 4870,3.1896135517 100 | 5311,3.2127539480 101 | 5792,3.3224748444 102 | 6316,3.0620875510 103 | 6888,3.1754668235 104 | 7512,3.1448125620 105 | 8192,3.1791185412 106 | 8933,2.9275908341 107 | 9741,2.8994841052 108 | 10623,2.8849807107 109 | 11585,2.8143021900 110 | -------------------------------------------------------------------------------- /mavelin/machine1.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tensorflow as tf 3 | 4 | cluster_spec = tf.train.ClusterSpec({ 5 | "a": { 0: "localhost:8000" }, 6 | "b": { 0: "localhost:8001" }, 7 | }) 8 | 9 | jobname = "a" 10 | taskid = 0 11 | server = tf.train.Server(cluster_spec, jobname, taskid) 12 | 13 | with tf.device("/job:a/task:0/cpu:0"): 14 | queue = tf.FIFOQueue( 15 | capacity=100, dtypes=[tf.int64], 16 | shapes=[[]], shared_name="a_queue", name="a_queue") 17 | 18 | if jobname == "a" and taskid == 0: 19 | enqueue_op = queue.enqueue(10) 20 | sess = tf.Session(server.target) 21 | while True: 22 | sess.run(enqueue_op) 23 | else: 24 | dequeue_op = queue.dequeue() 25 | sess = tf.Session(server.target) 26 | while True: 27 | print(sess.run(dequeue_op)) 28 | -------------------------------------------------------------------------------- /mavelin/machine3.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tensorflow as tf 3 | 4 | cluster_spec = tf.train.ClusterSpec({ 5 | "a": { 0: "localhost:8000" }, 6 | "b": { 1: "localhost:8001" }, 7 | }) 8 | 9 | DOFAIL=True 10 | 11 | jobname = "b" 12 | taskid = 1 13 | server = tf.train.Server(cluster_spec, jobname, taskid) 14 | 15 | with 
tf.device("/job:a/task:0/cpu:0"): 16 | queue = tf.FIFOQueue( 17 | capacity=100, dtypes=[tf.int64], 18 | shapes=[[]], shared_name="a_queue", name="a_queue") 19 | 20 | if jobname == "a" and taskid == 0: 21 | enqueue_op = queue.enqueue(10) 22 | sess = tf.Session(server.target) 23 | while True: 24 | sess.run(enqueue_op) 25 | else: 26 | with tf.device("/job:b/task:1"): 27 | out = queue.dequeue() 28 | queue_b = tf.FIFOQueue(capacity=100, dtypes=[tf.int64], shapes=[[]], name="b_queue") 29 | if DOFAIL: 30 | out = tf.cond(tf.equal(out, 10), lambda: queue_b.enqueue(out), lambda: tf.no_op()) 31 | g = tf.get_default_graph() 32 | from tensorflow.core.framework import attr_value_pb2 33 | op = g.get_operation_by_name('cond/b_queue_enqueue/Switch_1') 34 | op.node_def.attr["_class"].CopyFrom(attr_value_pb2.AttrValue( 35 | list=attr_value_pb2.AttrValue.ListValue(s=[]))) 36 | 37 | op = g.get_operation_by_name('cond/b_queue_enqueue/Switch') 38 | op.node_def.attr["_class"].CopyFrom(attr_value_pb2.AttrValue( 39 | list=attr_value_pb2.AttrValue.ListValue(s=[]))) 40 | 41 | with open('fail.pbtxt', 'w') as outf: 42 | outf.write(str(tf.get_default_graph().as_graph_def())) 43 | else: 44 | enq = queue_b.enqueue(out) 45 | no_op = tf.no_op() 46 | out = tf.cond(tf.equal(out, 10), lambda: enq, lambda: no_op) 47 | with open('pass.pbtxt', 'w') as outf: 48 | outf.write(str(tf.get_default_graph().as_graph_def())) 49 | 50 | 51 | sess = tf.Session(server.target) 52 | while True: 53 | print(sess.run(out)) 54 | -------------------------------------------------------------------------------- /notebook_util.py: -------------------------------------------------------------------------------- 1 | import subprocess, re, os, sys 2 | 3 | # GPU picking 4 | # http://stackoverflow.com/a/41638727/419116 5 | # Nvidia-smi GPU memory parsing. 
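# Intended usage, a sketch (module name assumed to match this file): do the GPU
# selection *before* importing TensorFlow, since CUDA_VISIBLE_DEVICES is only
# read at import time, e.g.
#   import notebook_util
#   notebook_util.setup_one_gpu()
#   import tensorflow as tf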
6 | # Tested on nvidia-smi 370.23 7 | 8 | def run_command(cmd): 9 | """Run command, return output as string.""" 10 | 11 | output = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True).communicate()[0] 12 | return output.decode("ascii") 13 | 14 | def list_available_gpus(): 15 | """Returns list of available GPU ids.""" 16 | 17 | output = run_command("nvidia-smi -L") 18 | # lines of the form GPU 0: TITAN X 19 | gpu_regex = re.compile(r"GPU (?P<gpu_id>\d+):") 20 | result = [] 21 | for line in output.strip().split("\n"): 22 | m = gpu_regex.match(line) 23 | assert m, "Couldn't parse "+line 24 | result.append(int(m.group("gpu_id"))) 25 | return result 26 | 27 | def gpu_memory_map(): 28 | """Returns map of GPU id to memory allocated on that GPU.""" 29 | 30 | output = run_command("nvidia-smi") 31 | gpu_output = output[output.find("GPU Memory"):] 32 | # lines of the form 33 | # | 0 8734 C python 11705MiB | 34 | memory_regex = re.compile(r"[|]\s+?(?P<gpu_id>\d+)\D+?(?P<pid>\d+).+[ ](?P<gpu_memory>\d+)MiB") 35 | rows = gpu_output.split("\n") 36 | result = {gpu_id: 0 for gpu_id in list_available_gpus()} 37 | for row in rows: 38 | m = memory_regex.search(row) 39 | if not m: 40 | continue 41 | gpu_id = int(m.group("gpu_id")) 42 | gpu_memory = int(m.group("gpu_memory")) 43 | result[gpu_id] += gpu_memory 44 | return result 45 | 46 | def pick_gpu_lowest_memory(): 47 | """Returns GPU with the least allocated memory""" 48 | 49 | memory_gpu_map = [(memory, gpu_id) for (gpu_id, memory) in gpu_memory_map().items()] 50 | best_memory, best_gpu = sorted(memory_gpu_map)[0] 51 | return best_gpu 52 | 53 | def setup_one_gpu(): 54 | assert 'tensorflow' not in sys.modules, "GPU setup must happen before importing TensorFlow" 55 | gpu_id = pick_gpu_lowest_memory() 56 | print("Picking GPU "+str(gpu_id)) 57 | os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" 58 | os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id) 59 | 60 | def setup_no_gpu(): 61 | if 'tensorflow' in sys.modules: 62 | print("Warning, GPU setup must happen before importing TensorFlow") 63 | os.environ["CUDA_VISIBLE_DEVICES"] = '' 64 | -------------------------------------------------------------------------------- /queue_mismatch.py: -------------------------------------------------------------------------------- 1 | # from http://stackoverflow.com/questions/41920371/tensorflow-multi-threaded-queuerunner?noredirect=1#comment71036438_41920371 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | import time 6 | 7 | batch_size = 4 8 | iters = 100 9 | a = tf.train.range_input_producer(10, shuffle=False, name="a", capacity=batch_size*iters).dequeue() 10 | b = tf.train.range_input_producer(10, shuffle=False, name="b", capacity=batch_size*iters).dequeue() 11 | c1, c2 = tf.train.batch([a,b], num_threads=batch_size, batch_size=batch_size, capacity=iters) 12 | config = tf.ConfigProto() 13 | config.operation_timeout_in_ms=5000 # terminate on long hangs 14 | #import pdb; pdb.set_trace() 15 | sess = tf.InteractiveSession(config=config) 16 | sess.run([tf.initialize_all_variables()]) 17 | 18 | coord = tf.train.Coordinator() 19 | threads = tf.train.start_queue_runners(sess, coord) 20 | 21 | 22 | time.sleep(1) 23 | coord.request_stop() 24 | coord.join(threads) 25 | #print("Queue runners: ") 26 | #for qr in tf.get_default_graph().get_collection(tf.GraphKeys.QUEUE_RUNNERS): 27 | # print("name: %s" %(qr.name)) 28 | # print("queue_name: %s" %(qr.queue.name)) 29 | # print("number of enqueue ops: %d"%(len(qr.enqueue_ops),)) 30 | 31 | results = [] 32 | for i in range(iters): 33 | d1,list1,list2 
= sess.run([tf.reduce_all(tf.equal(c1, c2)), c1, c2]) 34 | if not d1: 35 | print(list1) 36 | print(list2) 37 | results.append(d1) 38 | print("mismatches: %d/%d"%(iters-sum(results), iters)) 39 | 40 | 41 | coord.request_stop() 42 | -------------------------------------------------------------------------------- /queues_talk/slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaroslavvb/stuff/a8024ead315aa1b5d6976940b3a062178f0e499d/queues_talk/slides.pdf -------------------------------------------------------------------------------- /resnet_leak_report.py: -------------------------------------------------------------------------------- 1 | # test whether memory gets cleared on creating new sessions 2 | import sys, os, math, random 3 | os.environ['TF_CPP_MIN_LOG_LEVEL']='2' 4 | 5 | 6 | import tensorflow as tf 7 | import numpy as np 8 | 9 | if __name__=='__main__': 10 | for i in range(10): 11 | tf.reset_default_graph() 12 | sess = tf.InteractiveSession() 13 | 14 | size = 12000 15 | example_queue = tf.FIFOQueue(1, dtypes=[tf.float32], shapes=[[size]]) 16 | from tensorflow.python.ops import gen_random_ops 17 | image = tf.random_uniform([size]) 18 | example_enqueue_op = example_queue.enqueue([image]) 19 | sess.run(example_enqueue_op) 20 | sess.run(example_queue.close()) 21 | 22 | images = example_queue.dequeue_many(1) 23 | images = tf.concat([images]*size, axis=0) 24 | var = tf.Variable(tf.ones_like(images)) 25 | 26 | sess.run(tf.global_variables_initializer()) 27 | sess.run(tf.local_variables_initializer()) 28 | def relu(x): 29 | return tf.where(tf.less(x, 0.0), x, x, name='leaky_relu') 30 | cost = tf.reduce_sum(relu(images+var)) 31 | 32 | grads = tf.gradients(cost, var) 33 | _, memuse = sess.run([grads, tf.contrib.memory_stats.MaxBytesInUse()]) 34 | print("Run %d, GBs in use %.1f"%(i, memuse/10**9)) 35 | 36 | sess.close() 37 | del sess 38 | -------------------------------------------------------------------------------- /resnet_leak_report2.py: -------------------------------------------------------------------------------- 1 | # test whether memory gets cleared on creating new sessions 2 | import sys, os, math, random 3 | 4 | 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | if __name__=='__main__': 9 | try: 10 | 11 | from tensorflow.core.protobuf import rewriter_config_pb2 12 | rewrite_options = rewriter_config_pb2.RewriterConfig( 13 | disable_model_pruning=True, 14 | constant_folding=rewriter_config_pb2.RewriterConfig.OFF, 15 | memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL) 16 | optimizer_options = tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0) 17 | graph_options=tf.GraphOptions(optimizer_options=optimizer_options, 18 | rewrite_options=rewrite_options) 19 | config = tf.ConfigProto(graph_options=graph_options) 20 | sess = tf.Session(config=config) 21 | 22 | size = 12000 23 | num_runs = 10 24 | 25 | images = tf.random_uniform([size, size]) 26 | var = tf.Variable(tf.ones_like(images)) 27 | sess.run(var.initializer) 28 | for i in range(10): 29 | def relu(x): 30 | return tf.where(tf.less(x, 0.0), x, x, name='leaky_relu') 31 | cost = tf.reduce_sum(relu(images+var)) 32 | 33 | grads = tf.gradients(cost, var) 34 | _, memuse, memuse2 = sess.run([grads, tf.contrib.memory_stats.MaxBytesInUse(), tf.contrib.memory_stats.BytesInUse()]) 35 | print("Run %d, GBs in use %.2f, %.2f"%(i, memuse/10**9,memuse2/10**9)) 36 | except: 37 | pass 38 | finally: 39 | [memuse] = 
sess.run([tf.contrib.memory_stats.MaxBytesInUse()]) 40 | print("Memory GBs in use %.2f"%(memuse/10**9,)) 41 | 42 | 43 | # 576000000 44 | # 2017-09-21 14:53:23.483412: I tensorflow/core/framework/log_memory.cc:35] __LOG_MEMORY__ MemoryLogTensorOutput { step_id: 2 kernel_name: "gradients/leaky_relu_grad/zeros_like" tensor { dtype: DT_FLOAT shape { dim { size: 144000000 } } allocation_description { requested_bytes: 576000000 allocated_bytes: 576000000 allocator_name: "GPU_0_bfc" allocation_id: 6 ptr: 1109438113536 } } } 45 | -------------------------------------------------------------------------------- /resource_variable_test.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.python.ops import resource_variable_ops 3 | import portpicker 4 | 5 | port = portpicker.pick_unused_port() 6 | host = "127.0.0.1" 7 | job_name = "worker" 8 | cluster = {job_name: [host+":"+str(port)]} 9 | cluster_spec = tf.train.ClusterSpec(cluster).as_cluster_def() 10 | 11 | server = tf.train.Server(cluster_spec, job_name=job_name) 12 | sess = tf.Session(server.target) 13 | 14 | x = tf.get_variable("x", shape=[], dtype=tf.float32, 15 | initializer=tf.constant_initializer(2), use_resource=True) 16 | sess.run(tf.global_variables_initializer()) 17 | print(sess.run(x)) 18 | -------------------------------------------------------------------------------- /simple_train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import (DataLoader, SequentialSampler, 3 | TensorDataset) 4 | 5 | from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel 6 | 7 | 8 | def main(): 9 | # 3 examples 10 | train_dataset = 'small brown fox jumps over the lazy dog\n' \ 11 | 'small brown fox jumps over the lazy dog\n' \ 12 | 'small brown fox jumps over the lazy dog\n' 13 | tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt', 14 | special_tokens=[]) 15 | tokenized = [tokenizer.tokenize(t) for t in train_dataset.strip().split('\n')] 16 | 17 | encoded=[tokenizer.convert_tokens_to_ids(t) for t in tokenized] # 3x8 18 | dataset = TensorDataset(torch.tensor(encoded)) 19 | sampler = SequentialSampler(dataset) 20 | dataloader = DataLoader(dataset, sampler=sampler, batch_size=1) 21 | model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt') 22 | 23 | optimizer = torch.optim.SGD(model.parameters(), lr = 0.0001, momentum=0.9) 24 | 25 | batch = next(iter(dataloader)) 26 | batch=batch[0] # dataloader gives [batch] instead of batch...why? 27 | 28 | for i in range(20): 29 | loss = model(input_ids=batch, lm_labels=batch) 30 | print(loss.detach().numpy()) 31 | loss.backward() 32 | optimizer.step() 33 | optimizer.zero_grad() 34 | 35 | # Should produce this 36 | # 6.134997 37 | # 5.3747735 38 | # 5.164842 39 | # 4.8581843 40 | # 4.346232 41 | # 4.158811 42 | # 3.7503657 43 | # 3.29156 44 | # 2.8858535 45 | # 2.760832 46 | # 2.562772 47 | # 2.0645103 48 | # 1.6837901 49 | # 1.6822727 50 | # 1.5878279 51 | # 1.3873199 52 | # 1.158909 53 | # 0.92595655 54 | # 0.8487712 55 | # 0.82774204 56 | 57 | 58 | if __name__=='__main__': 59 | main() 60 | 61 | -------------------------------------------------------------------------------- /svd_benchmark.py: -------------------------------------------------------------------------------- 1 | # Fastest way to compute eigenvectors for 4k matrix? 
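# Usage, a sketch inferred from the sys.argv handling below:
#   python svd_benchmark.py gesdd   # one of: gesdd, gesvd, eigh, inv, inv2, linsolve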
2 | # 3 | # Xeon V3 benchmarks: 4 | # n=4096 eigs min: 27758.34, median: 28883.69 5 | # n=4096 gesdd min: 7241.70, median: 8477.95 6 | # n=4096 gesvd min=20487.48, median: 22057.64, 7 | # n=4096 inv min: 556.67, median: 579.25, 8 | # n=4096 linsolve: min: 534.40, median: 558.06, mean: 579.19 9 | # 10 | # Xeon V4: 11 | # n=4096 gesdd min: 5586.02, median: 6032.16 12 | # 13 | # 14 | # i7-5820K CPU @ 3.30GHz 15 | # n=4096 gesdd 7288.02, median: 7397.23, mean: 7478.78 16 | # n=4096 inv 520 msec 17 | # 18 | # after upgrading things 19 | # b'Intel(R) Math Kernel Library Version 2017.0.3 Product Build 20170413 for Intel(R) 64 architecture applications' 20 | # n=4096 inv 1427.54 21 | 22 | 23 | from scipy import linalg # for svd 24 | import numpy as np 25 | import time 26 | import sys 27 | 28 | methods = ['gesdd', 'gesvd', 'eigh', 'inv', 'inv2', 'linsolve'] 29 | 30 | if len(sys.argv)<2: 31 | method = methods[0] 32 | else: 33 | method = sys.argv[1] 34 | 35 | # from @eamartin 36 | def empty_aligned(n, align): 37 | """Get n bytes of memory wih alignment align.""" 38 | a = np.empty(n + (align - 1), dtype=np.float32) 39 | data_align = a.ctypes.data % align 40 | offset = 0 if data_align == 0 else (align - data_align) 41 | return a[offset : offset + n] 42 | 43 | assert method in methods 44 | 45 | n=4096 46 | #n=1024 47 | x_old = np.random.randn(n*n).reshape((n,n)).astype(dtype=np.float32) 48 | x = empty_aligned(n*n, 32).reshape((n, n)) 49 | x[:] = x_old 50 | x = x @ x.T 51 | 52 | x0 = np.random.randn(n).reshape((n,1)).astype(dtype=np.float32) 53 | 54 | start_time = time.time() 55 | times = [] 56 | 57 | print("n=%d %s "%(n, method)) 58 | for i in range(9): 59 | if method == 'gesdd': 60 | result = linalg.svd(x) 61 | elif method == 'gesvd': 62 | result = linalg.svd(x, lapack_driver='gesvd') 63 | elif method == 'eigh': 64 | result = linalg.eigh(x) 65 | elif method == 'inv': 66 | result = linalg.inv(x) 67 | elif method == 'inv2': 68 | result = linalg.inv(x, overwrite_a=True) 69 | elif method == 'linsolve': 70 | result = linalg.solve(x, x0) 71 | else: 72 | assert False 73 | new_time = time.time() 74 | elapsed_time = 1000*(new_time - start_time) 75 | print("%.2f msec" %(elapsed_time)) 76 | start_time = new_time 77 | times.append(elapsed_time) 78 | 79 | print("Times: min: %.2f, median: %.2f, mean: %.2f"%(np.min(times), np.median(times), np.mean(times))) 80 | 81 | 82 | # Other timings: svd 83 | # n=1000 Times: min: 126.04, median: 132.48 84 | # n=2000 Times: min: 573.03, median: 621.49 85 | # n=4096 Times: min: 5586.02, median: 6032.16 86 | # Other timings: inv 87 | # Times: min: 17.87, median: 23.41, mean: 27.90 88 | -------------------------------------------------------------------------------- /svd_noconverge.py: -------------------------------------------------------------------------------- 1 | import scipy 2 | import scipy.linalg as linalg 3 | import numpy as np 4 | import os 5 | import sys 6 | import ctypes 7 | import numpy as np 8 | 9 | def mklVersion(): 10 | ver = np.zeros(199, dtype=np.uint8) 11 | mkl = ctypes.cdll.LoadLibrary("libmkl_rt.so") 12 | mkl.MKL_Get_Version_String(ver.ctypes.data_as(ctypes.c_char_p), 198) 13 | return ver[ver != 0].tostring() 14 | 15 | # mklVersion() 16 | 17 | def download_if_needed(fn,target_length=0,bucket="yaroslavvb_stuff"): 18 | import urllib.request 19 | url="https://storage.googleapis.com/%s/%s"%(bucket, fn) 20 | response = urllib.request.urlopen(url) 21 | body = response.read() 22 | print("Read %d bytes from %s"%(len(body), url)) 23 | if target_length: 24 | assert 
len(body)==target_length 25 | 26 | open(fn, "wb").write(body) 27 | 28 | fn='badsvd0' 29 | download_if_needed(fn, 2458624) 30 | target0 = np.fromfile(fn, np.float32).reshape(784,784) 31 | 32 | success = True 33 | try: 34 | u0, s0, vt0 = linalg.svd(target0) 35 | except Exception as e: 36 | print("SVD failure") 37 | print(repr(e)) 38 | success = False 39 | else: 40 | print("SVD success") 41 | 42 | print("Scipy version: ", scipy.version.full_version) 43 | print("Numpy version: ", np.version.full_version) 44 | print("Python version: ", sys.version) 45 | print("Python binary: ", sys.executable) 46 | 47 | print("-"*80) 48 | print("MKL version:") 49 | print(mklVersion()) 50 | print("-"*80) 51 | print("Conda version:") 52 | os.system("conda list --explicit") 53 | print("-"*80) 54 | print("CPU version") 55 | for l in open("/proc/cpuinfo").read().split('\n'): 56 | if 'model name' in l: 57 | print(l) 58 | break 59 | 60 | if success: 61 | print("Success.") 62 | else: 63 | print("Failure.") 64 | 65 | # Upload notes: 66 | # export fullname=badsvd0 67 | # export bucket=yaroslavvb_stuff 68 | # gsutil cp $fullname gs://$bucket 69 | # gsutil acl set public-read gs://$bucket/$fullname 70 | # echo https://storage.googleapis.com/$bucket/$fullname 71 | -------------------------------------------------------------------------------- /tensorflow-memory-talk.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaroslavvb/stuff/a8024ead315aa1b5d6976940b3a062178f0e499d/tensorflow-memory-talk.pdf -------------------------------------------------------------------------------- /tiny_runs/qr_test.py: -------------------------------------------------------------------------------- 1 | # qr on 4096 x 4096 2 | # tf 6.89 3 | # np openblas 11.38 4 | # np mkl: 2.36 5 | 6 | import tensorflow as tf 7 | import time 8 | import numpy as np 9 | 10 | np.__config__.show() 11 | 12 | try: 13 | tf.reset_default_graph() 14 | n = 2048*2 15 | mat = tf.Variable(tf.random_uniform((n,n))) 16 | qr = tf.qr(mat) 17 | sess = tf.Session(config = tf.ConfigProto(graph_options=tf.GraphOptions(optimizer_options=tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)))) 18 | sess.run(tf.initialize_all_variables()) 19 | sess.run(qr[0].op) 20 | start_time = time.time() 21 | sess.run(qr[0].op) 22 | end_time = time.time() 23 | print("TF QR on %d by %d matrix in %.2f seconds"%(n, n, end_time-start_time)) 24 | except: 25 | print("No tf") 26 | 27 | a = np.random.randn(n, n) 28 | start_time = time.time() 29 | q, r = np.linalg.qr(a) 30 | end_time = time.time() 31 | print("numpy QR on %d by %d matrix in %.2f seconds"%(n, n, end_time-start_time)) 32 | --------------------------------------------------------------------------------