├── .gitignore ├── README.md ├── akaitsuki-slow ├── config.py ├── feed_dict.pbtxt ├── feed_dict.py └── main.py ├── autotune ├── README.md ├── autograd_lib.py ├── autograd_lib_test.py ├── autograd_test.py ├── ciresan_bench.py ├── curvature_test.py ├── eval_conv2d_approx.py ├── factored_test.py ├── globals.py ├── hessian_test.py ├── linalg_bench.py ├── linesearch_test_disabled.py ├── lyapunov_test.py ├── mnist_end2end_test.py ├── plotting_test.py ├── pytorch_benchmark.py ├── scipy_benchmark.py ├── svd_benchmark.py ├── test │ ├── bad_sigmas.pt │ ├── factored.pt │ └── gesvd_crash.txt ├── train_ciresan.py ├── train_ciresan_cca.py ├── train_ciresan_factored.py ├── train_ciresan_new.py ├── train_medium.py ├── train_small.py ├── train_small_xent.py ├── train_small_xent_factored.py ├── train_tiny.py ├── train_tiny_xent.py ├── util.py └── util_test.py ├── aws-recipes.ipynb ├── aws-scratch.ipynb ├── benchmark_huggingface_predict.py ├── bin └── tfversion ├── clipping-profile.ipynb ├── cluster ├── .gitignore ├── README.md ├── async_adder.py ├── aws.py ├── benchmark_grpc_recv.py ├── benchmarks │ ├── .DS_Store │ ├── .gitignore │ ├── LICENSE │ ├── README.md │ ├── bower_components │ │ ├── d3 │ │ │ ├── .bower.json │ │ │ ├── .gitattributes │ │ │ ├── CONTRIBUTING.md │ │ │ ├── LICENSE │ │ │ ├── README.md │ │ │ ├── bower.json │ │ │ ├── d3.js │ │ │ ├── d3.min.js │ │ │ └── package.js │ │ └── plottable │ │ │ ├── .bower.json │ │ │ ├── bower.json │ │ │ ├── plottable.css │ │ │ ├── plottable.d.ts │ │ │ ├── plottable.js │ │ │ └── plottable.min.js │ ├── dashboard_app │ │ ├── .DS_Store │ │ ├── app.yaml │ │ ├── main.py │ │ ├── main_test.py │ │ ├── requirements.txt │ │ ├── static │ │ │ ├── css │ │ │ │ └── style.css │ │ │ └── js │ │ │ │ └── benchmark_latency_chart.js │ │ └── templates │ │ │ ├── index.html │ │ │ └── test.html │ ├── index.html │ ├── js │ │ ├── csv_benchmark_chart.js │ │ └── latency_chart.js │ ├── scripts │ │ ├── Dockerfile.tf_cnn_benchmarks │ │ ├── benchmark_configs.yml │ │ ├── tf_cnn_benchmarks │ │ │ ├── .DS_Store │ │ │ ├── README.md │ │ │ ├── benchmark_cnn.py │ │ │ ├── benchmark_storage.py │ │ │ ├── cbuild_benchmark_storage.py │ │ │ ├── cnn_util.py │ │ │ ├── convnet_builder.py │ │ │ ├── datasets.py │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ ├── alexnet_model.py │ │ │ │ ├── densenet_model.py │ │ │ │ ├── googlenet_model.py │ │ │ │ ├── inception_model.py │ │ │ │ ├── lenet_model.py │ │ │ │ ├── model.py │ │ │ │ ├── model_config.py │ │ │ │ ├── overfeat_model.py │ │ │ │ ├── resnet_model.py │ │ │ │ ├── trivial_model.py │ │ │ │ └── vgg_model.py │ │ │ ├── preprocessing.py │ │ │ ├── tf_cnn_benchmarks.py │ │ │ └── variable_mgr.py │ │ └── util │ │ │ ├── __init__.py │ │ │ ├── benchmark_util.py │ │ │ ├── benchmark_util_test.py │ │ │ ├── convert_csv_to_json.py │ │ │ └── convert_csv_to_json_test.py │ ├── soumith_benchmarks.html │ └── tools │ │ ├── k8s_tensorflow_lib.py │ │ ├── k8s_tensorflow_test.py │ │ ├── kubectl_util.py │ │ ├── kubectl_util_test.py │ │ └── run_distributed_benchmarks.py ├── client_transfer_benchmark.py ├── cloud-formation-example │ ├── README.md │ ├── iam.yaml │ ├── tensorflow.yaml │ └── zone.sh ├── connect ├── connect.py ├── delete_placement_groups.py ├── fill_efs.py ├── imagenet64 │ ├── README.md │ ├── aws.py │ ├── launch.py │ ├── requirements.txt │ └── variable_mgr.py ├── instance_info.py ├── launch_async_adder.py ├── launch_micro.py ├── launch_ray.py ├── launch_simple_tf.py ├── local_distributed_benchmark.py ├── myutil.py ├── ray_add.py ├── simple_distributed.py ├── terminate_instances.py ├── 
test_aws.py ├── tf-tools │ ├── .gitignore │ ├── benchmark │ │ ├── multi_gpu │ │ │ ├── advanced_tweaks_compare.sh │ │ │ ├── image_classification_bench_tests.sh │ │ │ ├── stats_monitor.sh │ │ │ ├── test_runner.sh │ │ │ └── unit_test_stats_monitor.sh │ │ └── runner │ │ │ ├── cluster_aws.py │ │ │ ├── command_builder.py │ │ │ ├── configs │ │ │ └── aws │ │ │ │ ├── multi_server.yaml │ │ │ │ └── yaroslav.yaml │ │ │ ├── instance_info.py │ │ │ ├── launch_experiment.py │ │ │ ├── test_cluster_aws.py │ │ │ ├── test_command_builder.py │ │ │ └── util.py │ └── install │ │ ├── aws_amzlinux.md │ │ └── aws_ubuntu16_04.md ├── tmux.py └── upload_test.txt ├── conditional_backprop.py ├── configure_tf.sh ├── configure_tf_cpu.sh ├── danjar_peek.py ├── distributed ├── README.md ├── benchmark_grpc_recv.py └── client_transfer_benchmark.py ├── double_memory_bug.py ├── dynamic_stitch_gpu.py ├── dynamic_stitch_gpu_profile.pbtxt ├── eager_lbfgs ├── .ipynb_checkpoints │ └── performance-checkpoint.ipynb ├── common_gd.py ├── data │ ├── short_batch.csv │ ├── short_eager_batch.csv │ ├── short_eager_loss.csv │ ├── short_eager_time.csv │ ├── short_pytorch_loss.csv │ └── short_pytorch_time.csv ├── eager_lbfgs.py ├── performance.ipynb ├── pytorch_lbfgs.py ├── run_experiment.py ├── torch_lbfgs.lua └── util.py ├── enqueue_many_test.py ├── enqueue_many_test_singlerun.py ├── ericyue-slowreader ├── benchmark-batch-noqueuerunners-timeline.json ├── benchmark-batch-noqueuerunners.profile ├── benchmark-batch-noqueuerunners.py ├── benchmark-batch.py ├── benchmark-reader.py ├── benchmark-synthetic-batch.py ├── benchmark-synthetic.py ├── benchmark.py ├── data.zlib └── profile-batch.py ├── example.png ├── free_gpus.py ├── github_pyfunc_slowness.py ├── gpu-memory-transfer.ipynb ├── gpu_oom.py ├── gpu_svd_bench.py ├── graph_template.py ├── graphvis.png ├── imagenet15-scratch.ipynb ├── input_benchmarks ├── convert_to_records.py ├── fully_connected_feed.py ├── fully_connected_preloaded_var.py ├── fully_connected_reader.py ├── timeline.feed.json ├── timeline.reader.json └── timeline.var.json ├── inverse_segfault.py ├── jupyter-version.png ├── keras_autoencoder ├── keras_large.py ├── util.py └── weightnorm.py ├── khatri_rao_benchmark.py ├── lazy_dog.py ├── linalg-benchmark ├── README.md ├── bad_matrix.py ├── benchmark.py ├── environment.yml ├── get_cores_per_socket.py ├── launch.py ├── launch_tensorflow_svd_crash.py ├── requirements.txt ├── results.txt └── tensorflow_svd_crash.py ├── line_search_example ├── data │ └── step_lengths_ada.csv ├── line_search_example.py └── util.py ├── linearize ├── linearize.py ├── linearize_test.py └── memory_util.py ├── matmul_benchmark.py ├── matmul_benchmark_seq.py ├── matmul_times ├── 1080-float16.csv ├── 1080-float32.csv ├── g3-float16.csv ├── g3-float32.csv ├── nvidia-p3-float16.csv ├── nvidia-p3-float32.csv ├── p2-float16.csv └── p2-float32.csv ├── mavelin ├── machine1.py └── machine3.py ├── memory tracking.ipynb ├── memory-probe-examples.ipynb ├── memory-release-check.ipynb ├── natural_gradient_multilayer.py ├── node-merge.ipynb ├── notebook_util.py ├── numpy_initializers ├── kfac_cifar.py └── util.py ├── parallel_dequeue_test.py ├── phantomjs-tryout.ipynb ├── phantomjs-tryout.js ├── pytorch-hessian.ipynb ├── queue_mismatch.py ├── queues_talk ├── queues.ipynb └── slides.pdf ├── resnet_8_simple.pbtxt ├── resnet_leak_report.py ├── resnet_leak_report2.py ├── resource_variable_test.py ├── rotations_comparison.py ├── saving memory by using functions.ipynb ├── simple_rewiring.ipynb ├── simple_train.py ├── 
svd_benchmark.py ├── svd_noconverge.py ├── svd_test.py ├── tensorflow-memory-talk.pdf ├── tf_initializer_bug_report.py ├── tiny_runs ├── qr_test.py └── tiny_tf.py └── whitening_util.py

/.gitignore:
--------------------------------------------------------------------------------
1 | /__pycache__
2 | /.ipynb_checkpoints
3 | *#
4 | *~
5 | /linalg-benchmark/.idea/linalg-benchmark.iml
6 | /linalg-benchmark/.idea/misc.xml
7 | /linalg-benchmark/.idea/modules.xml
8 | /linalg-benchmark/.idea/vcs.xml
9 | /linalg-benchmark/.idea/workspace.xml
10 | /linalg-benchmark/.idea
11 | .DS_Store
12 | __pycache__
13 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # stuff
2 | 
--------------------------------------------------------------------------------
/akaitsuki-slow/config.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | 
3 | 
4 | def str2bool(v):
5 |     return v.lower() in ('y', 'yes', 't', 'true', '1')
6 | 
7 | 
8 | def get_args():
9 |     parser = argparse.ArgumentParser()
10 |     parser.register('type', 'bool', str2bool)
11 | 
12 |     parser.add_argument('--random_seed',
13 |                         type=int,
14 |                         default=1013,
15 |                         help='Random seed')
16 | 
17 |     parser.add_argument('--vocab_size',
18 |                         type=int,
19 |                         default=10000,
20 |                         help='Vocabulary size')
21 | 
22 |     parser.add_argument('--embed_size',
23 |                         type=int,
24 |                         default=128,
25 |                         help='Default embedding size if embedding_file is not given')
26 | 
27 |     parser.add_argument('--hidden_size',
28 |                         type=int,
29 |                         default=128,
30 |                         help='Hidden size of RNN units')
31 | 
32 |     parser.add_argument('--num_labels',
33 |                         type=int,
34 |                         default=96,
35 |                         help='Number of labels')
36 | 
37 |     parser.add_argument('--bidir',
38 |                         type='bool',
39 |                         default=True,
40 |                         help='bidir: whether to use a bidirectional RNN')
41 | 
42 |     parser.add_argument('--num_layers',
43 |                         type=int,
44 |                         default=1,
45 |                         help='Number of RNN layers')
46 | 
47 |     parser.add_argument('--rnn_type',
48 |                         type=str,
49 |                         default='gru',
50 |                         help='RNN type: lstm or gru (default)')
51 | 
52 |     parser.add_argument('--batch_size',
53 |                         type=int,
54 |                         default=32,
55 |                         help='Batch size')
56 | 
57 |     parser.add_argument('--dropout_rate',
58 |                         type=float,
59 |                         default=0.2,
60 |                         help='Dropout rate')
61 | 
62 |     parser.add_argument('--optimizer',
63 |                         type=str,
64 |                         default='sgd',
65 |                         help='Optimizer: sgd (default) or adam or rmsprop')
66 | 
67 |     parser.add_argument('--learning_rate', '-lr',
68 |                         type=float,
69 |                         default=0.1,
70 |                         help='Learning rate for SGD')
71 | 
72 |     return parser.parse_args()
73 | 
74 | 
--------------------------------------------------------------------------------
/akaitsuki-slow/feed_dict.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | from tensorflow.python.client import timeline
4 | 
5 | 
6 | sess = tf.Session()
7 | a = tf.placeholder(tf.float32)
8 | b = a*2
9 | c0 = sess.run([b], feed_dict={a: 2.})  # warm-up run
10 | 
11 | run_metadata = tf.RunMetadata()
12 | run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
13 | run_options.output_partition_graphs = True
14 | 
15 | c0 = sess.run([b], feed_dict={a: 2.}, options=run_options,
16 |               run_metadata=run_metadata)
17 | with open("feed_dict.pbtxt", "w") as f:
18 |     f.write(str(run_metadata))
--------------------------------------------------------------------------------
/autotune/README.md:
--------------------------------------------------------------------------------
1 | To run tests in this directory:
2 | 
3 | ```
4 | pytest
5 | ```
6 | 
7 | If there's a slow test, you can run the test file directly to see timings of individual tests, e.g.
8 | 
9 | ```
10 | python linesearch_test.py
11 | ```
12 | 
--------------------------------------------------------------------------------
/autotune/globals.py:
--------------------------------------------------------------------------------
1 | # Module to hold global variables for curvature computation functions.
2 | # This is needed since functionality may be split over several modules.
3 | 
4 | from typing import Optional
5 | 
6 | import torch
7 | from torch.utils.tensorboard import SummaryWriter
8 | 
9 | event_writer: Optional[SummaryWriter] = None
10 | project_name: Optional[str] = 'train_ciresan'  # project name to use for wandb logging
11 | logdir_base: str = '/ncluster/runs'
12 | run_name: Optional[str] = None  # run name to use, corresponds to logging dir and wandb run name
13 | logdir: Optional[str] = None  # logdir
14 | token_count: int = 0  # TODO(y): rename to global-step. Meaning is context-specific, in case of sequences it's number of tokens
15 | 
16 | args = None  # global arg values
17 | debug_dump_stats: bool = False  # print activations/backprops to console
18 | debug_linalg_crashes: bool = False  # save matrices that cause linalg routines to crash
19 | 
20 | 
21 | # debug_hard_crashes_on_nans: bool = True  # crash if encountering NaN
22 | 
23 | hacks_disable_hess = False
24 | 
25 | 
26 | if torch.cuda.is_available():
27 |     device = torch.device('cuda')
28 |     print("Using GPU")
29 | else:
30 |     device = torch.device('cpu')
31 | 
32 | 
33 | def reset_global_step():
34 |     global token_count
35 |     token_count = 0
36 | 
37 | 
38 | def increment_global_step(incr: int):
39 |     global token_count
40 |     token_count += incr
41 | 
42 | 
43 | def get_global_step() -> int:
44 |     return token_count
45 | 
46 | 
47 | 
--------------------------------------------------------------------------------
/autotune/linalg_bench.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import time
4 | from typing import Optional, Tuple, Callable
5 | 
6 | # import torch
7 | import scipy.linalg
8 | import torch
9 | from torchcurv.optim import SecondOrderOptimizer
10 | 
11 | 
12 | import torch.nn as nn
13 | 
14 | import util as u
15 | 
16 | import numpy as np
17 | 
18 | """
19 | MKL version unknown
20 | PyTorch version 1.2.0
21 | Scipy version: 1.2.1
22 | Numpy version: 1.16.4
23 | 1024-by-1024 matrix
24 | 7079.93 linalg.solve_lyapunov
25 | 280.11 linalg.pinvh
26 | 1186.08 linalg.pinv
27 | 49.18 linalg.inv
28 | 118.23 qr
29 | 413.42 svd
30 | """
31 | 
32 | class Net(nn.Module):
33 |     def __init__(self, d):
34 |         super().__init__()
35 |         self.w = nn.Linear(d, 1, bias=False)
36 | 
37 |     def forward(self, x: torch.Tensor):
38 |         result = self.w(x)
39 |         return result
40 | 
41 | 
42 | class timeit:
43 |     """Context manager that measures time spent in the block in milliseconds
44 |     and prints it together with the given tag.
45 |     """
46 | 
47 |     def __init__(self, tag=""):
48 |         self.tag = tag
49 | 
50 |     def __enter__(self):
51 |         self.start = time.perf_counter()
52 |         return self
53 | 
54 |     def __exit__(self, *args):
55 |         self.end = time.perf_counter()
56 |         interval_ms = 1000 * (self.end - self.start)
57 |         print(f"{interval_ms:8.2f} {self.tag}")
58 | 
59 | 
60 | def get_mkl_version():
61 |     import ctypes
62 |     import numpy as np
63 | 
64 |     # this recipe only works on Linux
65 |     try:
66 |         ver = np.zeros(199, dtype=np.uint8)
67 |         mkl = ctypes.cdll.LoadLibrary("libmkl_rt.so")
68 |         mkl.MKL_Get_Version_String(ver.ctypes.data_as(ctypes.c_char_p), 198)
69 |         return ver[ver != 0].tostring()
70 |     except:
71 |         return 'unknown'
72 | 
73 | 
74 | def print_cpu_info():
75 |     ver = 'unknown'
76 |     try:
77 |         for l in open("/proc/cpuinfo").read().split('\n'):
78 |             if 'model name' in l:
79 |                 ver = l
80 |                 break
81 |     except:
82 |         pass
83 |     print("CPU version:", ver)
84 | 
85 | def linalg_bench():
86 |     if np.__config__.get_info("lapack_mkl_info"):
87 |         print("MKL version", get_mkl_version())
88 |     else:
89 |         print("not using MKL")
90 | 
91 |     print("PyTorch version", torch.version.__version__)
92 | 
93 |     print("Scipy version: ", scipy.version.full_version)
94 |     print("Numpy version: ", np.version.full_version)
95 | 
96 |     for d in [1024]:
97 |         print(f"{d}-by-{d} matrix")
98 |         n = 10000
99 |         assert n > 2*d  # to prevent singularity
100 |         X = np.random.random((d, n))
101 |         Y = np.random.random((d, n))
102 |         H = X @ X.T
103 |         S = Y @ Y.T
104 | 
105 |         with timeit(f"linalg.solve_lyapunov"):
106 |             result = scipy.linalg.solve_lyapunov(H, S)
107 |             #print(result[0,0])
108 | 
109 |         with timeit(f"linalg.pinvh"):
110 |             result = scipy.linalg.pinvh(H)
111 |             #print(result[0, 0])
112 | 
113 |         with timeit(f"linalg.pinv"):
114 |             result = scipy.linalg.pinv(H)
115 |             #print(result[0, 0])
116 | 
117 | 
118 |         with timeit(f"linalg.inv"):
119 |             result = scipy.linalg.inv(H)
120 |             #print(result[0, 0])
121 | 
122 |         with timeit(f"qr"):
123 |             result = scipy.linalg.qr(H)
124 |             #print(result[0, 0])
125 | 
126 |         with timeit(f"qr-pivoting"):
127 |             result = scipy.linalg.qr(H, pivoting=True)
128 |             #print(result[0, 0])
129 | 
130 |         with timeit(f"svd"):
131 |             result = scipy.linalg.svd(H)
132 |             #print(result[0, 0])
133 | 
134 | 
135 | 
136 | if __name__ == '__main__':
137 |     linalg_bench()
138 | 
--------------------------------------------------------------------------------
/autotune/pytorch_benchmark.py:
--------------------------------------------------------------------------------
1 | """
2 | (pytorch_p36) [ec2-user@ip-172-31-6-232 cifar]$ python pytorch_benchmark.py
3 | MKL version b'Intel(R) Math Kernel Library Version 2019.0.4 Product Build 20190411 for Intel(R) 64 architecture applications'
4 | PyTorch version 1.1.0
5 | Scipy version: 1.3.0
6 | Numpy version: 1.16.4
7 | Benchmarking 1024-by-1024 matrix on cuda:0
8 | 882.84 svd
9 | 17.22 inv
10 | 227.04 pinv
11 | 452.77 eig
12 | 227.18 svd
13 | 
14 | 
15 | Laptop
16 | 
17 | MKL version unknown
18 | PyTorch version 1.2.0
19 | Scipy version: 1.2.1
20 | Numpy version: 1.16.4
21 | CPU version: unknown
22 | CPU logical cores: 8
23 | CPU physical cores: 4
24 | CPU physical sockets: 0
25 | Benchmarking 1024-by-1024 matrix on cpu
26 | 170.24 svd
27 | 22.41 inv
28 | 206.70 pinv
29 | 247.92 eig
30 | 180.16 pinverse
31 | 20.08 solve
32 | 124.89 svd
33 | 14.57 inv
34 | 197.24 pinv
35 | 221.06 eig
36 | 213.46 pinverse
37 | 21.75 solve
38 | 
39 | """
40 | import os
41 | import sys
42 | import time
43 | 
44 | import numpy as np
45 | 
46 | import util as u
47 | 
48 | import torch
49 | 
50 | # from @eamartin
51 | def empty_aligned(n, align):
52 |     """Get n bytes of memory with alignment align."""
53 |     a = np.empty(n + (align - 1), dtype=np.float32)
54 |     data_align = a.ctypes.data % align
55 |     offset = 0 if data_align == 0 else (align - data_align)
56 |     return a[offset: offset + n]
57 | 
58 | 
59 | def benchmark(method):
60 |     # writing one element of the result to /dev/null forces it to be materialized without cluttering stdout
61 |     start_time = time.time()
62 |     times = []
63 | 
64 |     for i in range(1):
65 |         if method == 'svd':
66 |             _result = torch.svd(H)
67 |             open('/dev/null', 'w').write(str(_result[0]))
68 |         elif method == 'inv':
69 |             _result = torch.inverse(H)
70 |             open('/dev/null', 'w').write(str(_result[0]))
71 |         elif method == 'pinv':
72 |             _result = u.pinv(H)
73 |             open('/dev/null', 'w').write(str(_result[0]))
74 |         elif method == 'pinverse':
75 |             _result = torch.pinverse(H)
76 |             open('/dev/null', 'w').write(str(_result[0]))
77 |         elif method == 'eig':
78 |             _result = torch.symeig(H, eigenvectors=True)
79 |             open('/dev/null', 'w').write(str(_result[0]))
80 |         elif method == 'solve':
81 |             _result = torch.solve(S, H)
82 |             open('/dev/null', 'w').write(str(_result[0]))
83 |         else:
84 |             assert False
85 |         new_time = time.time()
86 |         elapsed_time = 1000 * (new_time - start_time)
87 |         print(f"{elapsed_time:8.2f} {method}")
88 |         start_time = new_time
89 |         times.append(elapsed_time)
90 | 
91 | 
92 | if __name__ == '__main__':
93 |     methods = ['svd', 'inv', 'pinv', 'eig', 'pinverse', 'solve']*2
94 | 
95 |     u.print_version_info()
96 |     d = 1024
97 | 
98 |     x0 = torch.rand(d).reshape((d, 1)).float()
99 | 
100 |     X = torch.rand((d, 10000))
101 |     Y = torch.rand((d, 10000))
102 |     H = X @ X.t()
103 |     S = Y @ Y.t()
104 | 
105 |     if torch.cuda.is_available():
106 |         [x0, X, Y, H, S] = u.move_to_gpu([x0, X, Y, H, S])
107 | 
108 |     print(f"Benchmarking {d}-by-{d} matrix on {x0.device}")
109 |     for method in methods:
110 |         benchmark(method)
111 | 
112 | # Other timings: svd
113 | # n=1000 Times: min: 126.04, median: 132.48
114 | # n=2000 Times: min: 573.03, median: 621.49
115 | # n=4096 Times: min: 5586.02, median: 6032.16
116 | # Other timings: inv
117 | # Times: min: 17.87, median: 23.41, mean: 27.90
--------------------------------------------------------------------------------
/autotune/svd_benchmark.py:
--------------------------------------------------------------------------------
1 | # Fastest way to compute eigenvectors for 4k matrix?
2 | #
3 | # Inverse on i3.metal
4 | # n=4096: 368 ms ± 1.51 ms per loop
5 | #
6 | # Xeon V3 benchmarks:
7 | # n=4096 eigs min: 27758.34, median: 28883.69
8 | # n=4096 gesdd min: 7241.70, median: 8477.95
9 | # n=4096 gesvd min: 20487.48, median: 22057.64
10 | # n=4096 inv min: 556.67, median: 579.25
11 | # n=4096 linsolve: min: 534.40, median: 558.06, mean: 579.19
12 | #
13 | # Xeon V4:
14 | # n=4096 gesdd min: 5586.02, median: 6032.16
15 | #
16 | #
17 | # i7-5820K CPU @ 3.30GHz
18 | # n=4096 gesdd 7288.02, median: 7397.23, mean: 7478.78
19 | # n=4096 inv 520 msec
20 | #
21 | # after upgrading things
22 | # b'Intel(R) Math Kernel Library Version 2017.0.3 Product Build 20170413 for Intel(R) 64 architecture applications'
23 | # n=4096 inv 1427.54
24 | 
25 | 
26 | from scipy import linalg  # for svd
27 | import numpy as np
28 | import time
29 | import sys
30 | 
31 | 
32 | # from @eamartin
33 | def empty_aligned(n, align):
34 |     """Get n bytes of memory with alignment align."""
35 |     a = np.empty(n + (align - 1), dtype=np.float32)
36 |     data_align = a.ctypes.data % align
37 |     offset = 0 if data_align == 0 else (align - data_align)
38 |     return a[offset : offset + n]
39 | 
40 | 
41 | def benchmark(method):
42 |     n = 1024
43 |     x_old = np.random.randn(n*n).reshape((n, n)).astype(dtype=np.float32)
44 |     x = empty_aligned(n*n, 32).reshape((n, n))
45 |     x[:] = x_old
46 |     x = x @ x.T
47 | 
48 |     x0 = np.random.randn(n).reshape((n, 1)).astype(dtype=np.float32)
49 | 
50 |     start_time = time.time()
51 |     times = []
52 | 
53 |     for i in range(1):
54 |         if method == 'gesdd':
55 |             result = linalg.svd(x)
56 |         elif method == 'gesvd':
57 |             result = linalg.svd(x, lapack_driver='gesvd')
58 |         elif method == 'eigh':
59 |             result = linalg.eigh(x)
60 |         elif method == 'inv':
61 |             result = linalg.inv(x)
62 |         elif method == 'inv2':
63 |             result = linalg.inv(x, overwrite_a=True)
64 |         elif method == 'linsolve':
65 |             result = linalg.solve(x, x0)
66 |         else:
67 |             assert False
68 |         new_time = time.time()
69 |         elapsed_time = 1000*(new_time - start_time)
70 |         print(f"{elapsed_time:8.2f} {method}")
71 |         start_time = new_time
72 |         times.append(elapsed_time)
73 | 
74 | 
75 | if __name__ == '__main__':
76 |     methods = ['gesdd', 'gesvd', 'eigh', 'inv', 'inv2', 'linsolve']
77 | 
78 |     for method in methods:
79 |         benchmark(method)
80 | 
81 | 
82 | 
83 | 
84 | # Other timings: svd
85 | # n=1000 Times: min: 126.04, median: 132.48
86 | # n=2000 Times: min: 573.03, median: 621.49
87 | # n=4096 Times: min: 5586.02, median: 6032.16
88 | # Other timings: inv
89 | # Times: min: 17.87, median: 23.41, mean: 27.90
90 | 
--------------------------------------------------------------------------------
/autotune/test/bad_sigmas.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yaroslavvb/stuff/a8024ead315aa1b5d6976940b3a062178f0e499d/autotune/test/bad_sigmas.pt
--------------------------------------------------------------------------------
/autotune/test/factored.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yaroslavvb/stuff/a8024ead315aa1b5d6976940b3a062178f0e499d/autotune/test/factored.pt
--------------------------------------------------------------------------------
/bin/tfversion:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os
3 | os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
4 | import tensorflow as tf
5 | version=tf.__version__
6 | print("version: %s"%(version,))
7 | commit
= tf.__git_version__ 8 | print("__git_version__: %s"%(commit,)) 9 | # commit looks like this 10 | # 'v1.0.0-65-g4763edf-dirty' 11 | commit = commit.replace("'","") 12 | if commit.endswith('-dirty'): 13 | dirty = True 14 | commit = commit[:-len('-dirty')] 15 | commit=commit.rsplit('-g', 1)[1] 16 | url = 'https://github.com/tensorflow/tensorflow/commit/'+commit 17 | print("Commit %s" %(url,)) -------------------------------------------------------------------------------- /cluster/.gitignore: -------------------------------------------------------------------------------- 1 | /.DS_Store 2 | -------------------------------------------------------------------------------- /cluster/README.md: -------------------------------------------------------------------------------- 1 | # cluster 2 | train on AWS 3 | -------------------------------------------------------------------------------- /cluster/benchmarks/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaroslavvb/stuff/a8024ead315aa1b5d6976940b3a062178f0e499d/cluster/benchmarks/.DS_Store -------------------------------------------------------------------------------- /cluster/benchmarks/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /cluster/benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Instructions for adding distributed benchmarks to continuous run: 2 | 3 | 1. You can add your benchmark file under 4 | [tensorflow/benchmarks/scripts](https://github.com/tensorflow/benchmarks/tree/master/scripts) directory. The benchmark should accept `task_index`, `job_name`, `ps_hosts` and `worker_hosts` flags. You can copy-paste the following flag definitions: 5 | 6 | ```python 7 | tf.app.flags.DEFINE_integer("task_index", None, "Task index, should be >= 0.") 8 | tf.app.flags.DEFINE_string("job_name", None, "job name: worker or ps") 9 | tf.app.flags.DEFINE_string("ps_hosts", None, "Comma-separated list of hostname:port pairs") 10 | tf.app.flags.DEFINE_string("worker_hosts", None, "Comma-separated list of hostname:port pairs") 11 | ``` 12 | 2. Report benchmark values by calling `store_data_in_json` from your benchmark 13 | code. This function is defined in 14 | [benchmark\_util.py](https://github.com/tensorflow/benchmarks/blob/master/scripts/util/benchmark_util.py). 15 | 3. Create a Dockerfile that sets up dependencies and runs your benchmark. For 16 | example, see [Dockerfile.tf\_cnn\_benchmarks](https://github.com/tensorflow/benchmarks/blob/master/scripts/Dockerfile.tf_cnn_benchmarks). 17 | 4. Add the benchmark to 18 | [benchmark\_configs.yml](https://github.com/tensorflow/benchmarks/blob/master/scripts/benchmark_configs.yml) 19 | * Set `benchmark_name` to a descriptive name for your benchmark and make sure 20 | it is unique. 21 | * Set `worker_count` and `ps_count`. 22 | * Set `docker_file` to the Dockerfile path starting with `benchmarks/` 23 | directory. 24 | * Optionally, you can pass flags to your benchmark by adding `args` list. 25 | 5. Send PR with the changes to annarev. 26 | 27 | Currently running benchmarks: 28 | https://benchmarks-dot-tensorflow-testing.appspot.com/ 29 | 30 | For any questions, please contact annarev@google.com. 
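Putting steps 1 and 2 together, a minimal benchmark skeleton might look like the sketch below. This is illustrative only: it uses the standard TF 1.x `tf.train.ClusterSpec`/`tf.train.Server` API, and the reporting call is elided because the exact `store_data_in_json` signature is defined in benchmark_util.py.

```python
import time

import tensorflow as tf

from util import benchmark_util  # scripts/util/benchmark_util.py

tf.app.flags.DEFINE_integer("task_index", None, "Task index, should be >= 0.")
tf.app.flags.DEFINE_string("job_name", None, "job name: worker or ps")
tf.app.flags.DEFINE_string("ps_hosts", None, "Comma-separated list of hostname:port pairs")
tf.app.flags.DEFINE_string("worker_hosts", None, "Comma-separated list of hostname:port pairs")
FLAGS = tf.app.flags.FLAGS


def main(_):
  cluster = tf.train.ClusterSpec({"ps": FLAGS.ps_hosts.split(","),
                                  "worker": FLAGS.worker_hosts.split(",")})
  server = tf.train.Server(cluster, job_name=FLAGS.job_name,
                           task_index=FLAGS.task_index)
  if FLAGS.job_name == "ps":
    server.join()  # parameter servers serve variables until killed
    return

  with tf.Session(server.target) as sess:
    start = time.time()
    # ... build and sess.run(...) the graph being benchmarked ...
    elapsed = time.time() - start
    # Report {benchmark name: value} via benchmark_util.store_data_in_json;
    # see util/benchmark_util.py for the exact signature.


if __name__ == "__main__":
  tf.app.run()
```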
31 | -------------------------------------------------------------------------------- /cluster/benchmarks/bower_components/d3/.bower.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "d3", 3 | "version": "3.5.5", 4 | "main": "d3.js", 5 | "scripts": [ 6 | "d3.js" 7 | ], 8 | "ignore": [ 9 | ".DS_Store", 10 | ".git", 11 | ".gitignore", 12 | ".npmignore", 13 | ".spmignore", 14 | ".travis.yml", 15 | "Makefile", 16 | "bin", 17 | "component.json", 18 | "composer.json", 19 | "index.js", 20 | "lib", 21 | "node_modules", 22 | "package.json", 23 | "src", 24 | "test" 25 | ], 26 | "homepage": "https://github.com/mbostock-bower/d3-bower", 27 | "_release": "3.5.5", 28 | "_resolution": { 29 | "type": "version", 30 | "tag": "v3.5.5", 31 | "commit": "264ea13e4ed8583b37a91f7640aa22fdee6b2f26" 32 | }, 33 | "_source": "https://github.com/mbostock-bower/d3-bower.git", 34 | "_target": "3.5.5", 35 | "_originalSource": "d3" 36 | } -------------------------------------------------------------------------------- /cluster/benchmarks/bower_components/d3/.gitattributes: -------------------------------------------------------------------------------- 1 | bower.json -diff merge=ours 2 | component.json -diff merge=ours 3 | d3.js -diff merge=ours 4 | d3.min.js -diff merge=ours 5 | package.js -diff merge=ours 6 | -------------------------------------------------------------------------------- /cluster/benchmarks/bower_components/d3/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | **Important:** these GitHub issues are for *bug reports and feature requests only*. Please use [StackOverflow](http://stackoverflow.com/questions/tagged/d3.js) or the [d3-js Google group](https://groups.google.com/d/forum/d3-js) for general help. 4 | 5 | If you’re looking for ways to contribute, please [peruse open issues](https://github.com/mbostock/d3/issues?milestone=&page=1&state=open). The icebox is a good place to find ideas that are not currently in development. If you already have an idea, please check past issues to see whether your idea or a similar one was previously discussed. 6 | 7 | Before submitting a pull request, consider implementing a live example first, say using [bl.ocks.org](http://bl.ocks.org). Real-world use cases go a long way to demonstrating the usefulness of a proposed feature. The more complex a feature’s implementation, the more usefulness it should provide. Share your demo using the #d3js tag on Twitter or by sending it to the [d3-js Google group](https://groups.google.com/d/forum/d3-js). 8 | 9 | If your proposed feature does not involve changing core functionality, consider submitting it instead as a [D3 plugin](https://github.com/d3/d3-plugins). New core features should be for general use, whereas plugins are suitable for more specialized use cases. When in doubt, it’s easier to start with a plugin before “graduating” to core. 10 | 11 | To contribute new documentation or add examples to the gallery, just [edit the Wiki](https://github.com/mbostock/d3/wiki)! 12 | 13 | ## How to Submit a Pull Request 14 | 15 | 1. Click the “Fork” button to create your personal fork of the D3 repository. 16 | 17 | 2. After cloning your fork of the D3 repository in the terminal, run `npm install` to install D3’s dependencies. 18 | 19 | 3. Create a new branch for your new feature. For example: `git checkout -b my-awesome-feature`. 
A dedicated branch for your pull request means you can develop multiple features at the same time, and ensures that your pull request is stable even if you later decide to develop an unrelated feature. 20 | 21 | 4. The `d3.js` and `d3.min.js` files are built from source files in the `src` directory. _Do not edit `d3.js` directly._ Instead, edit the source files, and then run `make` to build the generated files. 22 | 23 | 5. Use `make test` to run tests and verify your changes. If you are adding a new feature, you should add new tests! If you are changing existing functionality, make sure the existing tests run, or update them as appropriate. 24 | 25 | 6. Sign D3’s [Individual Contributor License Agreement](https://docs.google.com/forms/d/1CzjdBKtDuA8WeuFJinadx956xLQ4Xriv7-oDvXnZMaI/viewform). Unless you are submitting a trivial patch (such as fixing a typo), this form is needed to verify that you are able to contribute. 26 | 27 | 7. Submit your pull request, and good luck! 28 | -------------------------------------------------------------------------------- /cluster/benchmarks/bower_components/d3/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010-2015, Michael Bostock 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * The name Michael Bostock may not be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL MICHAEL BOSTOCK BE LIABLE FOR ANY DIRECT, 21 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 22 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 25 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 26 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /cluster/benchmarks/bower_components/d3/README.md: -------------------------------------------------------------------------------- 1 | # Data-Driven Documents 2 | 3 | 4 | 5 | **D3.js** is a JavaScript library for manipulating documents based on data. **D3** helps you bring data to life using HTML, SVG and CSS. D3’s emphasis on web standards gives you the full capabilities of modern browsers without tying yourself to a proprietary framework, combining powerful visualization components and a data-driven approach to DOM manipulation. 6 | 7 | Want to learn more? 
[See the wiki.](https://github.com/mbostock/d3/wiki) 8 | 9 | For examples, [see the gallery](https://github.com/mbostock/d3/wiki/Gallery) and [mbostock’s bl.ocks](http://bl.ocks.org/mbostock). 10 | -------------------------------------------------------------------------------- /cluster/benchmarks/bower_components/d3/bower.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "d3", 3 | "version": "3.5.5", 4 | "main": "d3.js", 5 | "scripts": [ 6 | "d3.js" 7 | ], 8 | "ignore": [ 9 | ".DS_Store", 10 | ".git", 11 | ".gitignore", 12 | ".npmignore", 13 | ".spmignore", 14 | ".travis.yml", 15 | "Makefile", 16 | "bin", 17 | "component.json", 18 | "composer.json", 19 | "index.js", 20 | "lib", 21 | "node_modules", 22 | "package.json", 23 | "src", 24 | "test" 25 | ] 26 | } 27 | -------------------------------------------------------------------------------- /cluster/benchmarks/bower_components/d3/package.js: -------------------------------------------------------------------------------- 1 | // Package metadata for Meteor.js. 2 | 3 | Package.describe({ 4 | name: "d3js:d3", // http://atmospherejs.com/d3js/d3 5 | summary: "D3 (official): A JavaScript visualization library for HTML and SVG.", 6 | version: "3.5.5", 7 | git: "https://github.com/mbostock/d3.git" 8 | }); 9 | 10 | Package.onUse(function(api) { 11 | api.versionsFrom(["METEOR@1.0"]); 12 | api.addFiles("d3.js", "client"); 13 | }); 14 | -------------------------------------------------------------------------------- /cluster/benchmarks/bower_components/plottable/.bower.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "plottable", 3 | "description": "A modular charting library built on D3", 4 | "version": "2.2.0", 5 | "main": [ 6 | "plottable.js", 7 | "plottable.css" 8 | ], 9 | "typescript": { 10 | "definition": "plottable.d.ts" 11 | }, 12 | "license": "MIT", 13 | "ignore": [ 14 | "**/*", 15 | "!bower.json", 16 | "!plottable.js", 17 | "!plottable.css", 18 | "!plottable.min.js", 19 | "!plottable.d.ts" 20 | ], 21 | "keywords": [ 22 | "plottable", 23 | "plottablejs", 24 | "plottable.js", 25 | "d3", 26 | "data viz", 27 | "chart", 28 | "charts", 29 | "reusable charts", 30 | "visualization", 31 | "scatterplot", 32 | "bar chart", 33 | "plot", 34 | "plots" 35 | ], 36 | "dependencies": { 37 | "d3": "3.5.5" 38 | }, 39 | "homepage": "http://plottablejs.org", 40 | "repository": { 41 | "type": "git", 42 | "url": "git://github.com/palantir/plottable.git" 43 | }, 44 | "devDependencies": { 45 | "chai": "2.0.0", 46 | "mocha": "2.2.5", 47 | "jQuery": "2.1.0", 48 | "jquery.simulate": "1.2.0", 49 | "requirejs": "2.1.18", 50 | "sinon": "1.16.1" 51 | }, 52 | "_release": "2.2.0", 53 | "_resolution": { 54 | "type": "version", 55 | "tag": "v2.2.0", 56 | "commit": "e36001d8b6640cd23599905255d61b4ab58a648d" 57 | }, 58 | "_source": "https://github.com/palantir/plottable.git", 59 | "_target": "^2.2.0", 60 | "_originalSource": "plottable", 61 | "_direct": true 62 | } -------------------------------------------------------------------------------- /cluster/benchmarks/bower_components/plottable/bower.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "plottable", 3 | "description": "A modular charting library built on D3", 4 | "version": "2.2.0", 5 | "main": [ 6 | "plottable.js", 7 | "plottable.css" 8 | ], 9 | "typescript": { 10 | "definition": "plottable.d.ts" 11 | }, 12 | "license": "MIT", 13 | "ignore": [ 14 | "**/*", 
15 | "!bower.json", 16 | "!plottable.js", 17 | "!plottable.css", 18 | "!plottable.min.js", 19 | "!plottable.d.ts" 20 | ], 21 | "keywords": [ 22 | "plottable", 23 | "plottablejs", 24 | "plottable.js", 25 | "d3", 26 | "data viz", 27 | "chart", 28 | "charts", 29 | "reusable charts", 30 | "visualization", 31 | "scatterplot", 32 | "bar chart", 33 | "plot", 34 | "plots" 35 | ], 36 | "dependencies": { 37 | "d3": "3.5.5" 38 | }, 39 | "homepage": "http://plottablejs.org", 40 | "repository": { 41 | "type": "git", 42 | "url": "git://github.com/palantir/plottable.git" 43 | }, 44 | "devDependencies": { 45 | "chai": "2.0.0", 46 | "mocha": "2.2.5", 47 | "jQuery": "2.1.0", 48 | "jquery.simulate": "1.2.0", 49 | "requirejs": "2.1.18", 50 | "sinon": "1.16.1" 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /cluster/benchmarks/dashboard_app/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaroslavvb/stuff/a8024ead315aa1b5d6976940b3a062178f0e499d/cluster/benchmarks/dashboard_app/.DS_Store -------------------------------------------------------------------------------- /cluster/benchmarks/dashboard_app/app.yaml: -------------------------------------------------------------------------------- 1 | runtime: python 2 | env: flex 3 | entrypoint: gunicorn -b :$PORT main:app 4 | service: benchmarks 5 | 6 | runtime_config: 7 | python_version: 3 8 | -------------------------------------------------------------------------------- /cluster/benchmarks/dashboard_app/main_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | import json 17 | import main 18 | import unittest 19 | import urllib 20 | 21 | class TestMain(unittest.TestCase): 22 | 23 | def testArgumentInvalidFormat(self): 24 | self.assertEqual('', main.argument_name('')) 25 | self.assertEqual('', main.argument_name('arg=val')) 26 | self.assertEqual('', main.argument_name('-arg=val')) 27 | self.assertEqual('', main.argument_name('--argval')) 28 | self.assertEqual('', main.argument_name('--=val')) 29 | self.assertEqual('', main.argument_name('--=')) 30 | 31 | def testArgumentValidFormat(self): 32 | self.assertEqual('abc', main.argument_name('--abc=123')) 33 | self.assertEqual('a', main.argument_name('--a=123')) 34 | 35 | def testIndexPage(self): 36 | main.app.testing = True 37 | client = main.app.test_client() 38 | 39 | r = client.get('/') 40 | self.assertEqual(200, r.status_code) 41 | self.assertIn('sample_logged_benchmark', r.data.decode('utf-8')) 42 | 43 | def testTestPage_InvalidTest(self): 44 | main.app.testing = True 45 | client = main.app.test_client() 46 | 47 | r = client.get('/test/abc') 48 | self.assertEqual(200, r.status_code) 49 | self.assertIn('No data for benchmark', str(r.data)) 50 | 51 | def testTestPage_SampleTest(self): 52 | main.app.testing = True 53 | client = main.app.test_client() 54 | sample_benchmark_name = '//tensorflow/examples/benchmark:sample_logged_benchmark' 55 | 56 | r = client.get( 57 | '/test/%252F%252Ftensorflow%252Fexamples%252Fbenchmark%253Asample_logged_benchmark') 58 | self.assertEqual(200, r.status_code) 59 | self.assertIn( 60 | 'Performance plots for %s' % sample_benchmark_name, str(r.data)) 61 | 62 | def testFetchBenchmarkData_InvalidTest(self): 63 | main.app.testing = True 64 | client = main.app.test_client() 65 | 66 | r = client.get('/benchmark_data/?test=abc&entry=cde') 67 | self.assertEqual(200, r.status_code) 68 | self.assertEqual(b'[]', r.data) 69 | 70 | def testFetchBenchmarkData_SampleTest(self): 71 | main.app.testing = True 72 | client = main.app.test_client() 73 | 74 | encoded_benchmark_name = ( 75 | '/test/%252F%252Ftensorflow%252Fexamples%252Fbenchmark%253Asample_logged_benchmark') 76 | r = client.get('/benchmark_data/?test=%s&entry=SampleBenchmark.sum_wall_time' % 77 | encoded_benchmark_name) 78 | self.assertEqual(200, r.status_code) 79 | self.assertEqual(b'[]', r.data) 80 | 81 | 82 | if __name__ == '__main__': 83 | unittest.main() 84 | -------------------------------------------------------------------------------- /cluster/benchmarks/dashboard_app/requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==0.12.2 2 | gunicorn==19.7.1 3 | google-cloud 4 | -------------------------------------------------------------------------------- /cluster/benchmarks/dashboard_app/static/css/style.css: -------------------------------------------------------------------------------- 1 | /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ============================================================================== 15 | */ 16 | 17 | body { 18 | font-family: roboto, sans-serif; 19 | } 20 | 21 | h2 { 22 | font-weight: 400; 23 | } 24 | 25 | em { 26 | color: #666666; 27 | font-size: 18px; 28 | font-style: normal; 29 | } 30 | 31 | .outer_div { 32 | max-width: 1000px; 33 | margin: 20px; 34 | } 35 | 36 | table, th, td { 37 | border-collapse: collapse; 38 | border: 1px solid #d9d9d9; 39 | } 40 | 41 | th, td { 42 | padding: 15px; 43 | } 44 | 45 | th { 46 | text-align: left; 47 | font-weight: normal; 48 | } 49 | 50 | ul { 51 | width: 100%; 52 | margin: 0; 53 | padding: 0; 54 | } 55 | 56 | li { 57 | font-size: 14px; 58 | background-color: white; 59 | list-style: none; 60 | border: 1px solid #d9d9d9; 61 | border-radius: 2px; 62 | margin: 10px 0 0 0; 63 | } 64 | 65 | li:hover { 66 | background-color: #eeeeee; 67 | } 68 | 69 | li a { 70 | display: inline-block; 71 | width: 100%; 72 | height: 100%; 73 | color: black; 74 | text-decoration: none; 75 | padding: 8px 8px; 76 | } 77 | 78 | svg { 79 | margin-top: 20px; 80 | } 81 | 82 | #filter_input { 83 | display: block; 84 | width: 100%; 85 | font-size: 14px; 86 | padding: 8px 8px; 87 | border: 1px solid #d9d9d9; 88 | border-radius: 2px; 89 | box-sizing: border-box; 90 | } 91 | 92 | #filter_label, #arguments_label { 93 | color: #666666; 94 | font-size: 16px; 95 | } 96 | 97 | #latest_value_label { 98 | margin-bottom: 20px; 99 | } 100 | 101 | plottable .title-label text{ 102 | font-size: 16px; 103 | font-family: roboto, sans-serif; 104 | } 105 | -------------------------------------------------------------------------------- /cluster/benchmarks/dashboard_app/static/js/benchmark_latency_chart.js: -------------------------------------------------------------------------------- 1 | // Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | /** 16 | * @fileoverview Provides a way to create a benchmark latency chart. 17 | */ 18 | 19 | /** 20 | * Constructor. 21 | * @param {string} svg_element_id svg element to add the chart to. 22 | * @param {string} test_id of the test to plot data for. 23 | * @param {string} entry_id of the specific test entry to plot. 24 | */ 25 | var BenchmarkLatencyChart = function(svg_element, test_id, entry_id) { 26 | this.svg_element = svg_element; 27 | this.test_id = test_id; 28 | this.entry_id = entry_id; 29 | }; 30 | 31 | /** 32 | * Adds data to the given plots. 
33 |  */
34 | BenchmarkLatencyChart.prototype.addData_ = function(plot) {
35 |   const encodedTestId = encodeURIComponent(this.test_id);
36 |   const encodedEntryId = encodeURIComponent(this.entry_id);
37 |   const jsonDataUrl =
38 |       '/benchmark_data/?test=' + encodedTestId + '&entry=' + encodedEntryId;
39 |   d3.json(jsonDataUrl, (data) => {  // arrow function keeps `this` bound to the chart
40 |     let benchmarks = [];
41 |     for (var i = 0; i < data.length; i++) {
42 |       const name = this.entry_id;
43 |       const timestamp = new Date(+data[i]['start'] / 1000);
44 |       const mean_latency = data[i]['timing'];
45 |       benchmarks.push(
46 |           {name: name, timestamp: timestamp,
47 |            mean_latency: +mean_latency});
48 |     }
49 |     plot.addDataset(
50 |         new Plottable.Dataset(benchmarks, {name: 'Forward'}));
51 |   });
52 | };
53 | 
54 | /**
55 |  * Create the chart.
56 |  */
57 | BenchmarkLatencyChart.prototype.makeChart = function() {
58 |   const xScale = new Plottable.Scales.Time();
59 |   const yScaleForward = new Plottable.Scales.Linear();
60 | 
61 |   const plot = new LatencyChart(
62 |       this.entry_id, 'value',
63 |       xScale, yScaleForward);
64 | 
65 |   this.addData_(plot);
66 | 
67 |   const table = new Plottable.Components.Table([[plot.table]]);
68 |   table.renderTo(this.svg_element);
69 | 
70 |   plot.addTooltip();
71 |   new Plottable.Interactions.Click()
72 |       .attachTo(plot.linePlot)
73 |       .onClick(function(p) {
74 |         plot.updateForPosition(p);
75 |       });
76 | };
77 | 
--------------------------------------------------------------------------------
/cluster/benchmarks/dashboard_app/templates/index.html:
--------------------------------------------------------------------------------
[HTML markup lost during extraction; the surviving visible text is:]
Filter
--------------------------------------------------------------------------------
/cluster/benchmarks/dashboard_app/templates/test.html:
--------------------------------------------------------------------------------
[HTML markup lost during extraction; the surviving template text and Jinja expressions are:]
Performance plots for {{ test_id }}
{% if arguments %} Arguments: {{ arguments }} {% endif %}
{% for entry in entries %} Latest value: {{ entry.latest_value }} at {{ latest_time }}. {% endfor %}
--------------------------------------------------------------------------------
/cluster/benchmarks/index.html:
--------------------------------------------------------------------------------
[HTML markup lost during extraction; the surviving visible text is:]
Soumith benchmarks
--------------------------------------------------------------------------------
/cluster/benchmarks/js/csv_benchmark_chart.js:
--------------------------------------------------------------------------------
1 | /**
2 |  * @fileoverview Provides a way to create a mean latency chart based on a
3 |  * csv file with latency data.
4 |  */
5 | 
6 | /**
7 |  * Constructor.
8 |  * @param {string} svg_element_id svg element to add the chart to.
9 |  * @param {string} latency_csv_file File to read input data from. The file
10 |  *     must have lines in the following format:
11 |  *     (Forward|Forward-Backward),timestamp,num_batches,mean,sd
12 |  */
13 | var CsvLatencyChart = function(svg_element_id, latency_csv_file) {
14 |   this.svg_element_id = svg_element_id;
15 |   this.latency_csv_file = latency_csv_file;
16 | };
17 | 
18 | /**
19 |  * Adds data to the given plots.
20 |  */
21 | CsvLatencyChart.prototype.addData_ = function(
22 |     plotForward, plotForwardBackward) {
23 |   d3.text(this.latency_csv_file, function(data) {
24 |     data = d3.csv.parseRows(data);
25 |     const parseDate = d3.time.format('%Y-%m-%d %H:%M:%S').parse;
26 |     let forwardBenchmarks = [];
27 |     let forwardBackwardBenchmarks = [];
28 |     for (var i = 0; i < data.length; i++) {
29 |       const name = data[i][0];
30 |       const timestamp = data[i][1];
31 |       const mean_latency = data[i][3];
32 |       // Timestamp has the format: 2016-08-31 23:38:55.159320
33 |       // However, we can't parse this date format using d3 time
34 |       // functions, so we remove everything after the dot before parsing.
35 |       const dateUpToSeconds = timestamp.split('.')[0]
36 |       if (name == 'Forward') {
37 |         forwardBenchmarks.push(
38 |             {name: name, timestamp: parseDate(dateUpToSeconds),
39 |              mean_latency: +mean_latency});
40 |       } else {
41 |         forwardBackwardBenchmarks.push(
42 |             {name: name, timestamp: parseDate(dateUpToSeconds),
43 |              mean_latency: +mean_latency});
44 |       }
45 |     }
46 |     plotForward.addDataset(
47 |         new Plottable.Dataset(forwardBenchmarks, {name: 'Forward'}));
48 |     plotForwardBackward.addDataset(
49 |         new Plottable.Dataset(
50 |             forwardBackwardBenchmarks, {name: 'Forward-Backward'}));
51 |   });
52 | };
53 | 
54 | /**
55 |  * Create the chart.
56 | */ 57 | CsvLatencyChart.prototype.makeChart = function() { 58 | const xScale = new Plottable.Scales.Time(); 59 | const yScaleForward = new Plottable.Scales.Linear(); 60 | const yScaleForwardBackward = new Plottable.Scales.Linear(); 61 | 62 | const plotForward = new LatencyChart( 63 | 'Forward pass per-batch latency', 'Mean latency (sec)', 64 | xScale, yScaleForward); 65 | const plotForwardBackward = new LatencyChart( 66 | 'Forward-backward pass per-batch latency', 'Mean latency (sec)', 67 | xScale, yScaleForwardBackward); 68 | 69 | this.addData_(plotForward, plotForwardBackward); 70 | 71 | const table = new Plottable.Components.Table([ 72 | [plotForward.table], 73 | [plotForwardBackward.table] 74 | ]); 75 | table.renderTo(this.svg_element_id); 76 | 77 | plotForward.addTooltip(); 78 | plotForwardBackward.addTooltip(); 79 | new Plottable.Interactions.Click() 80 | .attachTo(plotForward.linePlot) 81 | .onClick(function(p) { 82 | plotForward.updateForPosition(p); 83 | plotForwardBackward.updateForPosition(p); 84 | }); 85 | new Plottable.Interactions.Click() 86 | .attachTo(plotForwardBackward.linePlot) 87 | .onClick(function(p) { 88 | plotForward.updateForPosition(p); 89 | plotForwardBackward.updateForPosition(p); 90 | }); 91 | }; 92 | -------------------------------------------------------------------------------- /cluster/benchmarks/scripts/Dockerfile.tf_cnn_benchmarks: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:nightly-gpu 2 | 3 | RUN apt-get update && apt-get install -y python-pip && pip install google-cloud 4 | COPY tf_cnn_benchmarks/ ./tf_cnn_benchmarks/ 5 | RUN touch tf_cnn_benchmarks/__init__.py 6 | RUN mkdir ./util/ 7 | COPY util/ ./util/ 8 | ENTRYPOINT ["python", "-m", "tf_cnn_benchmarks.tf_cnn_benchmarks"] 9 | -------------------------------------------------------------------------------- /cluster/benchmarks/scripts/benchmark_configs.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | # Distributed benchmark configs to run with continuous build. 17 | # For each benchmark, the following properties are supported: 18 | # 19 | # benchmark_name: (required) unique name of the benchmark to run 20 | # args: (optional) argument values to pass to the benchmark. 21 | # env_vars: (optional) environment variables to set for benchmark jobs. 22 | # worker_count: (required) number of worker jobs to run 23 | # ps_count: (required) number of ps jobs to run. 24 | # gpus_per_machine: (optional) number of required gpus per worker 25 | # (currently only supporting <= 1). 26 | # docker_file: (required) docker file to build a docker image for. 27 | # Path to the docker file should be relative to Jenkins build folder. 
28 | #   'benchmarks' github repo will be cloned to 'benchmarks' folder.
29 | #
30 | # Example:
31 | # - benchmark_name: "benchmark_alexnet"
32 | #   args:
33 | #     data_format: "NHWC"
34 | #   worker_count: 1
35 | #   ps_count: 2
36 | #   docker_file: "benchmarks/models/Dockerfile.alexnet_distributed_test"
37 | 
38 | - benchmark_name: "tf_cnn_benchmark_resnet50"
39 |   args:
40 |     data_format: "NHWC"
41 |     model: "resnet50"
42 |     result_storage: "cbuild_benchmark_datastore"
43 |     num_gpus: 8
44 |     local_parameter_device: "cpu"
45 |   worker_count: 2
46 |   ps_count: 2
47 |   gpus_per_machine: 8
48 |   docker_file: "benchmarks/scripts/Dockerfile.tf_cnn_benchmarks"
49 | 
--------------------------------------------------------------------------------
/cluster/benchmarks/scripts/tf_cnn_benchmarks/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yaroslavvb/stuff/a8024ead315aa1b5d6976940b3a062178f0e499d/cluster/benchmarks/scripts/tf_cnn_benchmarks/.DS_Store
--------------------------------------------------------------------------------
/cluster/benchmarks/scripts/tf_cnn_benchmarks/README.md:
--------------------------------------------------------------------------------
1 | # tf_cnn_benchmarks: High performance benchmarks
2 | 
3 | tf_cnn_benchmarks contains implementations of several popular convolutional
4 | models, and is designed to be as fast as possible. tf_cnn_benchmarks supports
5 | running both on a single machine and in distributed mode across multiple
6 | hosts. See the [High-Performance models
7 | guide](https://www.tensorflow.org/performance/performance_models) for more
8 | information.
9 | 
10 | These models utilize many of the strategies in the [TensorFlow Performance
11 | Guide](https://www.tensorflow.org/performance/performance_guide). Benchmark
12 | results can be found [here](https://www.tensorflow.org/performance/benchmarks).
13 | 
14 | These models are designed for performance. For models that have clean and
15 | easy-to-read implementations, see the [TensorFlow Official
16 | Models](https://github.com/tensorflow/models/tree/master/official).
17 | 
18 | ## Getting Started
19 | 
20 | To run ResNet50 on a single GPU with synthetic, undistorted data, run
21 | 
22 | ```
23 | python tf_cnn_benchmarks.py --num_gpus=1 --batch_size=32 --model=resnet50 --variable_update=parameter_server
24 | ```
25 | 
26 | Some important flags are
27 | 
28 | * model: Model to use, e.g. resnet50, inception3, vgg16, and alexnet.
29 | * num_gpus: Number of GPUs to use.
30 | * data_dir: Path to data to process. If not set, synthetic data is used. To
31 |   use Imagenet data use these
32 |   [instructions](https://github.com/tensorflow/models/tree/master/research/inception#getting-started)
33 |   as a starting point.
34 | * batch_size: Batch size for each GPU.
35 | * variable_update: The method for managing variables: parameter_server,
36 |   replicated, distributed_replicated, or independent.
37 | * local_parameter_device: Device to use as parameter server: cpu or gpu.
38 | 
39 | See
40 | [benchmark_cnn.py](https://github.com/tensorflow/benchmarks/blob/master/scripts/tf_cnn_benchmarks/benchmark_cnn.py)
41 | for the full list of flags. The `_DEFAULT_PARAMS` dict in that file contains the
42 | flags.
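For distributed mode, one process is typically launched per cluster role. A sketch of the launch commands follows; the distributed flags are assumed to follow the `job_name`/`ps_hosts`/`worker_hosts`/`task_index` convention described in the benchmarks README earlier in this repo, and the host:port values are placeholders:

```
# Parameter server:
python tf_cnn_benchmarks.py --job_name=ps --task_index=0 \
    --ps_hosts=10.0.0.1:50000 --worker_hosts=10.0.0.2:50001,10.0.0.3:50001

# Worker 0 (repeat with --task_index=1 on the second worker host):
python tf_cnn_benchmarks.py --job_name=worker --task_index=0 \
    --ps_hosts=10.0.0.1:50000 --worker_hosts=10.0.0.2:50001,10.0.0.3:50001 \
    --num_gpus=8 --batch_size=32 --model=resnet50 --variable_update=distributed_replicated
```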
43 | 
-------------------------------------------------------------------------------- /cluster/benchmarks/scripts/tf_cnn_benchmarks/benchmark_storage.py: --------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Provides ways to store benchmark output."""
16 | 
17 | 
18 | def store_benchmark(data, storage_type=None):
19 |   """Store benchmark data.
20 | 
21 |   Args:
22 |     data: Dictionary mapping from string benchmark name to
23 |       numeric benchmark value.
24 |     storage_type: (string) Specifies where to store benchmark
25 |       result. If storage_type is
26 |       'cbuild_benchmark_datastore': store outputs in our continuous
27 |       build datastore. gcloud must be set up in the current environment
28 |       pointing to the project where data will be added.
29 |   """
30 |   if storage_type == 'cbuild_benchmark_datastore':
31 |     try:
32 |       # pylint: disable=g-import-not-at-top
33 |       import cbuild_benchmark_storage
34 |       # pylint: enable=g-import-not-at-top
35 |     except ImportError:
36 |       raise ImportError(
37 |           'Missing cbuild_benchmark_storage.py required for '
38 |           'cbuild_benchmark_datastore option')
39 |     cbuild_benchmark_storage.upload_to_benchmark_datastore(data)
40 |   else:
41 |     assert False, 'unknown storage_type: ' + storage_type
42 | 
-------------------------------------------------------------------------------- /cluster/benchmarks/scripts/tf_cnn_benchmarks/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaroslavvb/stuff/a8024ead315aa1b5d6976940b3a062178f0e499d/cluster/benchmarks/scripts/tf_cnn_benchmarks/models/__init__.py -------------------------------------------------------------------------------- /cluster/benchmarks/scripts/tf_cnn_benchmarks/models/alexnet_model.py: --------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | """Alexnet model configuration.
17 | 
18 | References:
19 |   Krizhevsky, Alex, Ilya Sutskever, and Geoffrey E.
Hinton 20 | ImageNet Classification with Deep Convolutional Neural Networks 21 | Advances in Neural Information Processing Systems. 2012 22 | """ 23 | 24 | import tensorflow as tf 25 | from models import model 26 | 27 | 28 | class AlexnetModel(model.Model): 29 | """Alexnet cnn model.""" 30 | 31 | def __init__(self): 32 | super(AlexnetModel, self).__init__('alexnet', 224 + 3, 512, 0.005) 33 | 34 | def add_inference(self, cnn): 35 | # Note: VALID requires padding the images by 3 in width and height 36 | cnn.conv(64, 11, 11, 4, 4, 'VALID') 37 | cnn.mpool(3, 3, 2, 2) 38 | cnn.conv(192, 5, 5) 39 | cnn.mpool(3, 3, 2, 2) 40 | cnn.conv(384, 3, 3) 41 | cnn.conv(384, 3, 3) 42 | cnn.conv(256, 3, 3) 43 | cnn.mpool(3, 3, 2, 2) 44 | cnn.reshape([-1, 256 * 6 * 6]) 45 | cnn.affine(4096) 46 | cnn.dropout() 47 | cnn.affine(4096) 48 | cnn.dropout() 49 | 50 | 51 | class AlexnetCifar10Model(model.Model): 52 | """Alexnet cnn model for cifar datasets. 53 | 54 | The model architecture follows the one defined in the tensorflow tutorial 55 | model. 56 | 57 | Reference model: tensorflow/models/tutorials/image/cifar10/cifar10.py 58 | Paper: http://www.cs.toronto.edu/~kriz/learning-features-2009-TR.pdf 59 | """ 60 | 61 | def __init__(self): 62 | super(AlexnetCifar10Model, self).__init__('alexnet', 32, 128, 0.1) 63 | 64 | def add_inference(self, cnn): 65 | cnn.conv(64, 5, 5, 1, 1, 'SAME', stddev=5e-2) 66 | cnn.mpool(3, 3, 2, 2, mode='SAME') 67 | cnn.lrn(depth_radius=4, bias=1.0, alpha=0.001 / 9.0, beta=0.75) 68 | cnn.conv(64, 5, 5, 1, 1, 'SAME', bias=0.1, stddev=5e-2) 69 | cnn.lrn(depth_radius=4, bias=1.0, alpha=0.001 / 9.0, beta=0.75) 70 | cnn.mpool(3, 3, 2, 2, mode='SAME') 71 | shape = cnn.top_layer.get_shape().as_list() 72 | flat_dim = shape[1] * shape[2] * shape[3] 73 | cnn.reshape([-1, flat_dim]) 74 | cnn.affine(384, stddev=0.04, bias=0.1) 75 | cnn.affine(192, stddev=0.04, bias=0.1) 76 | 77 | def get_learning_rate(self, global_step, batch_size): 78 | num_examples_per_epoch = 50000 79 | num_epochs_per_decay = 100 80 | decay_steps = int(num_epochs_per_decay * num_examples_per_epoch / 81 | batch_size) 82 | decay_factor = 0.1 83 | return tf.train.exponential_decay( 84 | self.learning_rate, global_step, decay_steps, decay_factor, 85 | staircase=True) 86 | -------------------------------------------------------------------------------- /cluster/benchmarks/scripts/tf_cnn_benchmarks/models/densenet_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Densenet model configuration. 
17 | 18 | References: 19 | "Densely Connected Convolutional Networks": https://arxiv.org/pdf/1608.06993 20 | """ 21 | import numpy as np 22 | from six.moves import xrange # pylint: disable=redefined-builtin 23 | import tensorflow as tf 24 | from models import model as model_lib 25 | 26 | 27 | class DensenetCifar10Model(model_lib.Model): 28 | """Densenet cnn network configuration.""" 29 | 30 | def __init__(self, model, layer_counts, growth_rate): 31 | self.growth_rate = growth_rate 32 | super(DensenetCifar10Model, self).__init__(model, 32, 64, 0.1, 33 | layer_counts=layer_counts) 34 | self.batch_norm_config = {'decay': 0.9, 'epsilon': 1e-5, 'scale': True} 35 | 36 | def dense_block(self, cnn, growth_rate): 37 | input_layer = cnn.top_layer 38 | c = cnn.batch_norm(input_layer, **self.batch_norm_config) 39 | c = tf.nn.relu(c) 40 | c = cnn.conv(growth_rate, 3, 3, 1, 1, stddev=np.sqrt(2.0/9/growth_rate), 41 | activation=None, input_layer=c) 42 | channel_index = 3 if cnn.channel_pos == 'channels_last' else 1 43 | cnn.top_layer = tf.concat([input_layer, c], channel_index) 44 | cnn.top_size += growth_rate 45 | 46 | def transition_layer(self, cnn): 47 | in_size = cnn.top_size 48 | cnn.batch_norm(**self.batch_norm_config) 49 | cnn.top_layer = tf.nn.relu(cnn.top_layer) 50 | cnn.conv(in_size, 1, 1, 1, 1, stddev=np.sqrt(2.0/9/in_size)) 51 | cnn.apool(2, 2, 2, 2) 52 | 53 | def add_inference(self, cnn): 54 | if self.layer_counts is None: 55 | raise ValueError('Layer counts not specified for %s' % self.get_model()) 56 | if self.growth_rate is None: 57 | raise ValueError('Growth rate not specified for %s' % self.get_model()) 58 | 59 | cnn.conv(16, 3, 3, 1, 1, activation=None) 60 | # Block 1 61 | for _ in xrange(self.layer_counts[0]): 62 | self.dense_block(cnn, self.growth_rate) 63 | self.transition_layer(cnn) 64 | # Block 2 65 | for _ in xrange(self.layer_counts[1]): 66 | self.dense_block(cnn, self.growth_rate) 67 | self.transition_layer(cnn) 68 | # Block 3 69 | for _ in xrange(self.layer_counts[2]): 70 | self.dense_block(cnn, self.growth_rate) 71 | cnn.batch_norm(**self.batch_norm_config) 72 | cnn.top_layer = tf.nn.relu(cnn.top_layer) 73 | channel_index = 3 if cnn.channel_pos == 'channels_last' else 1 74 | cnn.top_size = cnn.top_layer.get_shape().as_list()[channel_index] 75 | cnn.spatial_mean() 76 | 77 | def get_learning_rate(self, global_step, batch_size): 78 | num_batches_per_epoch = int(50000 / batch_size) 79 | boundaries = num_batches_per_epoch * np.array([150, 225, 300], 80 | dtype=np.int64) 81 | boundaries = [x for x in boundaries] 82 | values = [0.1, 0.01, 0.001, 0.0001] 83 | return tf.train.piecewise_constant(global_step, boundaries, values) 84 | -------------------------------------------------------------------------------- /cluster/benchmarks/scripts/tf_cnn_benchmarks/models/googlenet_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Googlenet model configuration. 17 | 18 | References: 19 | Szegedy, Christian, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, 20 | Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, and Andrew Rabinovich 21 | Going deeper with convolutions 22 | arXiv preprint arXiv:1409.4842 (2014) 23 | """ 24 | 25 | from models import model 26 | 27 | 28 | class GooglenetModel(model.Model): 29 | 30 | def __init__(self): 31 | super(GooglenetModel, self).__init__('googlenet', 224, 32, 0.005) 32 | 33 | def add_inference(self, cnn): 34 | def inception_v1(cnn, k, l, m, n, p, q): 35 | cols = [[('conv', k, 1, 1)], [('conv', l, 1, 1), ('conv', m, 3, 3)], 36 | [('conv', n, 1, 1), ('conv', p, 5, 5)], 37 | [('mpool', 3, 3, 1, 1, 'SAME'), ('conv', q, 1, 1)]] 38 | cnn.inception_module('incept_v1', cols) 39 | 40 | cnn.conv(64, 7, 7, 2, 2) 41 | cnn.mpool(3, 3, 2, 2, mode='SAME') 42 | cnn.conv(64, 1, 1) 43 | cnn.conv(192, 3, 3) 44 | cnn.mpool(3, 3, 2, 2, mode='SAME') 45 | inception_v1(cnn, 64, 96, 128, 16, 32, 32) 46 | inception_v1(cnn, 128, 128, 192, 32, 96, 64) 47 | cnn.mpool(3, 3, 2, 2, mode='SAME') 48 | inception_v1(cnn, 192, 96, 208, 16, 48, 64) 49 | inception_v1(cnn, 160, 112, 224, 24, 64, 64) 50 | inception_v1(cnn, 128, 128, 256, 24, 64, 64) 51 | inception_v1(cnn, 112, 144, 288, 32, 64, 64) 52 | inception_v1(cnn, 256, 160, 320, 32, 128, 128) 53 | cnn.mpool(3, 3, 2, 2, mode='SAME') 54 | inception_v1(cnn, 256, 160, 320, 32, 128, 128) 55 | inception_v1(cnn, 384, 192, 384, 48, 128, 128) 56 | cnn.apool(7, 7, 1, 1, mode='VALID') 57 | cnn.reshape([-1, 1024]) 58 | -------------------------------------------------------------------------------- /cluster/benchmarks/scripts/tf_cnn_benchmarks/models/lenet_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Lenet model configuration. 
17 | 18 | References: 19 | LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick Haffner 20 | Gradient-based learning applied to document recognition 21 | Proceedings of the IEEE (1998) 22 | """ 23 | 24 | from models import model 25 | 26 | 27 | class Lenet5Model(model.Model): 28 | 29 | def __init__(self): 30 | super(Lenet5Model, self).__init__('lenet5', 28, 32, 0.005) 31 | 32 | def add_inference(self, cnn): 33 | # Note: This matches TF's MNIST tutorial model 34 | cnn.conv(32, 5, 5) 35 | cnn.mpool(2, 2) 36 | cnn.conv(64, 5, 5) 37 | cnn.mpool(2, 2) 38 | cnn.reshape([-1, 64 * 7 * 7]) 39 | cnn.affine(512) 40 | -------------------------------------------------------------------------------- /cluster/benchmarks/scripts/tf_cnn_benchmarks/models/model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Base model configuration for CNN benchmarks.""" 16 | 17 | 18 | class Model(object): 19 | """Base model configuration for CNN benchmarks.""" 20 | 21 | def __init__(self, 22 | model, 23 | image_size, 24 | batch_size, 25 | learning_rate, 26 | layer_counts=None, 27 | fp16_loss_scale=128): 28 | self.model = model 29 | self.image_size = image_size 30 | self.batch_size = batch_size 31 | self.default_batch_size = batch_size 32 | self.learning_rate = learning_rate 33 | self.layer_counts = layer_counts 34 | # TODO(reedwm) Set custom loss scales for each model instead of using the 35 | # default of 128. 36 | self.fp16_loss_scale = fp16_loss_scale 37 | 38 | def get_model(self): 39 | return self.model 40 | 41 | def get_image_size(self): 42 | return self.image_size 43 | 44 | def get_batch_size(self): 45 | return self.batch_size 46 | 47 | def set_batch_size(self, batch_size): 48 | self.batch_size = batch_size 49 | 50 | def get_default_batch_size(self): 51 | return self.default_batch_size 52 | 53 | def get_layer_counts(self): 54 | return self.layer_counts 55 | 56 | def get_fp16_loss_scale(self): 57 | return self.fp16_loss_scale 58 | 59 | def get_learning_rate(self, global_step, batch_size): 60 | del global_step 61 | del batch_size 62 | return self.learning_rate 63 | 64 | def add_inference(self, unused_cnn): 65 | raise ValueError('Must be implemented in derived classes') 66 | -------------------------------------------------------------------------------- /cluster/benchmarks/scripts/tf_cnn_benchmarks/models/overfeat_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Overfeat model configuration. 17 | 18 | References: 19 | OverFeat: Integrated Recognition, Localization and Detection using 20 | Convolutional Networks 21 | Pierre Sermanet, David Eigen, Xiang Zhang, Michael Mathieu, Rob Fergus, 22 | Yann LeCun, 2014 23 | http://arxiv.org/abs/1312.6229 24 | """ 25 | 26 | from models import model 27 | 28 | 29 | class OverfeatModel(model.Model): 30 | 31 | def __init__(self): 32 | super(OverfeatModel, self).__init__('overfeat', 231, 32, 0.005) 33 | 34 | def add_inference(self, cnn): 35 | # Note: VALID requires padding the images by 3 in width and height 36 | cnn.conv(96, 11, 11, 4, 4, mode='VALID') 37 | cnn.mpool(2, 2) 38 | cnn.conv(256, 5, 5, 1, 1, mode='VALID') 39 | cnn.mpool(2, 2) 40 | cnn.conv(512, 3, 3) 41 | cnn.conv(1024, 3, 3) 42 | cnn.conv(1024, 3, 3) 43 | cnn.mpool(2, 2) 44 | cnn.reshape([-1, 1024 * 6 * 6]) 45 | cnn.affine(3072) 46 | cnn.affine(4096) 47 | -------------------------------------------------------------------------------- /cluster/benchmarks/scripts/tf_cnn_benchmarks/models/trivial_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Trivial model configuration.""" 16 | 17 | from models import model 18 | 19 | 20 | class TrivialModel(model.Model): 21 | """Trivial model configuration.""" 22 | 23 | def __init__(self): 24 | super(TrivialModel, self).__init__('trivial', 224 + 3, 32, 0.005) 25 | 26 | def add_inference(self, cnn): 27 | cnn.reshape([-1, 227 * 227 * 3]) 28 | cnn.affine(1) 29 | cnn.affine(4096) 30 | -------------------------------------------------------------------------------- /cluster/benchmarks/scripts/tf_cnn_benchmarks/models/vgg_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Vgg model configuration. 17 | 18 | Includes multiple models: vgg11, vgg16, vgg19, corresponding to 19 | model A, D, and E in Table 1 of [1]. 20 | 21 | References: 22 | [1] Simonyan, Karen, Andrew Zisserman 23 | Very Deep Convolutional Networks for Large-Scale Image Recognition 24 | arXiv:1409.1556 (2014) 25 | """ 26 | 27 | from six.moves import xrange # pylint: disable=redefined-builtin 28 | from models import model 29 | 30 | 31 | def _construct_vgg(cnn, num_conv_layers): 32 | """Build vgg architecture from blocks.""" 33 | assert len(num_conv_layers) == 5 34 | for _ in xrange(num_conv_layers[0]): 35 | cnn.conv(64, 3, 3) 36 | cnn.mpool(2, 2) 37 | for _ in xrange(num_conv_layers[1]): 38 | cnn.conv(128, 3, 3) 39 | cnn.mpool(2, 2) 40 | for _ in xrange(num_conv_layers[2]): 41 | cnn.conv(256, 3, 3) 42 | cnn.mpool(2, 2) 43 | for _ in xrange(num_conv_layers[3]): 44 | cnn.conv(512, 3, 3) 45 | cnn.mpool(2, 2) 46 | for _ in xrange(num_conv_layers[4]): 47 | cnn.conv(512, 3, 3) 48 | cnn.mpool(2, 2) 49 | cnn.reshape([-1, 512 * 7 * 7]) 50 | cnn.affine(4096) 51 | cnn.dropout() 52 | cnn.affine(4096) 53 | cnn.dropout() 54 | 55 | 56 | class Vgg11Model(model.Model): 57 | 58 | def __init__(self): 59 | super(Vgg11Model, self).__init__('vgg11', 224, 64, 0.005) 60 | 61 | def add_inference(self, cnn): 62 | _construct_vgg(cnn, [1, 1, 2, 2, 2]) 63 | 64 | 65 | class Vgg16Model(model.Model): 66 | 67 | def __init__(self): 68 | super(Vgg16Model, self).__init__('vgg16', 224, 64, 0.005) 69 | 70 | def add_inference(self, cnn): 71 | _construct_vgg(cnn, [2, 2, 3, 3, 3]) 72 | 73 | 74 | class Vgg19Model(model.Model): 75 | 76 | def __init__(self): 77 | super(Vgg19Model, self).__init__('vgg19', 224, 64, 0.005) 78 | 79 | def add_inference(self, cnn): 80 | _construct_vgg(cnn, [2, 2, 4, 4, 4]) 81 | -------------------------------------------------------------------------------- /cluster/benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Benchmark script for TensorFlow. 17 | 18 | See the README for more information. 
19 | """ 20 | 21 | from __future__ import print_function 22 | 23 | 24 | import tensorflow as tf 25 | 26 | import benchmark_cnn 27 | import cnn_util 28 | from cnn_util import log_fn 29 | 30 | benchmark_cnn.define_flags() 31 | 32 | 33 | def main(extra_flags): 34 | # extra_flags is a list of command line arguments, excluding those defined 35 | # in tf.flags.FLAGS. extra_flags[0] is always the program name. It is an error 36 | # to supply flags not defined with tf.flags.FLAGS, so we raise an ValueError 37 | # in that case. 38 | assert len(extra_flags) >= 1 39 | if len(extra_flags) > 1: 40 | raise ValueError('Received unknown flags: %s' % extra_flags[1:]) 41 | 42 | params = benchmark_cnn.make_params_from_flags() 43 | benchmark_cnn.setup(params) 44 | bench = benchmark_cnn.BenchmarkCNN(params) 45 | 46 | tfversion = cnn_util.tensorflow_version_tuple() 47 | log_fn('TensorFlow: %i.%i' % (tfversion[0], tfversion[1])) 48 | 49 | bench.print_info() 50 | bench.run() 51 | 52 | 53 | if __name__ == '__main__': 54 | tf.app.run() 55 | -------------------------------------------------------------------------------- /cluster/benchmarks/scripts/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaroslavvb/stuff/a8024ead315aa1b5d6976940b3a062178f0e499d/cluster/benchmarks/scripts/util/__init__.py -------------------------------------------------------------------------------- /cluster/benchmarks/scripts/util/benchmark_util_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ==============================================================================
15 | """Tests for benchmark_util."""
16 | 
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 | 
21 | import datetime
22 | import json
23 | import os
24 | import tempfile
25 | import unittest
26 | 
27 | import benchmark_util
28 | 
29 | 
30 | class BenchmarkUtilTest(unittest.TestCase):
31 | 
32 |   def testStoreDataWithNoEntries(self):
33 |     with tempfile.NamedTemporaryFile() as temp_file:
34 |       timing_entries = []
35 |       benchmark_util.store_data_in_json(
36 |           timing_entries, datetime.date(2017, 1, 1), temp_file.name)
37 |       with open(temp_file.name, 'r') as json_file:
38 |         json_output = json.loads(json_file.read())
39 |         self.assertEqual('TestBenchmark', json_output['name'])
40 |         self.assertEqual(u'1483228800', json_output['startTime'])
41 | 
42 |   def testStoreDataWithEntries(self):
43 |     with tempfile.NamedTemporaryFile() as temp_file:
44 |       timing_entries = [benchmark_util.StatEntry('test', 0.1, 1)]
45 |       benchmark_util.store_data_in_json(
46 |           timing_entries, datetime.date(2017, 1, 1), temp_file.name)
47 | 
48 |       with open(temp_file.name, 'r') as json_file:
49 |         json_output = json.loads(json_file.read())
50 |         self.assertEqual(1, len(json_output['entries']['entry']))
51 |         self.assertEqual('test', json_output['entries']['entry'][0]['name'])
52 |         self.assertEqual(0.1, json_output['entries']['entry'][0]['wallTime'])
53 |         self.assertEqual(u'1', json_output['entries']['entry'][0]['iters'])
54 |         self.assertEqual(u'1483228800', json_output['startTime'])
55 |         self.assertEqual('TestBenchmark', json_output['name'])
56 | 
57 | 
58 | if __name__ == '__main__':
59 |   unittest.main()
60 | 
-------------------------------------------------------------------------------- /cluster/benchmarks/scripts/util/convert_csv_to_json.py: --------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Convert CSV benchmark data to JSON format.
16 | 
17 | CSV benchmark data has the format:
18 | Description,timestamp,num_batches,time mean value,time sd
19 | 
20 | JSON benchmark data is in the format of the TestResults proto
21 | converted to JSON.
22 | https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/util/test_log.proto.
23 | """
24 | import argparse
25 | import csv
26 | from datetime import datetime
27 | 
28 | import benchmark_util
29 | 
30 | 
31 | def get_data_from_csv(csv_reader):
32 |   """Creates a list of StatEntry objects from rows of CSV data.
33 | 
34 |   Input CSV data must be in the format:
35 |   Description,timestamp,num_batches,time mean value,time sd
36 | 
37 |   Args:
38 |     csv_reader: csv.reader instance.
39 | 
40 |   Returns:
41 |     A tuple of datetime timestamp and list of benchmark_util.StatEntry objects.
42 | 43 | Raises: 44 | ValueError: if CSV is invalid. 45 | """ 46 | timestamp = None 47 | stat_entries = [] 48 | 49 | for row in csv_reader: 50 | if len(row) != 5: 51 | raise ValueError('Expected 5 entries per line in the input CSV file, ' 52 | 'but found %d entries.' % len(row)) 53 | if '' in row: 54 | raise ValueError('Found empty entries in row: %s' % row) 55 | 56 | # Set timestamp based on the first line in CSV file. 57 | if timestamp is None: 58 | # Example of time formatting: 2017-06-26 02:59:29.325579 59 | timestamp = datetime.strptime(row[1], "%Y-%m-%d %H:%M:%S.%f") 60 | stat_entries.append( 61 | benchmark_util.StatEntry(row[0], float(row[3]), 1)) 62 | return timestamp, stat_entries 63 | 64 | 65 | def main(): 66 | with open(FLAGS.input_csv_file, 'r') as csvfile: 67 | csv_reader = csv.reader(csvfile) 68 | timestamp, stat_entries = get_data_from_csv(csv_reader) 69 | benchmark_util.store_data_in_json( 70 | stat_entries, timestamp, 71 | output_file=FLAGS.output_json_file, 72 | test_name=FLAGS.test_name) 73 | 74 | 75 | if __name__ == '__main__': 76 | parser = argparse.ArgumentParser() 77 | parser.register( 78 | 'type', 'bool', lambda v: v.lower() in ('true', 't', 'y', 'yes')) 79 | parser.add_argument( 80 | '--test_name', type=str, default=None, required=True, 81 | help='Name of the test.') 82 | parser.add_argument( 83 | '--input_csv_file', type=str, default=None, required=True, 84 | help='Path to the CSV file.') 85 | parser.add_argument( 86 | '--output_json_file', type=str, default=None, required=True, 87 | help='Path to output JSON file.') 88 | FLAGS, _ = parser.parse_known_args() 89 | main() 90 | 91 | -------------------------------------------------------------------------------- /cluster/benchmarks/scripts/util/convert_csv_to_json_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ==============================================================================
15 | """Tests for convert_csv_to_json."""
16 | import csv
17 | import datetime
18 | import unittest
19 | 
20 | import convert_csv_to_json
21 | 
22 | 
23 | class ConvertCsvToJsonTest(unittest.TestCase):
24 | 
25 |   def testSingleEntryCSV(self):
26 |     # Description,timestamp,num_batches,time mean value,time sd
27 |     csv_reader = csv.reader(
28 |         ['abc,2017-06-26 02:59:29.325579,10,2.15,0.1'])
29 |     timestamp, stat_entries = convert_csv_to_json.get_data_from_csv(csv_reader)
30 |     self.assertEqual(
31 |         datetime.datetime(2017, 06, 26, 2, 59, 29, 325579),
32 |         timestamp)
33 |     self.assertEqual(1, len(stat_entries))
34 |     self.assertEqual('abc', stat_entries[0].name)
35 |     self.assertEqual(2.15, stat_entries[0].stat_value)
36 | 
37 |   def testTwoEntryCSV(self):
38 |     # Description,timestamp,num_batches,time mean value,time sd
39 |     csv_reader = csv.reader(
40 |         ['abc,2017-06-26 02:59:35.425579,10,2.15,0.1',
41 |          'def,2017-06-26 02:59:29.325579,10,10.1,0.1'])
42 |     timestamp, stat_entries = convert_csv_to_json.get_data_from_csv(csv_reader)
43 |     self.assertEqual(
44 |         datetime.datetime(2017, 06, 26, 2, 59, 35, 425579),
45 |         timestamp)
46 |     self.assertEqual(2, len(stat_entries))
47 |     self.assertEqual('abc', stat_entries[0].name)
48 |     self.assertEqual(2.15, stat_entries[0].stat_value)
49 |     self.assertEqual('def', stat_entries[1].name)
50 |     self.assertEqual(10.1, stat_entries[1].stat_value)
51 | 
52 |   def testInvalidCSV_LessEntries(self):
53 |     csv_reader = csv.reader(
54 |         ['abc,2017-06-26 02:59:29.325579,10,2.15'])
55 |     with self.assertRaises(ValueError):
56 |       timestamp, stat_entries = convert_csv_to_json.get_data_from_csv(
57 |           csv_reader)
58 | 
59 |   def testInvalidCSV_MoreEntries(self):
60 |     csv_reader = csv.reader(
61 |         ['abc,2017-06-26 02:59:29.325579,10,2.15,0.1,extra_entry'])
62 |     with self.assertRaises(ValueError):
63 |       timestamp, stat_entries = convert_csv_to_json.get_data_from_csv(
64 |           csv_reader)
65 | 
66 |   def testInvalidCSV_EmptyEntry(self):
67 |     csv_reader = csv.reader(
68 |         [',2017-06-26 02:59:29.325579,10,2.15,0.1'])
69 |     with self.assertRaises(ValueError):
70 |       timestamp, stat_entries = convert_csv_to_json.get_data_from_csv(
71 |           csv_reader)
72 | 
73 |   def testInvalidCSV_InvalidDate(self):
74 |     csv_reader = csv.reader(['abc,invaliddate,10,2.15,0.1'])
75 |     with self.assertRaises(ValueError):
76 |       timestamp, stat_entries = convert_csv_to_json.get_data_from_csv(
77 |           csv_reader)
78 | 
79 |   def testInvalidCSV_InvalidValue(self):
80 |     csv_reader = csv.reader(
81 |         ['abc,2017-06-26 02:59:29.325579,10,invalidfloat,0.1'])
82 |     with self.assertRaises(ValueError):
83 |       timestamp, stat_entries = convert_csv_to_json.get_data_from_csv(
84 |           csv_reader)
85 | 
86 | 
87 | if __name__ == '__main__':
88 |   unittest.main()
89 | 
-------------------------------------------------------------------------------- /cluster/benchmarks/soumith_benchmarks.html: --------------------------------------------------------------------------------
[soumith_benchmarks.html: HTML/JS page whose markup was stripped during extraction; the body held four chart containers titled Alexnet, Googlenet, Overfeat, and VGG, plus the scripts that render them]
-------------------------------------------------------------------------------- /cluster/benchmarks/tools/kubectl_util_test.py: --------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Tests for kubectl_util."""
16 | 
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 | 
21 | import mock
22 | import subprocess
23 | import unittest
24 | 
25 | import kubectl_util
26 | 
27 | 
28 | kubectl_util.WAIT_PERIOD_SECONDS = 1
29 | 
30 | 
31 | class KubectlUtilTest(unittest.TestCase):
32 | 
33 |   @mock.patch.object(subprocess, 'check_output')
34 |   @mock.patch.object(subprocess, 'check_call')
35 |   def testCreatePods(self, mock_check_call, mock_check_output):
36 |     mock_check_output.return_value = 'nonempty'
37 |     kubectl_util.CreatePods('test_pod', 'test.yaml')
38 |     mock_check_call.assert_called_once_with(
39 |         ['kubectl', 'create', '--filename=test.yaml'])
40 |     mock_check_output.assert_called_once_with(
41 |         ['kubectl', 'get', 'pods', '-o', 'name', '-a', '-l',
42 |          'name-prefix in (test_pod)'], universal_newlines=True)
43 | 
44 |   @mock.patch.object(subprocess, 'check_output')
45 |   @mock.patch.object(subprocess, 'call')
46 |   def testDeletePods(self, mock_check_call, mock_check_output):
47 |     mock_check_output.return_value = ''
48 |     kubectl_util.DeletePods('test_pod', 'test.yaml')
49 |     mock_check_call.assert_called_once_with(
50 |         ['kubectl', 'delete', '--filename=test.yaml'])
51 |     mock_check_output.assert_called_once_with(
52 |         ['kubectl', 'get', 'pods', '-o', 'name', '-a', '-l',
53 |          'name-prefix in (test_pod)'], universal_newlines=True)
54 | 
55 |   @mock.patch.object(subprocess, 'check_output')
56 |   def testWaitForCompletion(self, mock_check_output):
57 |     # Test success
58 |     mock_check_output.return_value = '\'0,0,\''
59 |     self.assertTrue(kubectl_util.WaitForCompletion('test_pod'))
60 | 
61 |     # Test failure
62 |     mock_check_output.return_value = '\'0,1,\''
63 |     self.assertFalse(kubectl_util.WaitForCompletion('test_pod'))
64 | 
65 |     # Test timeout
66 |     with self.assertRaises(kubectl_util.TimeoutError):
67 |       mock_check_output.return_value = '\'0,,\''
68 |       kubectl_util.WaitForCompletion('test_pod', timeout=5)
69 | 
70 | 
71 | if __name__ == '__main__':
72 |   unittest.main()
73 | 
-------------------------------------------------------------------------------- /cluster/cloud-formation-example/README.md: --------------------------------------------------------------------------------
1 | # TensorFlow
2 | 
3 | 
4 | Create Stack:
5 | ```
6 | aws --region ap-southeast-2 cloudformation create-stack --stack-name tensorflow --template-body file://tensorflow.yaml --parameters ParameterKey=KeyName,ParameterValue=[KeyName]
7 | ```
8 | 
9 | Update Stack:
10 | ```
11 | aws --region ap-southeast-2 cloudformation update-stack --stack-name tensorflow --template-body file://tensorflow.yaml --parameters ParameterKey=KeyName,ParameterValue=[KeyName] 12 | ``` 13 | 14 | Delete Stack: 15 | ``` 16 | aws --region ap-southeast-2 cloudformation delete-stack --stack-name tensorflow 17 | ``` 18 | 19 | Describe Stack: 20 | ``` 21 | aws --region ap-southeast-2 cloudformation describe-stacks --stack-name tensorflow 22 | ``` 23 | 24 | # Create DNS zone distributed.tensorflow. 25 | bash -x zone.sh create distributed.tensorflow. ap-southeast-2 vpc-9e314bfa 26 | # Launch cluster with CloudFormation 27 | aws --region ap-southeast-2 cloudformation create-stack --stack-name tensorflow --template-body file://tensorflow.yaml --parameters ParameterKey=KeyName,ParameterValue=ytang ParameterKey=SubnetId,ParameterValue=subnet-8eaba9ea ParameterKey=VPC,ParameterValue=vpc-9e314bfa 28 | # Destroy cluster with CloudFormation 29 | aws --region ap-southeast-2 cloudformation delete-stack --stack-name tensorflow 30 | # Delete DNS zone distributed.tensorflow. 31 | bash -x zone.sh delete distributed.tensorflow. ap-southeast-2 vpc-9e314bfa 32 | -------------------------------------------------------------------------------- /cluster/cloud-formation-example/iam.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: TensorFlow IamInstanceProfile CloudFormation 3 | Parameters: 4 | InstanceProfileName: 5 | Type: String 6 | Default: TensorFlowCloudFormation 7 | Resources: 8 | Role: 9 | Type: AWS::IAM::Role 10 | Properties: 11 | AssumeRolePolicyDocument: 12 | Version: 2012-10-17 13 | Statement: 14 | Effect: Allow 15 | Principal: 16 | Service: 17 | - ec2.amazonaws.com 18 | Action: 19 | - sts:AssumeRole 20 | Policies: 21 | Type: AWS::IAM::Policy 22 | Properties: 23 | PolicyDocument: 24 | Version: 2012-10-17 25 | Statement: 26 | - 27 | Effect: "Allow" 28 | Action: 29 | - "ec2:AssociateAddress" 30 | - "ec2:DisassociateAddress" 31 | Resource: "*" 32 | PolicyName: !Join [ "-", [ !Ref "AWS::StackName", "Policies" ] ] 33 | Roles: 34 | - !Ref Role 35 | InstanceProfile: 36 | Type: AWS::IAM::InstanceProfile 37 | Properties: 38 | Roles: 39 | - !Ref Role 40 | InstanceProfileName: !Ref InstanceProfileName 41 | -------------------------------------------------------------------------------- /cluster/cloud-formation-example/zone.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | set -e 3 | 4 | option=$1 5 | Name=$2 6 | Region=$3 7 | VPC=$4 8 | 9 | if [[ "$option" == "create" ]]; then 10 | aws --region $Region route53 create-hosted-zone --name $Name --vpc VPCRegion=$Region,VPCId=$VPC --caller-reference $Name.$(date "+%F-%T") 11 | exit 0 12 | elif [[ "$option" == "delete" ]]; then 13 | HostedZoneId=$(aws --region $Region route53 list-hosted-zones --query "HostedZones[?Name == '$Name'].Id" --output text | sed 's/\/hostedzone\///g') 14 | if [[ ! 
-z $HostedZoneId ]]; then
15 |     aws --region $Region route53 list-resource-record-sets \
16 |       --hosted-zone-id $HostedZoneId |
17 |     jq -c '.ResourceRecordSets[]' |
18 |     while read -r resourcerecordset; do
19 |       read -r name type <<<$(echo $(jq -r '.Name,.Type' <<<"$resourcerecordset"))
20 |       if [ $type != "NS" -a $type != "SOA" ]; then
21 |         aws --region $Region route53 change-resource-record-sets \
22 |           --hosted-zone-id $HostedZoneId \
23 |           --change-batch '{"Changes":[{"Action":"DELETE","ResourceRecordSet": '"$resourcerecordset"' }]}' \
24 |           --output text --query 'ChangeInfo.Id'
25 |       fi
26 |     done
27 |     aws --region $Region route53 delete-hosted-zone --id $HostedZoneId
28 |   fi
29 |   exit 0
30 | else
31 |   exit 1
32 | fi
33 | 
-------------------------------------------------------------------------------- /cluster/connect: -------------------------------------------------------------------------------- 1 | connect.py -------------------------------------------------------------------------------- /cluster/connect.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | 
4 | Script to connect to the most recent instance containing given fragment:
5 | Usage:
6 |   connect
7 |     -- connects to most recently launched instance
8 |   connect i3
9 |     -- connects to most recently launched instance containing i3 in instance id
10 | 
11 | 
12 | Debugging/exploring:
13 | 
14 | python
15 | from pprint import pprint
16 | import boto3
17 | ec2 = boto3.client('ec2')
18 | response = ec2.describe_instances()
19 | reservation=response['Reservations'][0]
20 | instance = reservation['Instances'][0]
21 | pprint(instance)
22 | """
23 | 
24 | # todo: allow to do ls, show tags
25 | # todo: handle KeyError: 'PublicIpAddress'
26 | 
27 | import boto3
28 | import time
29 | import sys
30 | import os
31 | from datetime import datetime
32 | from operator import itemgetter
33 | 
34 | 
35 | def toseconds(dt):
36 |   # to invert:
37 |   # import pytz
38 |   # utc = pytz.UTC
39 |   # utc.localize(datetime.fromtimestamp(seconds))
40 |   return time.mktime(dt.utctimetuple())
41 | 
42 | def main():
43 |   fragment = ''
44 |   if len(sys.argv)>1:
45 |     fragment = sys.argv[1]
46 | 
47 |   ec2 = boto3.client('ec2')
48 |   response = ec2.describe_instances()
49 | 
50 |   instance_list = []
51 |   for reservation in response['Reservations']:
52 |     for instance in reservation['Instances']:
53 |       instance_list.append((toseconds(instance['LaunchTime']), instance))
54 | 
55 |   import pytz
56 |   from tzlocal import get_localzone  # $ pip install tzlocal
57 | 
58 |   sorted_instance_list = sorted(instance_list, key=itemgetter(0))
59 |   cmd = ''
60 |   for (ts, instance) in reversed(sorted_instance_list):
61 |     if fragment in instance['InstanceId']:
62 | 
63 |       localtime = instance['LaunchTime'].astimezone(get_localzone())
64 |       keyname = instance.get('KeyName','none')
65 |       print("Connecting to %s launched at %s with key %s" % (instance['InstanceId'], localtime, keyname))
66 |       cmd = "ssh -i $HOME/Dropbox/yaroslav.pem -o StrictHostKeyChecking=no ubuntu@"+instance['PublicIpAddress']
67 |       break
68 |   if not cmd:
69 |     print("no instance id contains fragment '%s'"%(fragment,))
70 |   else:
71 |     print(cmd)
72 |     os.system(cmd)
73 | 
74 | 
75 | 
76 | if __name__=='__main__':
77 |   main()
78 | 
-------------------------------------------------------------------------------- /cluster/delete_placement_groups.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # delete all placement groups
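# (Usage sketch, an editorial assumption rather than part of the original
# script: run `python delete_placement_groups.py` with AWS credentials
# configured for boto3; it deletes every placement group in the default
# region and prints the HTTP status code of each delete call.)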
4 | 
5 | import boto3
6 | 
7 | # {'PlacementGroups': [{'GroupName': 'gpu12',
8 | #                       'State': 'available',
9 | #                       'Strategy': 'cluster'},
10 | #   {'GroupName': 'gpu6', 'State': 'available', 'Strategy': 'cluster'},
11 | #   {'GroupName': 'gpu10', 'State': 'available', 'Strategy': 'cluster'},
12 | #   {'GroupName': 'gpu4', 'State': 'available', 'Strategy': 'cluster'},
13 | #   {'GroupName': 'cnn2', 'State': 'available', 'Strategy': 'cluster'},
14 | #   {'GroupName': 'gpu5', 'State': 'available', 'Strategy': 'cluster'},
15 | #   {'GroupName': 'gpu3', 'State': 'available', 'Strategy': 'cluster'},
16 | #   {'GroupName': 'tf', 'State': 'available', 'Strategy': 'cluster'},
17 | #   {'GroupName': 'gpu7', 'State': 'available', 'Strategy': 'cluster'},
18 | #   {'GroupName': 'gpu11', 'State': 'available', 'Strategy': 'cluster'},
19 | #   {'GroupName': 'gpu8', 'State': 'available', 'Strategy': 'cluster'},
20 | #   {'GroupName': 'gpu9', 'State': 'available', 'Strategy': 'cluster'},
21 | #   {'GroupName': 'cnn', 'State': 'available', 'Strategy': 'cluster'}],
22 | #  'ResponseMetadata': {'HTTPHeaders': {'content-type': 'text/xml;charset=UTF-8',
23 | #                                       'date': 'Tue, 28 Nov 2017 18:52:18 GMT',
24 | #                                       'server': 'AmazonEC2',
25 | #                                       'transfer-encoding': 'chunked',
26 | #                                       'vary': 'Accept-Encoding'},
27 | #                      'HTTPStatusCode': 200,
28 | #                      'RequestId': '3d7adfe7-1109-413d-9aab-2f0aeafef968',
29 | #                      'RetryAttempts': 0}}
30 | 
31 | 
32 | ec2 = boto3.client('ec2')
33 | 
34 | result=ec2.describe_placement_groups()
35 | #print(result)
36 | for entry in result["PlacementGroups"]:
37 |   name = entry.get('GroupName', '---')
38 |   try:
39 |     print("Deleting "+name)
40 |     response = ec2.delete_placement_group(GroupName=name)
41 |     print("Response was %d" %(response['ResponseMetadata']['HTTPStatusCode']))
42 |   except Exception as e:
43 |     print("Failed with %s"%(e,))
44 | 
-------------------------------------------------------------------------------- /cluster/fill_efs.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import numpy as np
4 | import math
5 | import argparse
6 | 
7 | parser = argparse.ArgumentParser(description='script to fill EFS with data')
8 | 
9 | parser.add_argument('--gb', type=int, default=100, metavar='N',
10 |                     help='how many GBs to dump')
11 | parser.add_argument('--chunk_gb', type=int, default=1, metavar='N',
12 |                     help='size of each chunk in GBs')
13 | parser.add_argument('--fn', type=str, default="fill", metavar='N',
14 |                     help='filename')
15 | args = parser.parse_args()
16 | 
17 | def main():
18 |   chunk_size = int(args.chunk_gb*1e9)  # np.random.bytes requires an int length
19 |   current_size = 0
20 | 
21 |   file_counter = 0
22 |   max_file_counter = int(math.ceil(args.gb/args.chunk_gb))
23 |   while current_size < args.gb*1e9:
24 |     fn = args.fn+"-%05d-of-%05d"%(file_counter, max_file_counter)
25 |     file_counter+=1
26 |     with open(fn, 'wb') as out:
27 |       out.write(np.random.bytes(chunk_size))
28 |     print("Wrote %5.1f GBs"%(current_size/1e9))
29 |     current_size+=chunk_size
30 | 
31 | if __name__=='__main__':
32 |   main()
33 | 
-------------------------------------------------------------------------------- /cluster/imagenet64/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3 2 | paramiko 3 | pyyaml 4 | tensorflow-gpu==1.4 5 | -------------------------------------------------------------------------------- /cluster/instance_info.py: --------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | 
3 | import boto3
4 | 
5 | """
6 | A tool for retrieving basic information from the running EC2 instances.
7 | """
8 | 
9 | # Connect to EC2
10 | ec2 = boto3.resource('ec2')
11 | 
12 | # Get information for all running instances
13 | running_instances = ec2.instances.filter(Filters=[{
14 |     'Name': 'instance-state-name',
15 |     'Values': ['running']}])
16 | 
17 | ec2info = defaultdict()
18 | for instance in running_instances:
19 |   for tag in instance.tags or []:
20 |     if 'Name' in tag['Key']:
21 |       name = tag['Value']
22 |   if name != 'tf':
23 |     continue
24 |   # Add instance info to a dictionary
25 |   ec2info[instance.id] = {
26 |       'Name': name,
27 |       'Type': instance.instance_type,
28 |       'State': instance.state['Name'],
29 |       'Private IP': instance.private_ip_address,
30 |       'Public IP': instance.public_ip_address,
31 |       'Launch Time': instance.launch_time
32 |   }
33 | 
34 | attributes = ['Name', 'Type', 'State', 'Private IP', 'Public IP', 'Launch Time']
35 | for instance_id, instance in ec2info.items():
36 |   for key in attributes:
37 |     print("{0}: {1}".format(key, instance[key]))
38 |   print("------")
39 | 
40 | 
-------------------------------------------------------------------------------- /cluster/launch_simple_tf.py: --------------------------------------------------------------------------------
1 | # simple example of launching tensorflow job
2 | 
3 | import time
4 | import tensorflow as tf
5 | 
6 | flags = tf.flags
7 | flags.DEFINE_string("role", "launcher", "either launcher or worker")
8 | flags.DEFINE_integer("data_mb", 128, "size of vector in MBs")
9 | flags.DEFINE_integer("iters_per_step", 10, "number of additions per step")
10 | flags.DEFINE_string("cluster", "aws", "where to run (aws or local)")
11 | FLAGS = flags.FLAGS
12 | 
13 | 
14 | def main():
15 |   if FLAGS.role == "launcher":
16 |     launcher()
17 |   elif FLAGS.role == "worker":
18 |     worker()
19 |   else:
20 |     assert False, "Unknown role "+FLAGS.role
21 | 
22 | 
23 | def launcher(do_local=False):
24 |   if FLAGS.cluster == 'local':
25 |     import tmux
26 |     job = tmux.tf_job('myjob', 1)
27 |   elif FLAGS.cluster == 'aws':
28 |     import aws
29 |     job = aws.tf_job('myjob', 1)
30 |   else:
31 |     assert False, "Unknown cluster "+FLAGS.cluster
32 | 
33 |   task = job.tasks[0]
34 |   task.upload(__file__)  # copies current script onto machine
35 |   setup_cmd = ("source ~/.bashrc && export PATH=~/anaconda3/bin:$PATH && "
36 |                "source activate tf")
37 |   task.run("%s && python %s --role=worker" % (setup_cmd, __file__,))
38 | 
39 |   print("To see the output: tail -f %s" %(task.last_stdout))
40 |   print("To interact with the task, do "+task.connect_instructions)
41 | 
42 | 
43 | def worker():
44 |   """Worker script that runs on AWS machine. Adds vectors of ones forever,
45 |   prints MB/s."""
46 | 
47 |   def session_config():
48 |     optimizer_options = tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)
49 |     config = tf.ConfigProto(
50 |         graph_options=tf.GraphOptions(optimizer_options=optimizer_options))
51 |     config.operation_timeout_in_ms = 10*1000  # abort after 10 seconds
52 |     return config
53 | 
54 |   params_size = 250*1000*FLAGS.data_mb  # 1MB is 250k floats
55 |   dtype=tf.float32
56 |   val = tf.ones((), dtype=dtype)
57 |   vals = tf.fill([params_size], val)
58 |   params = tf.Variable(vals)
59 |   update = params.assign_add(vals)
60 | 
61 |   sess = tf.Session(config=session_config())
62 |   sess.run(params.initializer)
63 | 
64 |   while True:
65 |     start_time = time.perf_counter()
66 |     for i in range(FLAGS.iters_per_step):
67 |       sess.run(update.op)
68 | 
69 |     elapsed_time = time.perf_counter() - start_time
70 |     rate = float(FLAGS.iters_per_step)*FLAGS.data_mb/elapsed_time
71 |     print('%.2f MB/s'%(rate,))
72 | 
73 | 
74 | if __name__=='__main__':
75 |   main()
76 | 
-------------------------------------------------------------------------------- /cluster/local_distributed_benchmark.py: --------------------------------------------------------------------------------
1 | """Benchmark tensorflow distributed by adding vector of ones on worker2
2 | to variable on worker1 as fast as possible.
3 | On a 2014 MacBook with TensorFlow 0.10 this shows:
4 | Local rate: 2175.28 MB per second
5 | Distributed rate: 107.13 MB per second
6 | """
7 | 
8 | import subprocess
9 | import tensorflow as tf
10 | import time
11 | import sys
12 | 
13 | flags = tf.flags
14 | flags.DEFINE_integer("iters", 10, "Maximum number of additions")
15 | flags.DEFINE_integer("data_mb", 100, "size of vector in MBs")
16 | flags.DEFINE_string("port1", "12224", "port of worker1")
17 | flags.DEFINE_string("port2", "12225", "port of worker2")
18 | flags.DEFINE_string("task", "", "internal use")
19 | FLAGS = flags.FLAGS
20 | 
21 | # setup local cluster from flags
22 | host = "127.0.0.1:"
23 | cluster = {"worker": [host+FLAGS.port1, host+FLAGS.port2]}
24 | clusterspec = tf.train.ClusterSpec(cluster).as_cluster_def()
25 | 
26 | def default_config():
27 |   optimizer_options = tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)
28 |   config = tf.ConfigProto(
29 |       graph_options=tf.GraphOptions(optimizer_options=optimizer_options))
30 |   config.log_device_placement = False
31 |   config.allow_soft_placement = False
32 |   return config
33 | 
34 | def create_graph(device1, device2):
35 |   """Create graph that keeps variable on device1 and
36 |   vector of ones/addition op on device2"""
37 | 
38 |   tf.reset_default_graph()
39 |   dtype=tf.int32
40 |   params_size = 250*1000*FLAGS.data_mb  # 1MB is 250k integers
41 | 
42 |   with tf.device(device1):
43 |     params = tf.get_variable("params", [params_size], dtype,
44 |                              initializer=tf.zeros_initializer)
45 |   with tf.device(device2):
46 |     # constant node gets placed on device1 because of simple_placer
47 |     # update = tf.constant(1, shape=[params_size], dtype=dtype)
48 |     update = tf.get_variable("update", [params_size], dtype,
49 |                              initializer=tf.ones_initializer)
50 |     add_op = params.assign_add(update)
51 | 
52 |   init_op = tf.initialize_all_variables()
53 |   return init_op, add_op
54 | 
55 | def run_benchmark(sess, init_op, add_op):
56 |   """Returns MB/s rate of addition."""
57 | 
58 |   sess.run(init_op)
59 |   sess.run(add_op.op)  # warm-up
60 |   start_time = time.time()
61 |   for i in range(FLAGS.iters):
62 |     # change to add_op.op to make faster
63 |     sess.run(add_op)
64 |   elapsed_time = time.time() - start_time
65 |   return float(FLAGS.iters)*FLAGS.data_mb/elapsed_time
66 | 
67 | 
68 | def run_benchmark_local():
69 |   ops = create_graph(None, None)
70 |   sess = tf.Session(config=default_config())
71 |   return run_benchmark(sess, *ops)
72 | 
73 | 
74 | def run_benchmark_distributed():
75 |   ops = create_graph("/job:worker/task:0", "/job:worker/task:1")
76 | 
77 |   # launch distributed service
78 |   def runcmd(cmd): subprocess.Popen(cmd, shell=True, stderr=subprocess.STDOUT)
79 |   runcmd("python %s --task=0"%(sys.argv[0]))
80 |   runcmd("python %s --task=1"%(sys.argv[0]))
81 |   time.sleep(1)
82 | 
83 |   sess = tf.Session("grpc://"+host+FLAGS.port1, config=default_config())
84 |   return run_benchmark(sess, *ops)
85 | 
86 | if __name__=='__main__':
87 |   if not FLAGS.task:
88 | 
89 |     rate1 = run_benchmark_local()
90 |     rate2 = run_benchmark_distributed()
91 | 
92 |     print("Adding data in %d MB chunks" %(FLAGS.data_mb))
93 |     print("Local rate: %.2f MB per second" %(rate1,))
94 |     print("Distributed rate: %.2f MB per second" %(rate2,))
95 | 
96 |   else:  # Launch TensorFlow server
97 |     server = tf.train.Server(clusterspec, config=default_config(),
98 |                              job_name="worker",
99 |                              task_index=int(FLAGS.task))
100 |     server.join()
101 | 
-------------------------------------------------------------------------------- /cluster/myutil.py: --------------------------------------------------------------------------------
1 | from pprint import pprint as pp
2 | import yaml
3 | #import util
4 | import boto3
5 | from collections import OrderedDict
6 | import time
7 | 
8 | class timeit:
9 |   """Context manager that measures the time spent in the block in seconds
10 |   and prints it."""
11 | 
12 |   def __init__(self, tag=""):
13 |     self.tag = tag
14 | 
15 |   def __enter__(self):
16 |     self.start = time.perf_counter()
17 |     return self
18 | 
19 |   def __exit__(self, *args):
20 |     self.end = time.perf_counter()
21 |     interval_sec = (self.end - self.start)
22 |     print("%s took %.2f seconds"%(self.tag, interval_sec))
23 | 
24 | def get_instance_ip_map():
25 |   """Return instance_id->private_ip map for all running instances."""
26 | 
27 |   ec2 = boto3.resource('ec2')
28 | 
29 |   # Get information for all running instances
30 |   running_instances = ec2.instances.filter(Filters=[{
31 |       'Name': 'instance-state-name',
32 |       'Values': ['running']}])
33 | 
34 |   ec2info = OrderedDict()
35 |   for instance in running_instances:
36 |     name = ''
37 |     for tag in instance.tags or []:
38 |       if 'Name' in tag['Key']:
39 |         name = tag['Value']
40 |     ec2info[instance.id] = instance.private_ip_address
41 | 
42 |   return ec2info
43 | 
-------------------------------------------------------------------------------- /cluster/terminate_instances.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | 
4 | Script to kill all instances matching given prefix.
5 | 6 | Usage: 7 | 8 | ./terminate_instances.py gpu # terminates all instances matching "gpu*" 9 | """ 10 | 11 | import boto3 12 | import time 13 | import sys 14 | import os 15 | 16 | LIMIT_TO_KEY = 'yaroslav' # only touch instances launched with this key, 17 | # set to '' to remove restriction 18 | 19 | def main(): 20 | prefix = sys.argv[1] 21 | 22 | ec2 = boto3.client('ec2') 23 | response = ec2.describe_instances() 24 | 25 | def get_name(instance_response): 26 | names = [entry['Value'] for entry in instance_response.get('Tags',[]) if 27 | entry['Key']=='Name'] 28 | if not names: 29 | names = [''] 30 | assert len(names)==1 31 | return names[0] 32 | 33 | instance_list = [] 34 | for reservation in response['Reservations']: 35 | for instance_response in reservation['Instances']: 36 | instance_list.append((get_name(instance_response), 37 | instance_response)) 38 | 39 | instances_to_kill = [] 40 | for (name, instance_response) in instance_list: 41 | if not name.startswith(prefix): 42 | continue 43 | key = instance_response.get('KeyName', '') 44 | if LIMIT_TO_KEY and LIMIT_TO_KEY != key: 45 | print("instance %s matches but key %s doesn't match desired key %s, " 46 | "skipping" %(name, key, LIMIT_TO_KEY)) 47 | continue 48 | state = instance_response['State']['Name'] 49 | if state == 'terminated': 50 | continue 51 | instances_to_kill.append((instance_response['InstanceId'], 52 | name, 53 | instance_response['AmiLaunchIndex'], 54 | state)) 55 | 56 | for (instance_id, name, task_id, state) in instances_to_kill: 57 | print("%s:%s %s"%(name, task_id, state)) 58 | 59 | answer = input("%d instances found, terminate? (Y/n) " % ( 60 | len(instances_to_kill))) 61 | if not answer: 62 | answer = "y" 63 | if answer.lower() == "y": 64 | instance_ids = [record[0] for record in instances_to_kill] 65 | response = ec2.terminate_instances(InstanceIds=instance_ids) 66 | print("Terminating, got response: %s" % (response,)) 67 | else: 68 | print("Didn't get y, doing nothing") 69 | 70 | 71 | if __name__=='__main__': 72 | main() 73 | -------------------------------------------------------------------------------- /cluster/test_aws.py: -------------------------------------------------------------------------------- 1 | # simple example of launching tensorflow job 2 | 3 | import aws 4 | import os 5 | import sys 6 | import time 7 | import tensorflow as tf 8 | import boto3 9 | 10 | flags = tf.flags 11 | flags.DEFINE_string("role", "launcher", "either launcher or worker") 12 | flags.DEFINE_integer("data_mb", 128, "size of vector in MBs") 13 | flags.DEFINE_integer("iters_per_step", 10, "number of additions per step") 14 | FLAGS = flags.FLAGS 15 | 16 | module_path=os.path.dirname(os.path.abspath(__file__)) 17 | sys.path.insert(0, module_path+'/tf-tools/benchmark/runner') 18 | import cluster_aws as toby_aws 19 | 20 | 21 | def test_new_job(): 22 | name = "testjob" 23 | instances = toby_aws.LookupAwsInstances(instance_tag=name) 24 | assert not instances, "Instances already exist, kill them first" 25 | 26 | job = aws.tf_job(name, 2) 27 | instances = toby_aws.LookupAwsInstances(instance_tag=name) 28 | assert len(instances) == 2 29 | 30 | def test_terminate_job(): 31 | aws.terminate_job("testjob") 32 | 33 | 34 | def test_reuse_job(): 35 | name = "testjob" 36 | job = aws.tf_job(name, 2) 37 | 38 | def test_send_file(): 39 | name = "testjob" 40 | job = aws.tf_job(name, 4) 41 | job.wait_until_ready() 42 | task0 = job.tasks[0] 43 | secret_word = "testfile3" 44 | os.system("echo '%s' > upload_test.txt"%(secret_word,)) 45 | 
task0.upload('upload_test.txt') 46 | stdout,stderr = task0.run_sync("cat upload_test.txt") 47 | print(stdout) # => testfile3 48 | assert stdout.strip() == secret_word 49 | 50 | def test_upload_directory(): 51 | pass 52 | 53 | def test_stream_output(): 54 | name = "testjob" 55 | job = aws.tf_job(name, 4) 56 | job.wait_until_ready() 57 | task = job.tasks[0] 58 | task.run('cd Dropbox && ls') 59 | time.sleep(0.5) # async ... todo: expose thread and join instead of sleep? 60 | os.system('cat '+task.last_stdout) 61 | 62 | 63 | def main(): 64 | # test_terminate_job() 65 | # test_new_job() 66 | # test_reuse_job() 67 | # test_send_file() 68 | test_stream_output() 69 | 70 | if __name__=='__main__': 71 | main() 72 | -------------------------------------------------------------------------------- /cluster/tf-tools/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .DS_Store 3 | -------------------------------------------------------------------------------- /cluster/tf-tools/benchmark/multi_gpu/advanced_tweaks_compare.sh: -------------------------------------------------------------------------------- 1 | # Showing NCHW vs NHWC, NCCL and parameter server GPU vs CPU 2 | _NUM_GPUS=1,2,8 3 | LOG_FOLDER=advanced_tests 4 | 5 | # PS GPU vs. CPU NHWC 6 | ./test_runner.sh --model resnet50 --num_batches 100 --batch_size 32 --gpus ${_NUM_GPUS} --ps_server gpu --variable_update send_recv --data_format NHWC --log_folder_prefix ${LOG_FOLDER} --framework tensorflow 7 | ./test_runner.sh --model resnet50 --num_batches 100 --batch_size 32 --gpus ${_NUM_GPUS} --ps_server cpu --variable_update send_recv --data_format NHWC --log_folder_prefix ${LOG_FOLDER} --framework tensorflow 8 | 9 | # NCHW vs NHWC (GPU PS) 10 | ./test_runner.sh --model resnet50 --num_batches 100 --batch_size 32 --gpus ${_NUM_GPUS} --ps_server gpu --data_format NCHW --variable_update send_recv --log_folder_prefix ${LOG_FOLDER} --framework tensorflow 11 | ./test_runner.sh --model resnet50 --num_batches 100 --batch_size 32 --gpus ${_NUM_GPUS} --ps_server gpu --data_format NHWC --variable_update send_recv --log_folder_prefix ${LOG_FOLDER} --framework tensorflow 12 | 13 | # Add NCCL to NCHW (GPU PS) 14 | ./test_runner.sh --model resnet50 --num_batches 100 --batch_size 32 --gpus ${_NUM_GPUS} --ps_server gpu --data_format NCHW --variable_update replicated --log_folder_prefix ${LOG_FOLDER} --framework tensorflow 15 | 16 | 17 | -------------------------------------------------------------------------------- /cluster/tf-tools/benchmark/multi_gpu/image_classification_bench_tests.sh: -------------------------------------------------------------------------------- 1 | # Runs tests for an 8 GPU server 2 | _NUM_GPUS=1,2,4,8 3 | # Inception v3 4 | ./test_runner.sh --model inception3 --num_batches 100 --batch_size 32 --gpus ${_NUM_GPUS} --framework tensorflow 5 | ./test_runner.sh --model inception3 --num_batches 4 --batch_size 32 --gpus ${_NUM_GPUS} --framework mxnet 6 | 7 | 8 | # Resnet-50 9 | ./test_runner.sh --model resnet50 --num_batches 100 --batch_size 32 --gpus ${_NUM_GPUS} --framework tensorflow 10 | ./test_runner.sh --model resnet50 --num_batches 4 --batch_size 32 --gpus ${_NUM_GPUS} --framework mxnet 11 | 12 | 13 | # Resnet-152 14 | ./test_runner.sh --model resnet152 --num_batches 100 --batch_size 32 --gpus ${_NUM_GPUS} --framework tensorflow 15 | ./test_runner.sh --model resnet152 --num_batches 4 --batch_size 32 --gpus ${_NUM_GPUS} --framework mxnet 16 | 17 | 18 | # AlexNet (OWT) 19 | # AlexNet 
script is broken on MXNet. 20 | #./test_runner.sh --model alexnet --num_batches 4 --batch_size 512 --gpus ${_NUM_GPUS} --framework mxnet 21 | ./test_runner.sh --model alexnet --num_batches 100 --batch_size 512 --gpus ${_NUM_GPUS} --framework tensorflow 22 | ./test_runner.sh --model alexnet --num_batches 100 --batch_size 128 --gpus ${_NUM_GPUS} --framework tensorflow 23 | -------------------------------------------------------------------------------- /cluster/tf-tools/benchmark/multi_gpu/stats_monitor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # Get all nvidia-smi data worth having 5 | # There is no historical data so calling this after a run 6 | # when the GPU may no longer be throttled is of no value. 7 | 8 | KEEP_LOOP=true 9 | LOG_FULL_PATH="./monitor_log.txt" 10 | LOG_SUMMARY_FULL_PATH="./log_summary.txt" 11 | 12 | 13 | while [[ $# -gt 0 ]]; do 14 | key="$1" 15 | # echo $key 16 | # echo $2 17 | case $key in 18 | --log_full_path) 19 | LOG_FULL_PATH="$2" # location to log raw monitoring logs, e.g. nvidia-smi 20 | shift 21 | ;; 22 | --log_summary_full_path) 23 | LOG_SUMMARY_FULL_PATH="$2" # location to write the summary, e.g. max GPUs throttled 24 | shift 25 | ;; 26 | *) 27 | echo "Unknown flag: $key" 28 | ;; 29 | esac 30 | shift # past argument or value 31 | done 32 | 33 | 34 | 35 | 36 | MAX_SLOWDOWN_GPUS=0 37 | # Handle CTRL-C or other term signal, log the max number of GPUs that showed 38 | # errors; for now that means "HW Slowdown: Active" 39 | function summarizeCleanup { 40 | echo "Max GPUs throttled: ${MAX_SLOWDOWN_GPUS}" 41 | echo "Max GPUs throttled: ${MAX_SLOWDOWN_GPUS}" >> $LOG_SUMMARY_FULL_PATH 42 | exit; 43 | } 44 | 45 | # catch being asked to end 46 | trap summarizeCleanup SIGINT SIGTERM 47 | 48 | # Log nvidia-smi data forever (until killed externally) and track when HW Slowdown: Active occurs, 49 | # which indicates overheating and a likely lower clock. 50 | while [ "$KEEP_LOOP" = "true" ]; do 51 | 52 | RESULT=$(nvidia-smi -q -d UTILIZATION,CLOCK,PERFORMANCE | tee -a ${LOG_FULL_PATH} | \ 53 | grep -E 'HW Slowdown' | awk '!/Not Active/ {count++} END{print count}') 54 | 55 | # Handle result being blank. There is likely a better way to do this with awk above. 56 | if [ "$RESULT" = "" ]; then 57 | RESULT=0 58 | fi 59 | 60 | if [ "$RESULT" -gt "$MAX_SLOWDOWN_GPUS" ]; then 61 | MAX_SLOWDOWN_GPUS=$RESULT 62 | echo "$MAX_SLOWDOWN_GPUS GPU(s) with slowdown" 63 | fi 64 | 65 | 66 | # 10 seconds seems to be reasonable. 67 | sleep 10 68 | 69 | done 70 | 71 | -------------------------------------------------------------------------------- /cluster/tf-tools/benchmark/multi_gpu/unit_test_stats_monitor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./monitor_nvidia.sh --log_full_path ./full_log.txt --log_summary_full_path ./log_summary.txt & 4 | 5 | NVIDIA_MONITOR=$! 6 | 7 | echo "Log monitor pid ${NVIDIA_MONITOR}" 8 | 9 | sleep 12 10 | 11 | kill $NVIDIA_MONITOR 12 | wait $NVIDIA_MONITOR 13 | echo "Success: ${NVIDIA_MONITOR} is no longer running" 14 | 15 | # put this in any script to hard kill monitor_nvidia 16 | echo "Test killing with pgrep" 17 | ./monitor_nvidia.sh --log_full_path ./full_log --log_summary_full_path ./log_summary & 18 | NVIDIA_MONITOR_2=$!
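# ($! above captures the PID of the most recently backgrounded command; the sleep below gives the monitor time to write a few samples before pgrep kills it)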
19 | 20 | sleep 12 21 | 22 | echo "Log monitor pid ${NVIDIA_MONITOR_2}" 23 | echo "kill with pgrep" 24 | pgrep "monitor_nvidia" | xargs kill 25 | 26 | echo "Wait until dead" 27 | wait $NVIDIA_MONITOR_2 28 | echo "Process is dead: Test successful" 29 | 30 | -------------------------------------------------------------------------------- /cluster/tf-tools/benchmark/runner/configs/aws/multi_server.yaml: -------------------------------------------------------------------------------- 1 | # Run config 2 | cloud_type: aws 3 | 4 | tf_url: tensorflow-gpu 5 | 6 | # Shared with AWS and GCE 7 | instance_tag: tf-monster 8 | instance_type: p2.8xlarge 9 | instance_force_reuse: False 10 | instance_ami: ami-xxxxxxx 11 | instance_count: 8 12 | #instance_on_finish: stop 13 | 14 | # As of May 2017 this config matches what was published on tf.org. 15 | # For batch-size 32, 4 ps_servers is the right setting for 8 workers 16 | run_configs: 17 | - name: distributed 18 | workers: 8 19 | ps_servers: 8 20 | gpus: 8 21 | models: ['resnet50'] 22 | ps_server: gpu 23 | data_format: NCHW 24 | variable_update: distributed_replicated 25 | log_folder: results 26 | framework: tensorflow 27 | num_batches: 100 28 | batch_size: 64 29 | repeat: 5 30 | cross_replica_sync: True 31 | optimizer: sgd 32 | 33 | 34 | #### 35 | # Full run 32 GPUs down to 1 GPU with ps_servers tuned for resnet50 36 | ####### 37 | 38 | - name: distributed 39 | workers: 4 40 | ps_servers: 4 41 | 42 | - name: distributed 43 | workers: 2 44 | ps_servers: 2 45 | 46 | - name: distributed 47 | workers: 1 48 | ps_servers: 1 49 | 50 | - name: distributed 51 | workers: '0' 52 | ps_servers: '0' 53 | gpus: 1 54 | -------------------------------------------------------------------------------- /cluster/tf-tools/benchmark/runner/configs/aws/yaroslav.yaml: -------------------------------------------------------------------------------- 1 | # Run config 2 | cloud_type: aws 3 | 4 | tf_url: tensorflow-gpu 5 | 6 | instance_tag: yaroslav 7 | instance_type: p2.xlarge 8 | instance_force_reuse: False 9 | instance_ami: ami-60df1418 10 | instance_count: 8 11 | #instance_on_finish: stop 12 | 13 | run_configs: 14 | - name: distributed 15 | workers: 1 16 | ps_servers: 1 17 | gpus: 1 18 | models: ['resnet50'] 19 | ps_server: gpu 20 | data_format: NCHW 21 | variable_update: distributed_replicated 22 | log_folder: results 23 | framework: tensorflow 24 | num_batches: 100 25 | batch_size: 64 26 | repeat: 1 27 | cross_replica_sync: True 28 | optimizer: sgd 29 | 30 | -------------------------------------------------------------------------------- /cluster/tf-tools/benchmark/runner/instance_info.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | import boto3 4 | 5 | """ 6 | A tool for retrieving basic information from the running EC2 instances. 
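Only instances whose Name tag is 'tf' are reported; each one's type, state, private/public IPs, and launch time are printed.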
7 | """ 8 | 9 | # Connect to EC2 10 | ec2 = boto3.resource('ec2') 11 | 12 | # Get information for all running instances 13 | running_instances = ec2.instances.filter(Filters=[{ 14 | 'Name': 'instance-state-name', 15 | 'Values': ['running']}]) 16 | 17 | ec2info = defaultdict() 18 | for instance in running_instances: 19 | for tag in instance.tags or []: 20 | if 'Name' in tag['Key']: 21 | name = tag['Value'] 22 | if name != 'tf': 23 | continue 24 | # Add instance info to a dictionary 25 | ec2info[instance.id] = { 26 | 'Name': name, 27 | 'Type': instance.instance_type, 28 | 'State': instance.state['Name'], 29 | 'Private IP': instance.private_ip_address, 30 | 'Public IP': instance.public_ip_address, 31 | 'Launch Time': instance.launch_time 32 | } 33 | 34 | attributes = ['Name', 'Type', 'State', 'Private IP', 'Public IP', 'Launch Time'] 35 | for instance_id, instance in ec2info.items(): 36 | for key in attributes: 37 | print("{0}: {1}".format(key, instance[key])) 38 | print("------") 39 | 40 | -------------------------------------------------------------------------------- /cluster/tf-tools/benchmark/runner/test_cluster_aws.py: -------------------------------------------------------------------------------- 1 | from command_builder import * 2 | from pprint import pprint as pp 3 | import yaml 4 | import cluster_aws 5 | 6 | from collections import OrderedDict 7 | import time 8 | 9 | AMI='ami-60df1418' # cuda 8 10 | AMI='ami-9ddb0fe5' # boyd base 11 | KEY_NAME='yaroslav' 12 | KEY_FILE=os.environ['HOME']+'/d/yaroslav.pem' 13 | SECURITY_GROUP='open' 14 | #INSTANCE_TYPE='g3.16xlarge' 15 | INSTANCE_TYPE='p2.8xlarge' 16 | TAG='tf' 17 | 18 | global_timeit_dict = OrderedDict() 19 | class timeit: 20 | """Context manager that measures time spent in the block in seconds and 21 | prints it.""" 22 | 23 | def __init__(self, tag=""): 24 | self.tag = tag 25 | 26 | def __enter__(self): 27 | self.start = time.perf_counter() 28 | return self 29 | 30 | def __exit__(self, *args): 31 | self.end = time.perf_counter() 32 | interval_sec = (self.end - self.start) 33 | print("%s took %.2f seconds"%(self.tag, interval_sec)) 34 | 35 | def test_two_machine(): 36 | pass 37 | 38 | 39 | def main(): 40 | FIRST_TIME = False 41 | 42 | if FIRST_TIME: 43 | with timeit('create_instances'): 44 | instances = cluster_aws.CreateAwsInstances(num_instances=2, 45 | image_id=AMI, 46 | key_name=KEY_NAME, 47 | ssh_key=KEY_FILE, 48 | security_group=SECURITY_GROUP, 49 | instance_tag=TAG, 50 | placement_group='', 51 | instance_type=INSTANCE_TYPE) 52 | else: 53 | instances = cluster_aws.LookupAwsInstances(instance_tag=TAG, 54 | ssh_key=KEY_FILE) 55 | # Exception connecting to host via ssh (could be a timeout): 56 | 57 | 58 | 59 | with timeit('connect'): 60 | instance = instances[0] 61 | instance.WaitUntilReady() 62 | 63 | 64 | def line_extractor(line): 65 | return True 66 | 67 | instance.ExecuteCommandAndStreamOutput('mkdir 43', 68 | stdout_file='/tmp/output') 69 | instance.ExecuteCommandAndStreamOutput('ls', stdout_file='/tmp/output') 70 | 71 | import pdb; pdb.set_trace() 72 | 73 | 74 | if __name__=='__main__': 75 | main() 76 | -------------------------------------------------------------------------------- /cluster/tf-tools/benchmark/runner/test_command_builder.py: -------------------------------------------------------------------------------- 1 | from command_builder import * 2 | from pprint import pprint as pp 3 | import yaml 4 | 5 | def main(): 6 | 7 | 8 | with open('configs/aws/yaroslav.yaml') as stream: 9 | config_yaml = 
yaml.load(stream) 10 | 11 | configs = LoadYamlRunConfig(config_yaml, 1) 12 | # pp(configs) 13 | 14 | config = configs[0] 15 | 16 | worker_hosts = ['1','2'] 17 | worker_hosts_str = ','.join(worker_hosts) 18 | ps_hosts = ['a','b'] 19 | ps_hosts_str = ','.join(ps_hosts) 20 | for i,worker in enumerate(worker_hosts): 21 | print(BuildDistributedCommandWorker(config, worker_hosts_str, ps_hosts_str, i)) 22 | 23 | for i,worker in enumerate(ps_hosts): 24 | print(BuildDistributedCommandPS(config, worker_hosts_str, ps_hosts_str, i)) 25 | 26 | 27 | 28 | 29 | 30 | if __name__=='__main__': 31 | main() 32 | -------------------------------------------------------------------------------- /cluster/upload_test.txt: -------------------------------------------------------------------------------- 1 | testfile3 2 | -------------------------------------------------------------------------------- /conditional_backprop.py: -------------------------------------------------------------------------------- 1 | # Example of conditionally enabling backprop based on a variable. 2 | # variable "switches" determines which entries of "y" will be backpropagated 3 | # through. 4 | # 5 | # IE, switches.assign([1,0]) enables backprop through first value but not 6 | # second. 7 | # 8 | # Running it you should see following on stdout: 9 | # Value 2.0, gradient 2.0 10 | # Value 2.0, gradient 0.0 11 | # Value 2.0, gradient 1.0 12 | 13 | import tensorflow as tf 14 | 15 | def conditional_backprop(do_backprop, tensor): 16 | do_backprop = tf.Print(do_backprop, [do_backprop], "switch query") 17 | t = tf.cond(tf.cast(do_backprop, tf.bool), 18 | lambda: tf.Print(tensor, [0], 19 | "backprop enabled for "+tensor.op.name), 20 | lambda: tf.zeros_like(tensor)) 21 | y = t + tf.stop_gradient(tensor - t) 22 | return y 23 | 24 | x = tf.ones((), name="x") 25 | y0 = tf.add(x, 0, name="y0") 26 | y1 = tf.add(x, 0, name="y1") 27 | 28 | switches = tf.Variable(tf.ones((2))) 29 | doit = tf.constant(True) 30 | yy0 = conditional_backprop(switches[0], y0) 31 | yy1 = conditional_backprop(switches[1], y1) 32 | y = tf.stack([yy0, yy1], name="y") 33 | 34 | z = tf.reduce_sum(y) 35 | 36 | grad = tf.gradients(z, [x])[0] 37 | 38 | sess = tf.Session() 39 | sess.run(tf.global_variables_initializer()) 40 | print("Value %.1f, gradient %.1f"%tuple(sess.run([z, grad]))) 41 | 42 | sess.run(switches.assign([0,0])) 43 | print("Value %.1f, gradient %.1f"%tuple(sess.run([z, grad]))) 44 | 45 | sess.run(switches.assign([1,0])) 46 | print("Value %.1f, gradient %.1f"%tuple(sess.run([z, grad]))) 47 | -------------------------------------------------------------------------------- /configure_tf.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/expect -d 2 | # Helper script that uses expect to automatically go through all configure 3 | # steps using the defaults for all options except 4 | # XLA: y 5 | # CUDA: y 6 | # compute capability: 3.5,5.0,6.0,6.1 7 | spawn ./configure 8 | expect "Please specify the location of python*" 9 | send "\r" 10 | expect "Please specify optimization flags to use during compilation when bazel option*" 11 | send "\r" 12 | expect "Do you wish to use jemalloc*" 13 | send "\r" 14 | expect "Do you wish to build TensorFlow with Google Cloud Platform*" 15 | send "\r" 16 | expect "Do you wish to build TensorFlow with Hadoop File System support*" 17 | send "\r" 18 | expect "Do you wish to build TensorFlow with the XLA*" 19 | send "y\r" 20 | expect "Please input the desired Python library*" 21 | send "\r" 22 | 
expect "Do you wish to build TensorFlow with OpenCL*" 23 | send "\r" 24 | expect "Do you wish to build TensorFlow with CUDA*" 25 | send "y\r" 26 | expect "Please specify which gcc should*" 27 | send "\r" 28 | expect "Please specify the CUDA SDK version you want to use*" 29 | send "\r" 30 | expect "Please specify the location where CUDA toolkit*" 31 | send "\r" 32 | expect "Please specify the Cudnn version*" 33 | send "\r" 34 | expect "Please specify the location where cuDNN" 35 | send "\r" 36 | expect "lease specify a list of comma-separated Cuda compute" 37 | send "3.5,5.2,6.0,6.1\r" 38 | set timeout 120 39 | expect eof 40 | -------------------------------------------------------------------------------- /configure_tf_cpu.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/expect -d 2 | # Helper script that uses expect to automatically go through all configure 3 | # steps using the defaults for all options except 4 | # XLA: y 5 | # CUDA: n 6 | spawn ./configure 7 | expect "Please specify the location of python*" 8 | send "\r" 9 | expect "Please specify optimization flags to use during compilation when bazel option*" 10 | send "\r" 11 | expect "Do you wish to use jemalloc*" 12 | send "\r" 13 | expect "Do you wish to build TensorFlow with Google Cloud Platform*" 14 | send "\r" 15 | expect "Do you wish to build TensorFlow with Hadoop File System support*" 16 | send "\r" 17 | expect "Do you wish to build TensorFlow with the XLA*" 18 | send "y\r" 19 | expect "Please input the desired Python library*" 20 | send "\r" 21 | expect "Do you wish to build TensorFlow with OpenCL*" 22 | send "\r" 23 | expect "Do you wish to build TensorFlow with CUDA*" 24 | send "\r" 25 | set timeout 120 26 | expect eof 27 | -------------------------------------------------------------------------------- /danjar_peek.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.python.client import timeline 3 | 4 | 5 | class Queue(tf.FIFOQueue): 6 | 7 | def __init__(self, capacity): 8 | s = () 9 | d = tf.int32 10 | super().__init__(capacity - 1, [d], [s]) 11 | self._first = tf.get_variable(name="var1", 12 | initializer=tf.ones_initializer(), 13 | shape=s, dtype=d, use_resource=False) 14 | self._size = tf.get_variable(name="size", shape=(), 15 | initializer=tf.zeros_initializer(), 16 | dtype=tf.int32, use_resource=False) 17 | 18 | def peek(self): 19 | return self._first.read_value() 20 | 21 | def enqueue(self, element): 22 | super_ = super() 23 | def first(): 24 | assigns = [self._first.assign(element)] 25 | with tf.control_dependencies(assigns): 26 | return tf.constant(0) 27 | 28 | def other(): 29 | with tf.control_dependencies([super_.enqueue(element)]): 30 | return tf.constant(0) 31 | 32 | with tf.control_dependencies([self._size.assign_add(1)]): 33 | dummy = tf.cond(tf.equal(self._size, 0), first, other) 34 | return tf.identity(dummy) 35 | 36 | 37 | queue = Queue(10) 38 | queue_peek = queue.peek() 39 | print("Peek op is "+str(queue_peek)) 40 | 41 | queue_init = queue.enqueue(tf.constant(-2)) 42 | 43 | 44 | print(tf.get_default_graph().as_graph_def()) 45 | for i in range(20): 46 | sess = tf.Session() 47 | sess.run(tf.global_variables_initializer()) 48 | sess.run(queue_init) 49 | print("queue size", sess.run(queue.size())) 50 | sess.run(queue.close()) 51 | 52 | # print("Printing queue") 53 | # while True: 54 | # print(sess.run(queue.dequeue())) 55 | 56 | run_options = 
tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) 57 | run_options.output_partition_graphs = True 58 | run_metadata = tf.RunMetadata() 59 | #import pdb; pdb.set_trace() 60 | # queue_peek, 61 | result = sess.run(queue_peek, run_metadata=run_metadata, 62 | options=run_options) 63 | 64 | tl = timeline.Timeline(run_metadata.step_stats) 65 | ctf = tl.generate_chrome_trace_format() 66 | with open('timeline-%d.json'%(i,), 'w') as f: 67 | f.write(ctf) 68 | with open('stepstats-%d.json'%(i,), 'w') as f: 69 | f.write(str(run_metadata)) 70 | 71 | print(result, end=' ') 72 | 73 | # Expected: 1 1 1 1 1 1 1 1 1 1 74 | # Actual: 0 1 0 0 1 1 0 0 0 1 75 | -------------------------------------------------------------------------------- /distributed/README.md: -------------------------------------------------------------------------------- 1 | TF distributed tools 2 | -------------------------------------------------------------------------------- /double_memory_bug.py: -------------------------------------------------------------------------------- 1 | # Troubleshooting 2 | # https://github.com/tensorflow/tensorflow/issues/13433#issuecomment-351722017 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | def sessrun(*args, **kwargs): 8 | """Helper to do sess.run and save run_metadata""" 9 | global sess, run_metadata 10 | 11 | run_metadata = tf.RunMetadata() 12 | 13 | kwargs['options'] = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) 14 | kwargs['run_metadata'] = run_metadata 15 | result = sess.run(*args, **kwargs) 16 | first_entry = args[0] 17 | # have to do this because sess.run(tensor) is same as sess.run([tensor]) 18 | if isinstance(first_entry, list): 19 | if len(first_entry) == 0 and len(args) == 1: 20 | return None 21 | first_entry = first_entry[0] 22 | 23 | import urllib.request 24 | response = urllib.request.urlopen("https://raw.githubusercontent.com/yaroslavvb/chain_constant_memory/master/mem_util.py") 25 | open("mem_util.py", "wb").write(response.read()) 26 | 27 | import mem_util 28 | 29 | 30 | dtype = tf.float32 31 | dtype_size = 4 # bytes 32 | #shape = (1000,1000*1000) 33 | shape = (100, 1000*1000) 34 | total_size = np.prod(shape)*dtype_size 35 | print("Variable with %.1f GB" %(total_size/1e9,)) 36 | w = tf.Variable(tf.random_uniform(shape,dtype=dtype),dtype=dtype) 37 | sess = tf.Session() 38 | sessrun(tf.global_variables_initializer()) 39 | print(sess.run(w[0,0])) 40 | 41 | mem_util.print_memory_timeline(run_metadata) 42 | -------------------------------------------------------------------------------- /dynamic_stitch_gpu.py: -------------------------------------------------------------------------------- 1 | # from https://github.com/tensorflow/tensorflow/issues/7251 2 | import os 3 | os.environ["CUDA_VISIBLE_DEVICES"]="0" 4 | 5 | import tensorflow as tf 6 | from tensorflow.python.client.timeline import Timeline 7 | 8 | with tf.device("/gpu:0"): 9 | x = tf.ones(100, name="x") 10 | idxs = tf.range(100) 11 | 12 | for i in range(10): 13 | y = tf.identity(x, name="identity-"+str(i)) 14 | x = tf.dynamic_stitch([idxs, idxs], [x, y], name="stitch-"+str(i)) 15 | 16 | config = tf.ConfigProto(graph_options=tf.GraphOptions(optimizer_options=tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0))) 17 | sess = tf.InteractiveSession(config=config) 18 | metadata = tf.RunMetadata() 19 | sess.run(x, options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE, 20 | output_partition_graphs=True), 21 | run_metadata=metadata) 22 | 23 | timeline = Timeline(metadata.step_stats) 24 | with 
open("dynamic_stitch_gpu_profile.json", "w") as f: 25 | f.write(timeline.generate_chrome_trace_format()) 26 | with open("dynamic_stitch_gpu_profile.pbtxt", "w") as f: 27 | f.write(str(metadata)) 28 | -------------------------------------------------------------------------------- /eager_lbfgs/common_gd.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | parser = argparse.ArgumentParser(description='PyTorch MNIST Example') 3 | 4 | parser.add_argument('--batch-size', type=int, default=60000, metavar='N', 5 | help='input batch size for training') 6 | parser.add_argument('--iters', type=int, default=100, metavar='N', 7 | help='number of iterations to run for (default: 20)') 8 | parser.add_argument('--lr', type=float, default=1.0, metavar='LR', 9 | help='learning rate (default: 1.0)') 10 | parser.add_argument('--no-cuda', action='store_true', default=False, 11 | help='disables CUDA training') 12 | parser.add_argument('--seed', type=int, default=1, metavar='S', 13 | help='random seed (default: 1)') 14 | parser.add_argument('--hidden-size', type=int, default=196, metavar='H', 15 | help='hidden size') 16 | parser.add_argument('--visible-size', type=int, default=784, metavar='V', 17 | help='visible-size') 18 | parser.add_argument('--gd', action='store_true', default=False, 19 | help='force run of gradient descent instead of lbfgs') 20 | parser.add_argument('--history', type=int, default=100, metavar='V', 21 | help='history buffer for lbfgs') 22 | args = parser.parse_args() 23 | -------------------------------------------------------------------------------- /eager_lbfgs/data/short_batch.csv: -------------------------------------------------------------------------------- 1 | 100 2 | 200 3 | 300 4 | -------------------------------------------------------------------------------- /eager_lbfgs/data/short_eager_batch.csv: -------------------------------------------------------------------------------- 1 | 10 2 | 100 3 | -------------------------------------------------------------------------------- /eager_lbfgs/data/short_eager_loss.csv: -------------------------------------------------------------------------------- 1 | 1.125071197748184204e-03 2 | 1.720546046271920204e-03 3 | 2.242934657260775566e-03 4 | -------------------------------------------------------------------------------- /eager_lbfgs/data/short_eager_time.csv: -------------------------------------------------------------------------------- 1 | 9.806975307874381542e-01 2 | 9.339727419428527355e-01 3 | 9.292591358534991741e-01 4 | -------------------------------------------------------------------------------- /eager_lbfgs/data/short_pytorch_loss.csv: -------------------------------------------------------------------------------- 1 | 1.125177601352334023e-03 2 | 1.720896689221262932e-03 3 | 2.242802875116467476e-03 4 | -------------------------------------------------------------------------------- /eager_lbfgs/data/short_pytorch_time.csv: -------------------------------------------------------------------------------- 1 | 2.150501497089862823e-01 2 | 2.058924520388245583e-01 3 | 1.908177738077938557e-01 4 | -------------------------------------------------------------------------------- /eager_lbfgs/pytorch_lbfgs.py: -------------------------------------------------------------------------------- 1 | import util as u 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | from torch.autograd import Variable 8 | import 
numpy as np 9 | 10 | # todo: make images global 11 | 12 | step = 0 13 | final_loss = None 14 | 15 | def benchmark(batch_size, iters, seed=1, cuda=True, history=100, verbose=False): 16 | global step, final_loss 17 | 18 | step = 0 19 | final_loss = None 20 | 21 | torch.manual_seed(seed) 22 | np.random.seed(seed) 23 | if cuda: 24 | torch.cuda.manual_seed(seed) 25 | 26 | visible_size = 28*28 27 | hidden_size = 196 28 | 29 | images = torch.Tensor(u.get_mnist_images(batch_size).T) 30 | images = images[:batch_size] 31 | if cuda: 32 | images = images.cuda() 33 | data = Variable(images) 34 | 35 | class Net(nn.Module): 36 | def __init__(self): 37 | super(Net, self).__init__() 38 | self.encoder = nn.Parameter(torch.rand(visible_size, hidden_size)) 39 | 40 | def forward(self, input): 41 | x = input.view(-1, visible_size) 42 | x = torch.sigmoid(torch.mm(x, self.encoder)) 43 | x = torch.sigmoid(torch.mm(x, torch.transpose(self.encoder, 0, 1))) 44 | return x.view_as(input) 45 | 46 | # initialize model and weights 47 | model = Net() 48 | model.encoder.data = torch.Tensor(u.ng_init(visible_size, 49 | hidden_size)) 50 | if cuda: 51 | model.cuda() 52 | 53 | model.train() 54 | optimizer = optim.LBFGS(model.parameters(), max_iter=iters, history_size=history, lr=1.0) 55 | 56 | times = [] 57 | def closure(): 58 | global step, final_loss 59 | optimizer.zero_grad() 60 | output = model(data) 61 | loss = F.mse_loss(output, data) 62 | if verbose: 63 | loss0 = loss.data[0] 64 | times.append(u.last_time()) 65 | print("Step %3d loss %6.5f msec %6.3f"%(step, loss0, u.last_time())) 66 | step+=1 67 | if step == iters: 68 | final_loss = loss.data[0] 69 | loss.backward() 70 | u.record_time() 71 | return loss 72 | 73 | optimizer.step(closure) 74 | 75 | output = model(data) 76 | loss = F.mse_loss(output, data) 77 | loss0 = loss.data[0] 78 | 79 | if verbose: 80 | u.summarize_time() 81 | 82 | # print(times) 83 | s = ','.join(["%f"%(n,) for n in times[2:]]) 84 | print('{', s,'}') 85 | 86 | return final_loss 87 | 88 | 89 | 90 | def main(): 91 | import common_gd 92 | args = common_gd.args 93 | args.cuda = not args.no_cuda and torch.cuda.is_available() 94 | 95 | print(benchmark(batch_size=args.batch_size, iters=args.iters, seed=args.seed, cuda = args.cuda, history=args.history, verbose=True)) 96 | 97 | if __name__=='__main__': 98 | main() 99 | -------------------------------------------------------------------------------- /eager_lbfgs/run_experiment.py: -------------------------------------------------------------------------------- 1 | # compare timing for variety of batch-sizes 2 | # TODO: make PyTorch not run out of memory 3 | 4 | import tensorflow as tf 5 | import eager_lbfgs 6 | import pytorch_lbfgs 7 | import numpy as np 8 | import util as u 9 | 10 | import time 11 | import sys 12 | import os 13 | 14 | def run_experiment(iters, name): 15 | 16 | #batch_sizes = [1, 10, 100, 1000, 10000, 60000] 17 | batch_sizes = [100, 200, 300] 18 | 19 | eager_stats = [] 20 | pytorch_stats = [] 21 | 22 | def benchmark(f): 23 | # do whole run once for pre-warming 24 | f() 25 | import gc; gc.collect() 26 | start_time = time.perf_counter() 27 | final_loss = f() 28 | elapsed_time = time.perf_counter() - start_time 29 | return final_loss, elapsed_time 30 | 31 | for batch_size in batch_sizes: 32 | def eager_run(): 33 | return eager_lbfgs.benchmark(batch_size=batch_size, iters=iters) 34 | eager_stats.append(benchmark(eager_run)) 35 | def pytorch_run(): 36 | return pytorch_lbfgs.benchmark(batch_size=batch_size, iters=iters) 37 | 
pytorch_stats.append(benchmark(pytorch_run)) 38 | 39 | print(eager_stats) 40 | print(pytorch_stats) 41 | # pytorch_losses 42 | # pytorch_times 43 | # pytorch_sizes 44 | 45 | eager_stats = np.array(eager_stats) 46 | pytorch_stats = np.array(pytorch_stats) 47 | u.dump(batch_sizes, name+"_batch.csv") 48 | 49 | u.dump(eager_stats[:,0], name+"_eager_loss.csv") 50 | u.dump(eager_stats[:,1], name+"_eager_time.csv") 51 | 52 | u.dump(pytorch_stats[:,0], name+"_pytorch_loss.csv") 53 | u.dump(pytorch_stats[:,1], name+"_pytorch_time.csv") 54 | 55 | 56 | if __name__=='__main__': 57 | if len(sys.argv)<2: 58 | print("Running short comparison") 59 | run_experiment(51, "short") 60 | else: 61 | print("Running long comparison") 62 | run_experiment(101, "long") 63 | 64 | -------------------------------------------------------------------------------- /enqueue_many_test.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import numpy as np 3 | os.environ["CUDA_VISIBLE_DEVICES"]="" 4 | import tensorflow as tf 5 | 6 | def create_session(): 7 | config = tf.ConfigProto(log_device_placement=False) 8 | config.operation_timeout_in_ms=5000 # terminate on long hangs 9 | config.gpu_options.per_process_gpu_memory_fraction=0.3 # don't hog all vRAM 10 | sess = tf.InteractiveSession("", config=config) 11 | return sess 12 | 13 | import time 14 | import threading 15 | import os 16 | os.environ['PYTHONUNBUFFERED'] = 'True' 17 | 18 | 19 | from google.protobuf.internal import api_implementation 20 | assert api_implementation._default_implementation_type == 'cpp' 21 | 22 | 23 | from tensorflow.python.client import timeline 24 | 25 | tf.reset_default_graph() 26 | 27 | reverse = False 28 | if len(sys.argv)>1: 29 | assert sys.argv[1] == 'reverse' 30 | reverse = True 31 | 32 | n = 10**6 33 | dtype = tf.int32 34 | queue = tf.FIFOQueue(capacity=2*n, dtypes=[dtype], shapes=[()]) 35 | zeros = tf.Variable(tf.zeros((n), name="0", dtype=dtype)) 36 | ones = tf.Variable(tf.ones((n), name="1", dtype=dtype)) 37 | enqueue_zeros = queue.enqueue_many(zeros, name="zeros") 38 | enqueue_ones = queue.enqueue_many(ones, name="ones") 39 | sess = create_session() 40 | sess.run(tf.global_variables_initializer()) 41 | 42 | start_time0 = time.time() 43 | run_metadatas = [] 44 | def run_op(op): 45 | start_time = time.time() 46 | print("%10.2f ms: starting op %s\n" % ((start_time-start_time0)*1000, op.name), flush=True, end='') 47 | 48 | options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) 49 | run_metadata = tf.RunMetadata() 50 | sess.run(op, options=options, run_metadata=run_metadata) 51 | end_time = time.time() 52 | print("%10.2f ms: ending op %s\n" % ((end_time-start_time0)*1000, op.name), flush=True, end='') 53 | run_metadatas.append(run_metadata) 54 | 55 | 56 | 57 | threads = [threading.Thread(group=None, target=run_op, args=(op,)) for op in (enqueue_zeros, enqueue_ones)] 58 | if reverse: 59 | threads.reverse() 60 | 61 | for t in threads: 62 | t.start() 63 | 64 | # wait for threads to finish 65 | for t in threads: 66 | t.join() 67 | 68 | # generate merged timeline 69 | merged_metadata = tf.RunMetadata() 70 | for run_metadata in run_metadatas: 71 | merged_metadata.MergeFrom(run_metadata) 72 | 73 | tl = timeline.Timeline(merged_metadata.step_stats) 74 | ctf = tl.generate_chrome_trace_format() 75 | with open(sys.argv[0]+'_%s_timeline.json'%(reverse), 'w') as f: 76 | f.write(ctf) 77 | 78 | assert sess.run(queue.size()) == 2*n 79 | result = sess.run(queue.dequeue_many(2*n)) 80 | padding = 
np.array([0]) 81 | 82 | diffs = np.concatenate([padding, result])-np.concatenate([result, padding]) 83 | print("Interleaving detected: %s" % (abs(diffs).sum()>2)) 84 | -------------------------------------------------------------------------------- /enqueue_many_test_singlerun.py: -------------------------------------------------------------------------------- 1 | # Test multiple enqueue many in single .run call 2 | import os, sys 3 | import numpy as np 4 | os.environ["CUDA_VISIBLE_DEVICES"]="" 5 | import tensorflow as tf 6 | 7 | def create_session(): 8 | config = tf.ConfigProto(log_device_placement=False) 9 | config.operation_timeout_in_ms=5000 # terminate on long hangs 10 | config.gpu_options.per_process_gpu_memory_fraction=0.3 # don't hog all vRAM 11 | sess = tf.InteractiveSession("", config=config) 12 | return sess 13 | 14 | import time 15 | import threading 16 | import os 17 | os.environ['PYTHONUNBUFFERED'] = 'True' 18 | 19 | 20 | from google.protobuf.internal import api_implementation 21 | assert api_implementation._default_implementation_type == 'cpp' 22 | 23 | 24 | from tensorflow.python.client import timeline 25 | tf.reset_default_graph() 26 | 27 | reverse = False 28 | if len(sys.argv)>1: 29 | assert sys.argv[1] == 'reverse' 30 | reverse = True 31 | 32 | n = 10**6 33 | dtype = tf.int32 34 | queue = tf.FIFOQueue(capacity=2*n, dtypes=[dtype], shapes=[()]) 35 | zeros = tf.Variable(tf.zeros((n), name="0", dtype=dtype)) 36 | ones = tf.Variable(tf.ones((n), name="1", dtype=dtype)) 37 | enqueue_zeros = queue.enqueue_many(zeros, name="zeros") 38 | enqueue_ones = queue.enqueue_many(ones, name="ones") 39 | sess = create_session() 40 | sess.run(tf.global_variables_initializer()) 41 | 42 | op = tf.group(enqueue_zeros, enqueue_ones) 43 | 44 | start_time0 = time.time() 45 | run_metadatas = [] 46 | def run_op(op): 47 | start_time = time.time() 48 | print("%10.2f ms: starting op %s\n" % ((start_time-start_time0)*1000, op.name), flush=True, end='') 49 | 50 | options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) 51 | run_metadata = tf.RunMetadata() 52 | sess.run(op, options=options, run_metadata=run_metadata) 53 | end_time = time.time() 54 | print("%10.2f ms: ending op %s\n" % ((end_time-start_time0)*1000, op.name), flush=True, end='') 55 | run_metadatas.append(run_metadata) 56 | 57 | 58 | 59 | threads = [threading.Thread(group=None, target=run_op, args=(op,))] 60 | 61 | for t in threads: 62 | t.start() 63 | 64 | # wait for threads to finish 65 | for t in threads: 66 | t.join() 67 | 68 | # generate merged timeline 69 | merged_metadata = tf.RunMetadata() 70 | for run_metadata in run_metadatas: 71 | merged_metadata.MergeFrom(run_metadata) 72 | 73 | tl = timeline.Timeline(merged_metadata.step_stats) 74 | ctf = tl.generate_chrome_trace_format() 75 | with open(sys.argv[0]+'_timeline.json', 'w') as f: 76 | f.write(ctf) 77 | 78 | assert sess.run(queue.size()) == 2*n 79 | result = sess.run(queue.dequeue_many(2*n)) 80 | padding = np.array([0]) 81 | 82 | diffs = np.concatenate([padding, result])-np.concatenate([result, padding]) 83 | print("Interleaving detected: %s" % (abs(diffs).sum()>2)) 84 | -------------------------------------------------------------------------------- /ericyue-slowreader/benchmark-batch.py: -------------------------------------------------------------------------------- 1 | # measure the speed at which batches can be made 2 | # Only 14k 3 | # range queue 1996115, batch queue 3885, 6455.13 per second 4 | # range d 404771, batch d 3229 5 | # range queue 1988698, batch queue 
11302, 14735.80 per second 6 | # range d -7417, batch d 7417 7 | # range queue 1981384, batch queue 18616, 14620.57 per second 8 | # range d -7314, batch d 7314 9 | # range queue 1974016, batch queue 25984, 14662.89 per second 10 | 11 | import tensorflow as tf 12 | import time 13 | 14 | 15 | steps_to_validate = 200 16 | epoch_number = 2 17 | thread_number = 2 18 | batch_size = 100 19 | 20 | capacity = 2*10**6 21 | # don't use too high of limit, 10**9 hangs (overflows to negative in TF?) 22 | a_queue = tf.train.range_input_producer(limit=10**3, num_epochs=2000, 23 | capacity=capacity, shuffle=False) 24 | 25 | # manually run the queue runner for a bit 26 | config = tf.ConfigProto(log_device_placement=False) 27 | config.operation_timeout_in_ms=5000 # terminate on long hangs 28 | sess = tf.InteractiveSession("", config=config) 29 | sess.run(tf.global_variables_initializer()) 30 | sess.run(tf.local_variables_initializer()) 31 | 32 | 33 | a_queue_qr = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)[0] 34 | for i in range(1000): 35 | sess.run(a_queue_qr.enqueue_ops) 36 | 37 | 38 | # check the size 39 | range_size_node = "input_producer/fraction_of_2000000_full/fraction_of_2000000_full_Size:0" 40 | 41 | # size gives raw size rather than number of batches 42 | batch_size_node = "batch/fifo_queue_Size:0" 43 | 44 | print("range size is ", sess.run(range_size_node)) 45 | 46 | # now create batch and run it manually 47 | # use size of 2 or get TypeError: 'Tensor' object is not iterable. 48 | # (possibly singleton list get auto-packed into a single Tensor) 49 | [b, _] = tf.train.batch([a_queue.dequeue()]*2, batch_size=batch_size, 50 | capacity=capacity) 51 | 52 | 53 | tf.train.start_queue_runners() 54 | start_time = time.time() 55 | old_range_size, old_batch_size = (0, 0) 56 | while True: 57 | new_range_size, new_batch_size = sess.run([range_size_node, batch_size_node]) 58 | 59 | new_time = time.time() 60 | rate = (new_batch_size-old_batch_size)/(new_time-start_time) 61 | print("range queue %d, batch queue %d, %.2f per second"%(new_range_size, 62 | new_batch_size, 63 | rate)) 64 | print("range d %d, batch d %d" %(new_range_size - old_range_size, 65 | new_batch_size - old_batch_size)) 66 | start_time = time.time() 67 | old_range_size, old_batch_size = new_range_size, new_batch_size 68 | time.sleep(0.5) 69 | 70 | 71 | 72 | def let_queue_repopulate(size_tensor, min_elements=100000, sleep_delay=0.5): 73 | """Wait until queue has enough elements.""" 74 | size2 = "input_producer/fraction_of_2000000_full/fraction_of_2000000_full_Size:0" 75 | while sess.run(size_tensor) < min_elements: 76 | print("Size1: %d, size2: %d" %tuple(sess.run([size_tensor, size2]))) 77 | time.sleep(sleep_delay) 78 | 79 | step = 0 80 | start_time = time.time() 81 | while True: 82 | step+=1 83 | let_queue_repopulate(size_tensor=batch_size_node) 84 | sess.run(b.op) 85 | if step % steps_to_validate == 0: 86 | end_time = time.time() 87 | sec = (end_time - start_time) 88 | print("[{}] time[{:6.2f}] step[{:10d}] speed[{:6d}]".format( 89 | str(end_time).split(".")[0],sec, step, 90 | int((steps_to_validate*batch_size)/sec) 91 | )) 92 | start_time = end_time 93 | -------------------------------------------------------------------------------- /ericyue-slowreader/benchmark-reader.py: -------------------------------------------------------------------------------- 1 | # [1484609202] time[ 0.01] step[ 20] speed[360350] 2 | # [1484609202] time[ 0.00] step[ 40] speed[1129322] 3 | # [1484609202] time[ 0.00] step[ 60] speed[546168] 4 | # [1484609202] 
time[ 0.00] step[ 80] speed[709696] 5 | # [1484609202] time[ 0.00] step[ 100] speed[1112399] 6 | # [1484609202] time[ 0.00] step[ 120] speed[1506033] 7 | 8 | import tensorflow as tf 9 | import time 10 | 11 | filename_queue = tf.train.string_input_producer(["./data.zlib"], 12 | shuffle=False, 13 | seed = int(time.time())) 14 | 15 | reader = tf.TFRecordReader(options = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.ZLIB)) 16 | _, serialized_example = reader.read(filename_queue) 17 | 18 | reader = tf.TFRecordReader(options = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.ZLIB)) 19 | _, serialized_example = reader.read(filename_queue) 20 | 21 | sess = tf.InteractiveSession() 22 | tf.train.start_queue_runners() 23 | 24 | batch_size = 100 25 | steps_to_validate = 20 26 | 27 | step = 0 28 | start_time = time.time() 29 | while True: 30 | step+=1 31 | sess.run(serialized_example.op) 32 | if step % steps_to_validate == 0: 33 | end_time = time.time() 34 | sec = (end_time - start_time) 35 | print("[{}] time[{:6.2f}] step[{:10d}] speed[{:6d}]".format( 36 | str(end_time).split(".")[0],sec, step, 37 | int((steps_to_validate*batch_size)/sec) 38 | )) 39 | start_time = end_time 40 | -------------------------------------------------------------------------------- /ericyue-slowreader/benchmark-synthetic-batch.py: -------------------------------------------------------------------------------- 1 | # [1484611992] time[ 0.00] step[ 420] speed[613695] 2 | # [1484611992] time[ 0.00] step[ 440] speed[501141] 3 | # [1484611992] time[ 0.01] step[ 460] speed[351428] 4 | # [1484611992] time[ 0.00] step[ 480] speed[450032] 5 | # [1484611993] time[ 0.14] step[ 500] speed[ 14419] 6 | # [1484611993] time[ 0.15] step[ 520] speed[ 13662] 7 | # [1484611993] time[ 0.14] step[ 540] speed[ 13960] 8 | # [1484611993] time[ 0.15] step[ 560] speed[ 13069] 9 | 10 | import tensorflow as tf 11 | import time 12 | 13 | 14 | steps_to_validate = 200 15 | epoch_number = 2 16 | thread_number = 2 17 | batch_size = 100 18 | 19 | capacity = 2*10**6 20 | # don't use too high of limit, 10**9 hangs (overflows to negative in TF?) 21 | a_queue = tf.train.range_input_producer(limit=10**3, capacity=capacity) 22 | 23 | # use size of 2 or get TypeError: 'Tensor' object is not iterable. 
24 | # (possibly singleton list get auto-packed into a single Tensor) 25 | [b, _] = tf.train.batch([a_queue.dequeue()]*2, batch_size=100, 26 | capacity=capacity) 27 | 28 | 29 | config = tf.ConfigProto(log_device_placement=True) 30 | config.operation_timeout_in_ms=5000 # terminate on long hangs 31 | sess = tf.InteractiveSession("", config=config) 32 | 33 | tf.train.start_queue_runners() 34 | 35 | def let_queue_repopulate(size_tensor, min_elements=100000, sleep_delay=0.5): 36 | """Wait until queue has enough elements.""" 37 | size2 = "input_producer/fraction_of_2000000_full/fraction_of_2000000_full_Size:0" 38 | while sess.run(size_tensor) < min_elements: 39 | print("Size1: %d, size2: %d" %tuple(sess.run([size_tensor, size2]))) 40 | time.sleep(sleep_delay) 41 | 42 | step = 0 43 | start_time = time.time() 44 | while True: 45 | step+=1 46 | let_queue_repopulate(size_tensor="batch/fifo_queue_Size:0") 47 | sess.run(b.op) 48 | if step % steps_to_validate == 0: 49 | end_time = time.time() 50 | sec = (end_time - start_time) 51 | print("[{}] time[{:6.2f}] step[{:10d}] speed[{:6d}]".format( 52 | str(end_time).split(".")[0],sec, step, 53 | int((steps_to_validate*batch_size)/sec) 54 | )) 55 | start_time = end_time 56 | -------------------------------------------------------------------------------- /ericyue-slowreader/benchmark-synthetic.py: -------------------------------------------------------------------------------- 1 | # [1484615767] time[ 0.31] step[ 2000] speed[652222] 2 | # [1484615767] time[ 0.31] step[ 4000] speed[654197] 3 | # [1484615768] time[ 0.30] step[ 6000] speed[661347] 4 | # [1484615768] time[ 0.30] step[ 8000] speed[662600] 5 | # 6 | # with_dequeu_many = False 7 | # [1484614505] time[ 0.97] step[ 2000] speed[205131] 8 | # [1484614506] time[ 0.96] step[ 4000] speed[208224] 9 | # [1484614507] time[ 0.96] step[ 6000] speed[208984] 10 | # [1484614508] time[ 0.95] step[ 8000] speed[209907] 11 | 12 | import tensorflow as tf 13 | import time 14 | 15 | # try benchmarking 16 | steps_to_validate = 2000 17 | epoch_number = 2 18 | thread_number = 2 19 | batch_size = 100 20 | use_dequeue_many = True 21 | 22 | 23 | # don't use too high of limit, 10**9 hangs (overflows to negative in TF?) 
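# (range_input_producer returns a FIFOQueue fed by a background QueueRunner that repeatedly enqueues 0..limit-1; capacity only bounds how far that producer can run ahead of the consumer)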
24 | a_queue = tf.train.range_input_producer(limit=10**3, capacity=1000, shuffle=False) 25 | #a_queue = tf.train.string_input_producer(["hello"]) 26 | 27 | 28 | # use an op that guarantees batch_size dequeues 29 | if use_dequeue_many: 30 | a_batch = a_queue.dequeue_many(n=batch_size) 31 | a_batch_op = a_batch.op 32 | else: 33 | # otherwise just do batch_size dequeue ops 34 | a = a_queue.dequeue() 35 | a_batch = [a+i for i in range(batch_size)] 36 | a_batch_op = tf.group(*a_batch) 37 | 38 | config = tf.ConfigProto(log_device_placement=False) 39 | config.operation_timeout_in_ms=5000 # terminate on long hangs 40 | sess = tf.InteractiveSession("", config=config) 41 | 42 | tf.train.start_queue_runners() 43 | 44 | step = 0 45 | start_time = time.time() 46 | while True: 47 | step+=1 48 | sess.run(a_batch_op) 49 | if step % steps_to_validate == 0: 50 | end_time = time.time() 51 | sec = (end_time - start_time) 52 | print("[{}] time[{:6.2f}] step[{:10d}] speed[{:6d}]".format( 53 | str(end_time).split(".")[0],sec, step, 54 | int((steps_to_validate*batch_size)/sec) 55 | )) 56 | start_time = end_time 57 | -------------------------------------------------------------------------------- /ericyue-slowreader/data.zlib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaroslavvb/stuff/a8024ead315aa1b5d6976940b3a062178f0e499d/ericyue-slowreader/data.zlib -------------------------------------------------------------------------------- /ericyue-slowreader/profile-batch.py: -------------------------------------------------------------------------------- 1 | # script for getting cpu profile of queue runners 2 | # 3 | # sudo apt-get install google-perftools 4 | # LD_PRELOAD has to be set in a forked script, otherwise shell will 5 | # overwrite the profile file 6 | 7 | import os, sys, subprocess 8 | 9 | my_env = os.environ.copy() 10 | my_env["LD_PRELOAD"]="/usr/lib/libtcmalloc_and_profiler.so.4" 11 | my_env["CPUPROFILE"]="/tmp/profile-yue/profile" 12 | 13 | args = ["python", "benchmark-batch-noqueuerunners.py"] 14 | proc = subprocess.Popen(args, stderr=subprocess.STDOUT, env=my_env) 15 | print("Done") 16 | 17 | -------------------------------------------------------------------------------- /example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaroslavvb/stuff/a8024ead315aa1b5d6976940b3a062178f0e499d/example.png -------------------------------------------------------------------------------- /free_gpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Parse nvidia-smi for pids and kill all GPU users 3 | # Tested on nvidia-smi 370.23 4 | import os, re, sys, subprocess 5 | import pwd 6 | 7 | from collections import defaultdict 8 | 9 | def tokenize(cmd): 10 | if isinstance(cmd, list): 11 | return cmd 12 | if isinstance(cmd, bytes): 13 | cmd = cmd.decode("ascii") 14 | if isinstance(cmd, str): 15 | cmd = cmd.split(None) 16 | return cmd 17 | 18 | 19 | def run_command(cmd): 20 | """Run command, return output as string.""" 21 | 22 | output = subprocess.Popen(cmd, stdout=subprocess.PIPE, 23 | shell=True).communicate()[0] 24 | return output.decode("ascii") 25 | 26 | 27 | def run_shell(cmd): 28 | """Runs shell command, returns list of outputted lines 29 | with newlines stripped.""" 30 | 31 | cmd = tokenize(cmd) 32 | p = subprocess.Popen(cmd, stdout=subprocess.PIPE, 33 | stderr=subprocess.STDOUT) 34 | (stdout, stderr) = 
p.communicate() 35 | stdout = stdout.decode("ascii") # turn into string to make Python3 happy 36 | lines = stdout.split('\n') 37 | stripped_lines = [] 38 | for l in lines: 39 | stripped_line = l.strip() 40 | if l: 41 | stripped_lines.append(stripped_line) 42 | return stripped_lines 43 | 44 | 45 | def run_shell_background(cmd_orig): 46 | """Runs shell command in background, returns pid.""" 47 | 48 | cmd = tokenize(cmd_orig) 49 | p = subprocess.Popen(cmd, close_fds=True) 50 | print("[%d] %s " % (p.pid, cmd_orig)) 51 | return p.pid 52 | 53 | def get_pid_gpu_map(): 54 | """Returns map of GPU id to memory allocated on that GPU.""" 55 | 56 | output = run_command("nvidia-smi") 57 | gpu_output = output[output.find("GPU Memory"):] 58 | # lines of the form 59 | # | 0 8734 C python 11705MiB | 60 | regex = re.compile(r"[|]\s+?(?P<gpu_id>\d+)\D+?(?P<pid>\d+).+[ ]" 61 | "(?P<mem>\d+)MiB") 62 | rows = gpu_output.split("\n") 63 | pids = [] 64 | pid_gpu_map = defaultdict(list) 65 | for row in gpu_output.split("\n"): 66 | m = regex.search(row) 67 | if not m: 68 | continue 69 | pid = int(m.group("pid")) 70 | gpu_id = int(m.group("gpu_id")) 71 | print("pid %s using gpu %s"%(pid, gpu_id)) 72 | pid_gpu_map[pid].append(gpu_id) 73 | return pid_gpu_map 74 | 75 | def kill_pids(pids_to_kill): 76 | pids = [] 77 | for pid_to_kill in pids_to_kill: 78 | pid = run_shell_background("sudo kill -9 "+str(pid_to_kill)) 79 | pids.append(pid) 80 | return pids 81 | 82 | 83 | def owner(pid): 84 | '''Return username of UID of process pid''' 85 | UID = 1 86 | EUID = 2 87 | for ln in open('/proc/%d/status' % pid): 88 | if ln.startswith('Uid:'): 89 | uid = int(ln.split()[UID]) 90 | return pwd.getpwuid(uid).pw_name 91 | 92 | if __name__ == '__main__': 93 | pid_gpu_map = get_pid_gpu_map() 94 | print("%10s %10s %s" %("pid", "username", "gpu")) 95 | for pid in pid_gpu_map: 96 | print("%10s %10s %s" %(pid, owner(pid), pid_gpu_map[pid])) 97 | answer = input("kill these? 
(Y/n) ") 98 | if not answer: 99 | answer = "y" 100 | if answer.lower() == "y": 101 | pids = kill_pids(pid_gpu_map.keys()) 102 | else: 103 | print("Didn't get y, doing nothing") 104 | -------------------------------------------------------------------------------- /github_pyfunc_slowness.py: -------------------------------------------------------------------------------- 1 | # Example of py_func slowing down future computations 2 | # On Mac 3 | # time 1 0.007195033016614616 4 | # time 2 0.0070790809113532305 5 | # time 3 0.008019614033401012 6 | # 7 | # On Xeon V3: 8 | # time 1 0.011401358991861343 9 | # time 2 0.011637557297945023 10 | # time 3 0.012380894273519516 11 | # 12 | # On Mac without MKL installed: 13 | # time 1 0.011707969009876251 14 | # time 2 0.011970046092756093 15 | # time 3 0.011933871079236269 16 | 17 | import numpy as np 18 | import scipy 19 | import scipy.linalg 20 | import tensorflow as tf 21 | import timeit 22 | sess = tf.Session() 23 | a = np.random.random((300, 300)) 24 | a = a.dot(a.T) 25 | best_time = np.inf 26 | for i in range(10): 27 | s = timeit.default_timer() 28 | scipy.linalg.eigh(a) 29 | e = timeit.default_timer() 30 | if e - s < best_time: 31 | best_time = e - s 32 | print("time 1", best_time) 33 | 34 | np.linalg.svd(np.random.randn(2, 300)) 35 | 36 | best_time = np.inf 37 | for i in range(10): 38 | s = timeit.default_timer() 39 | scipy.linalg.eigh(a) 40 | e = timeit.default_timer() 41 | if e - s < best_time: 42 | best_time = e - s 43 | print("time 2", best_time) 44 | 45 | ret = tf.py_func(np.linalg.svd, [np.random.randn(2, 300)], [tf.float64, tf.float64, tf.float64]) 46 | sess.run(ret) 47 | 48 | best_time = np.inf 49 | for i in range(10): 50 | s = timeit.default_timer() 51 | scipy.linalg.eigh(a) 52 | e = timeit.default_timer() 53 | if e - s < best_time: 54 | best_time = e - s 55 | print("time 3", best_time) 56 | -------------------------------------------------------------------------------- /gpu_oom.py: -------------------------------------------------------------------------------- 1 | # Example of catching GPU OOM error 2 | # http://stackoverflow.com/questions/41942538/tensorflow-gpu-memory-error-try-except-not-catching-the-error 3 | 4 | import tensorflow as tf 5 | 6 | try: 7 | with tf.device("gpu:0"): 8 | a = tf.Variable(tf.ones((10000, 10000))) 9 | sess = tf.Session() 10 | sess.run(tf.initialize_all_variables()) 11 | except: 12 | print("Caught error") 13 | import pdb; pdb.set_trace() 14 | -------------------------------------------------------------------------------- /gpu_svd_bench.py: -------------------------------------------------------------------------------- 1 | linalg-benchmark/benchmark.py -------------------------------------------------------------------------------- /graphvis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaroslavvb/stuff/a8024ead315aa1b5d6976940b3a062178f0e499d/graphvis.png -------------------------------------------------------------------------------- /input_benchmarks/convert_to_records.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Converts MNIST data to TFRecords file format with Example protos.""" 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os 22 | import tensorflow as tf 23 | from tensorflow.contrib.learn.python.learn.datasets import mnist 24 | 25 | 26 | SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/' 27 | 28 | TRAIN_IMAGES = 'train-images-idx3-ubyte.gz' # MNIST filenames 29 | TRAIN_LABELS = 'train-labels-idx1-ubyte.gz' 30 | TEST_IMAGES = 't10k-images-idx3-ubyte.gz' 31 | TEST_LABELS = 't10k-labels-idx1-ubyte.gz' 32 | 33 | 34 | tf.app.flags.DEFINE_string('directory', '/tmp/data', 35 | 'Directory to download data files and write the ' 36 | 'converted result') 37 | tf.app.flags.DEFINE_integer('validation_size', 5000, 38 | 'Number of examples to separate from the training ' 39 | 'data for the validation set.') 40 | FLAGS = tf.app.flags.FLAGS 41 | 42 | 43 | def _int64_feature(value): 44 | return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) 45 | 46 | 47 | def _bytes_feature(value): 48 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 49 | 50 | 51 | def convert_to(data_set, name): 52 | images = data_set.images 53 | labels = data_set.labels 54 | num_examples = data_set.num_examples 55 | 56 | if images.shape[0] != num_examples: 57 | raise ValueError('Images size %d does not match label size %d.' % 58 | (images.shape[0], num_examples)) 59 | rows = images.shape[1] 60 | cols = images.shape[2] 61 | depth = images.shape[3] 62 | 63 | filename = os.path.join(FLAGS.directory, name + '.tfrecords') 64 | print('Writing', filename) 65 | writer = tf.python_io.TFRecordWriter(filename) 66 | for index in range(num_examples): 67 | image_raw = images[index].tostring() 68 | example = tf.train.Example(features=tf.train.Features(feature={ 69 | 'height': _int64_feature(rows), 70 | 'width': _int64_feature(cols), 71 | 'depth': _int64_feature(depth), 72 | 'label': _int64_feature(int(labels[index])), 73 | 'image_raw': _bytes_feature(image_raw)})) 74 | writer.write(example.SerializeToString()) 75 | writer.close() 76 | 77 | 78 | def main(argv): 79 | # Get the data. 80 | data_sets = mnist.read_data_sets(FLAGS.directory, 81 | dtype=tf.uint8, 82 | reshape=False) 83 | 84 | # Convert to Examples and write the result to TFRecords. 
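# (For reference, grounded in convert_to() above: each serialized Example stores int64 features 'height', 'width', 'depth' and 'label', plus the raw uint8 image bytes under the key 'image_raw'.)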
85 | convert_to(data_sets.train, 'train') 86 | convert_to(data_sets.validation, 'validation') 87 | convert_to(data_sets.test, 'test') 88 | 89 | 90 | if __name__ == '__main__': 91 | tf.app.run() 92 | -------------------------------------------------------------------------------- /jupyter-version.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaroslavvb/stuff/a8024ead315aa1b5d6976940b3a062178f0e499d/jupyter-version.png -------------------------------------------------------------------------------- /khatri_rao_benchmark.py: -------------------------------------------------------------------------------- 1 | # 0.6 - 0.8 for 10x10 khatri rao 2 | # After improvement: 0.02 seconds 3 | import tensorflow as tf 4 | import util as u 5 | import time 6 | import os 7 | import sys 8 | 9 | 10 | def benchmark_construct(dims, iters, dtype): 11 | A = tf.ones((dims, dims), dtype=dtype) 12 | B = tf.ones((dims, dims), dtype=dtype) 13 | prods = [] 14 | time0 = time.time() 15 | for i in range(iters): 16 | prods.append(u.khatri_rao(A,B)) 17 | elapsed = time.time() - time0 18 | print("Constructed %d x %d kr %d times in %.2f seconds"%(A.shape[0], B.shape[0], iters, elapsed)) 19 | 20 | def benchmark_execute(dims, iters, dtype): 21 | A = tf.random_uniform((dims, dims), dtype=dtype) 22 | B = tf.random_uniform((dims, dims), dtype=dtype) 23 | prods = [] 24 | for i in range(iters): 25 | prods.append(u.khatri_rao(A,B)) 26 | 27 | sess = tf.Session() 28 | elapsed_times = [] 29 | u.reset_time() 30 | for i in range(10): 31 | time0 = time.time() 32 | sess.run(tf.group(*prods)) 33 | elapsed_times.append(time.time()-time0) 34 | u.record_time() 35 | u.summarize_time() 36 | 37 | 38 | if __name__ == '__main__': 39 | dims = 10 40 | iters = 10 41 | dtype = tf.float32 42 | benchmark_construct(dims, iters, dtype) 43 | benchmark_execute(dims, iters, dtype) 44 | 45 | -------------------------------------------------------------------------------- /lazy_dog.py: -------------------------------------------------------------------------------- 1 | # Overfit GPT model to "the quick brown fox" 2 | # 3 | # 906.45 -- the a , " he said . " i 'm not 4 | # 310.08 -- the i - " " i 'm not going to 5 | # 134.41 -- the i - " " i 'm not a child 6 | # 30.41 -- the i - " " i 'm not going to 7 | # 8.07 -- the quick , " he said , " i 'm not 8 | # 3.61 -- the quick quick quick steps , and then the quick quick 9 | # 2.15 -- the quick quick quick jumps over the low fence jumps over 10 | # 1.41 -- the quick fox jumps over the lazy dog jumps over the 11 | # 1.13 -- the quick fox jumps over the lazy dog jumps over the 12 | # 1.05 -- the quick quick brown fox jumps over the lazy dog jumps 13 | # 1.02 -- the quick brown fox jumps over the lazy dog jumps over 14 | # 1.01 -- the quick jumps over the lazy dog jumps over the lazy 15 | # 1.02 -- the quick brown fox jumps over the lazy dog jumps over 16 | # 1.13 -- the quick brown fox jumps over the lazy dog jumps over 17 | # 1.02 -- the quick brown fox jumps over the lazy dog jumps over 18 | # 1.00 -- the quick brown fox jumps over the lazy dog jumps over 19 | # 1.01 -- the quick brown fox jumps over the lazy dog jumps over 20 | # 1.00 -- the quick brown fox jumps over the lazy dog jumps over 21 | # 1.00 -- the quick brown fox jumps over the lazy dog jumps over 22 | # 1.00 -- the quick brown fox jumps over the lazy dog jumps over 23 | 24 | 25 | import math 26 | import torch 27 | from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel 28 | 29 | 30 | def argmax(t): 31 | return int(torch.argmax(t).detach().numpy()) 32 | 33 | def decode(start_tokens, length=10): 34 | result = [] 35 | context = torch.ones(1, 0, dtype=torch.long) 36 | for start_token in start_tokens: 37 | new_token = torch.full((1, 1), start_token, dtype=torch.long) 38 | context = torch.cat((context, new_token), dim=1) 39 | result.append(tokenizer.convert_ids_to_tokens([start_token])[0]) 40 | 41 | with torch.no_grad(): 42 | for i in range(length): 43 | logits = model(context) # 1 x seq_len x vocab_size 44 | predicted_id = argmax(logits[0,-1]) 45 | predicted_word = tokenizer.convert_ids_to_tokens([predicted_id])[0] 46 | # strip GPT's BPE end-of-word marker from the predicted token 47 | if predicted_word.endswith('</w>'): 48 | predicted_word = predicted_word[:-len('</w>')] 49 | result.append(predicted_word) 50 | 51 | predicted_id_batch = torch.tensor([[predicted_id]]) 52 | context = torch.cat((context, predicted_id_batch), dim=1) 53 | 54 | result = ' '.join(result) 55 | result = result.replace('\n', ' ') 56 | return result 57 | 58 | 59 | def main(): 60 | global tokenizer, model 61 | 62 | train_dataset = 'the quick brown fox jumps over the lazy dog' 63 | tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') 64 | tokenized = [tokenizer.tokenize(train_dataset)] 65 | 66 | # [[481, 2279, 2507, 8573, 11670, 715, 481, 8447, 2585]] 67 | encoded = [tokenizer.convert_tokens_to_ids(t) for t in tokenized] 68 | model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt') 69 | 70 | optimizer = torch.optim.SGD(model.parameters(), lr = 0.001, momentum=0.9) 71 | 72 | 73 | batch = torch.tensor(encoded) 74 | 75 | start_words = ['the'] 76 | start_tokens = [tokenizer.convert_tokens_to_ids(w) for w in start_words] 77 | 78 | for i in range(20): 79 | loss = model(input_ids=batch, lm_labels=batch) 80 | perplexity = math.exp(loss.item()) 81 | print('%5.2f -- %s'%(perplexity, decode(start_tokens))) 82 | 83 | loss.backward() 84 | optimizer.step() 85 | optimizer.zero_grad() 86 | 87 | 88 | if __name__=='__main__': 89 | main() 90 | 91 | -------------------------------------------------------------------------------- /linalg-benchmark/bad_matrix.py: 
-------------------------------------------------------------------------------- 1 | from scipy import linalg # for svd 2 | import urllib.request 3 | import numpy as np 4 | 5 | url="https://storage.googleapis.com/tensorflow-community-wheels/svd_in" 6 | response = urllib.request.urlopen(url) 7 | body = response.read() 8 | print("Read %d bytes"%(len(body),)) 9 | assert len(body) == 15366400 10 | open("svd_in", "wb").write(body) 11 | 12 | dtype = np.float32 13 | matrix0 = np.genfromtxt('svd_in', 14 | delimiter= ",").astype(dtype) 15 | assert matrix0.shape == (784, 784) 16 | u, s, v = linalg.svd(matrix0) 17 | print("matrix0 any NaNs: %s"% (np.isnan(matrix0).any(),)) 18 | print("u has NaNs: %s"% (np.isnan(u).any(),)) 19 | -------------------------------------------------------------------------------- /linalg-benchmark/environment.yml: -------------------------------------------------------------------------------- 1 | name: benchmark 2 | channels: 3 | - anaconda 4 | - pytorch 5 | dependencies: 6 | - python=3.6 7 | - mkl 8 | - pytorch 9 | - scipy 10 | - numpy 11 | - pip: 12 | - tensorflow-gpu 13 | -------------------------------------------------------------------------------- /linalg-benchmark/get_cores_per_socket.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Simple script to parse cpuinfo and generate command to limit to a single physical socket""" 3 | 4 | import re 5 | socket_re = re.compile(".*?processor.*?(?P<cpu>\d+).*?physical id.*?(?P<socket>\d+).*?power", flags=re.S) 6 | from collections import defaultdict 7 | socket_dict = defaultdict(list) 8 | for cpu, socket in socket_re.findall(open('/proc/cpuinfo').read()): 9 | socket_dict[socket].append(cpu) 10 | 11 | 12 | for socket,cpus in socket_dict.items(): 13 | print('to set to socket', socket) 14 | print('export GOMP_CPU_AFFINITY=%s'%(','.join(cpus))) 15 | -------------------------------------------------------------------------------- /linalg-benchmark/launch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Run linalg benchmark on AWS 3 | 4 | import argparse 5 | import ncluster 6 | ncluster.set_backend('aws') 7 | 8 | import threading 9 | 10 | parser = argparse.ArgumentParser(description='launch') 11 | parser.add_argument('--instances', default='p3.16xlarge, c5.18xlarge, c5.9xlarge, m5.24xlarge, i3.metal, g3.16xlarge') 12 | parser.add_argument('--image', default="Deep Learning AMI (Amazon Linux) Version 15.0") 13 | parser.add_argument('--N', default='') 14 | parser.add_argument('--short', action='store_true', help='short version of benchmark') 15 | args = parser.parse_args() 16 | 17 | results = {} 18 | def launch(instance): 19 | """Run benchmark on given instance type.""" 20 | task = ncluster.make_task('benchmark-'+instance, instance_type=instance, image_name=args.image) 21 | task.upload('benchmark.py') 22 | task.run('source activate tensorflow_p36') 23 | task.run('pip install torch') 24 | task.run('export CUDA_VISIBLE_DEVICES=0') 25 | if args.N: 26 | task.run(f'export LINALG_BENCHMARK_N={args.N}') 27 | if args.short: 28 | task.run('export LINALG_BENCHMARK_SHORT=1') 29 | 30 | stdout, stderr = task.run_with_output('python benchmark.py') 31 | print('='*80) 32 | print(instance) 33 | print(stdout) 34 | 35 | 36 | def main(): 37 | # launch one thread per instance type so the benchmarks run concurrently 38 | threads = [] 39 | for instance in args.instances.split(','): 40 | instance = instance.strip() 41 | thread = threading.Thread(target=launch, args=[instance]) 42 | 
thread.start() 43 | threads.append(thread) 44 | for thread in threads: 45 | thread.join() 46 | 47 | 48 | 49 | if __name__=='__main__': 50 | main() 51 | -------------------------------------------------------------------------------- /linalg-benchmark/launch_tensorflow_svd_crash.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Run crashing TensorFlow SVD example 3 | 4 | import ncluster 5 | ncluster.set_backend('aws') 6 | 7 | import argparse 8 | parser = argparse.ArgumentParser(description='launch') 9 | parser.add_argument('--instance', default='c5.9xlarge') 10 | parser.add_argument('--image', default="Deep Learning AMI (Amazon Linux) Version 13.0") 11 | args = parser.parse_args() 12 | 13 | def main(): 14 | task = ncluster.make_task(instance_type=args.instance, 15 | image_name=args.image) 16 | task.run('source activate tensorflow_p36') 17 | task.upload('tensorflow_svd_crash.py') 18 | stdout, stderr = task.run_with_output('python tensorflow_svd_crash.py') 19 | print(stdout, stderr) 20 | 21 | if __name__=='__main__': 22 | main() 23 | -------------------------------------------------------------------------------- /linalg-benchmark/requirements.txt: -------------------------------------------------------------------------------- 1 | # mkl is conda only 2 | numpy 3 | scipy 4 | tensorflow-gpu 5 | torch 6 | -------------------------------------------------------------------------------- /linearize/linearize_test.py: -------------------------------------------------------------------------------- 1 | import linearize 2 | 3 | import os, sys, time 4 | import inspect 5 | import numpy as np 6 | import tensorflow as tf 7 | import pdb 8 | import math 9 | import toposort 10 | 11 | from tensorflow.python.ops import gen_random_ops 12 | 13 | def create_session(): 14 | config = tf.ConfigProto(log_device_placement=False, graph_options=tf.GraphOptions(optimizer_options=tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0))) 15 | return tf.InteractiveSession(config=config) 16 | 17 | def setup_env(): 18 | """Sets up test environment.""" 19 | 20 | # download memory_util if needed 21 | memory_util_url = "https://raw.githubusercontent.com/yaroslavvb/memory_util/master/memory_util.py" 22 | if os.path.exists('memory_util.py'): 23 | size = len(open('memory_util.py').read()) 24 | else: 25 | size = 0 26 | 27 | if size != 13636: 28 | print("Size changed or 0, redownloading memory_util.py") 29 | import urllib.request 30 | response = urllib.request.urlopen(memory_util_url) 31 | open("memory_util.py", "wb").write(response.read()) 32 | 33 | 34 | def make_caterpillar_graph(length=5, node_mbs=1): 35 | """Length is number of concats.""" 36 | 37 | n = node_mbs * 250000 38 | n2 = int(math.sqrt(n)) 39 | dtype = tf.float32 40 | 41 | def make_leaf(i): 42 | name = "leaf"+str(i) 43 | val = gen_random_ops._random_uniform((n2, n2), dtype, name=name) 44 | return val 45 | 46 | def make_merge(a, b, i): 47 | name = "merge"+str(i) 48 | merge_node = tf.matmul(a, b, name=name) 49 | # nonlinear_node = tf.tanh(merge_node, name="tanh"+str(i)) 50 | #nonlinear_node = tf.identity(merge_node, name="tanh"+str(i)) 51 | return merge_node 52 | 53 | leaf0 = make_leaf(0) 54 | node0 = tf.identity(leaf0, name="merge0") 55 | node = node0 56 | nodes = [node] 57 | 58 | for i in range(1, length+1): 59 | leaf = make_leaf(i) 60 | node = make_merge(node, leaf, i) 61 | nodes.append(node) 62 | return nodes 63 | 64 | def test_print(): 65 | """Should print: 66 | leaf1 -> merge1 67 | leaf0 -> merge0 68 | 
merge1 -> merge2 69 | merge0 -> merge1 70 | leaf2 -> merge2 71 | leaf0/shape -> leaf0 72 | leaf1/shape -> leaf1 73 | leaf2/shape -> leaf2 74 | """ 75 | 76 | nodes = make_caterpillar_graph(length=2) 77 | linearize.print_tf_graph(linearize.get_graph()) 78 | 79 | 80 | def test_toposort(): 81 | nodes = make_caterpillar_graph(length=2) 82 | graph = linearize.get_graph() 83 | print(list(toposort.toposort(graph))) 84 | 85 | 86 | def test_linearize(): 87 | nodes = make_caterpillar_graph(5) 88 | linearize.linearize() 89 | 90 | sess = create_session() 91 | 92 | import memory_util 93 | memory_util.vlog(1) 94 | with memory_util.capture_stderr() as stderr: 95 | sess.run(nodes[-1].op) 96 | memory_util.print_memory_timeline(stderr, ignore_less_than_bytes=1000) 97 | 98 | if __name__=='__main__': 99 | setup_env() 100 | import memory_util 101 | memory_util.vlog(1) 102 | 103 | # sess = create_session() 104 | #nodes = make_caterpillar_graph() 105 | # test_print() 106 | # linearize.print_tf_graph(linearize.get_graph()) 107 | # print(tf.get_default_graph().as_graph_def()) 108 | # test_toposort() 109 | test_linearize() 110 | sys.exit() 111 | # with memory_util.capture_stderr() as stderr: 112 | # print(sess.run(nodes[-1][0,0])) 113 | print(len(stderr.getvalue())) 114 | memory_util.print_memory_timeline(stderr, ignore_less_than_bytes=1000) 115 | -------------------------------------------------------------------------------- /matmul_benchmark.py: -------------------------------------------------------------------------------- 1 | # On Titan X (Pascal) 2 | # 8192 x 8192 matmul took: 0.10 sec, 11304.59 G ops/sec 3 | # http://stackoverflow.com/questions/41804380/testing-gpu-with-tensorflow-matrix-multiplication 4 | 5 | import os 6 | import sys 7 | import tensorflow as tf 8 | import time 9 | 10 | n = 8192 11 | dtype = tf.float32 12 | with tf.device("/gpu:0"): 13 | matrix1 = tf.Variable(tf.ones((n, n), dtype=dtype)) 14 | matrix2 = tf.Variable(tf.ones((n, n), dtype=dtype)) 15 | product = tf.matmul(matrix1, matrix2) 16 | 17 | 18 | # avoid optimizing away redundant nodes 19 | config = tf.ConfigProto(graph_options=tf.GraphOptions(optimizer_options=tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0))) 20 | sess = tf.Session(config=config) 21 | 22 | sess.run(tf.global_variables_initializer()) 23 | iters = 10 24 | 25 | # pre-warming 26 | sess.run(product.op) 27 | 28 | start = time.time() 29 | for i in range(iters): 30 | sess.run(product.op) 31 | end = time.time() 32 | ops = n**3 + (n-1)*n**2 # n^2*(n-1) additions, n^3 multiplications 33 | elapsed = (end - start) 34 | rate = iters*ops/elapsed/10**9 35 | print('\n %d x %d matmul took: %.2f sec, %.2f G ops/sec' % (n, n, 36 | elapsed/iters, 37 | rate,)) 38 | -------------------------------------------------------------------------------- /matmul_times/1080-float16.csv: -------------------------------------------------------------------------------- 1 | 1,0.0000000034 2 | 1,0.0000000036 3 | 1,0.0000000039 4 | 1,0.0000000035 5 | 1,0.0000000043 6 | 1,0.0000000046 7 | 1,0.0000000050 8 | 1,0.0000000054 9 | 2,0.0000000584 10 | 2,0.0000000644 11 | 2,0.0000000674 12 | 2,0.0000000687 13 | 2,0.0000000752 14 | 3,0.0000002873 15 | 3,0.0000002522 16 | 3,0.0000002471 17 | 4,0.0000007086 18 | 4,0.0000007393 19 | 4,0.0000007178 20 | 5,0.0000013838 21 | 5,0.0000015370 22 | 6,0.0000026987 23 | 6,0.0000022227 24 | 7,0.0000036381 25 | 8,0.0000054416 26 | 8,0.0000053568 27 | 9,0.0000082184 28 | 10,0.0000110903 29 | 11,0.0000138653 30 | 12,0.0000189915 31 | 13,0.0000241109 32 | 14,0.0000293803 33 | 
16,0.0000457074 34 | 17,0.0000509526 35 | 19,0.0000716285 36 | 20,0.0000863740 37 | 22,0.0001204977 38 | 24,0.0001568045 39 | 26,0.0001843983 40 | 29,0.0002785588 41 | 32,0.0003894144 42 | 34,0.0004765664 43 | 38,0.0006730307 44 | 41,0.0008188075 45 | 45,0.0010837874 46 | 49,0.0014305645 47 | 53,0.0022087100 48 | 58,0.0026427016 49 | 64,0.0031762071 50 | 69,0.0033630118 51 | 76,0.0040770393 52 | 82,0.0044708883 53 | 90,0.0062248578 54 | 98,0.0092337182 55 | 107,0.0139221632 56 | 117,0.0178044032 57 | 128,0.0224213489 58 | 139,0.0299333631 59 | 152,0.0379965451 60 | 165,0.0496895280 61 | 181,0.0672505017 62 | 197,0.0831603483 63 | 215,0.1132126430 64 | 234,0.1287379171 65 | 256,0.1819730888 66 | 279,0.2485196740 67 | 304,0.3036680806 68 | 331,0.3879232771 69 | 362,0.4661769607 70 | 394,0.5644570767 71 | 430,0.7777496690 72 | 469,0.9742523210 73 | 512,1.2651589030 74 | 558,1.4602956948 75 | 608,1.8217860513 76 | 663,1.9921930853 77 | 724,2.2364226876 78 | 789,2.1433891063 79 | 861,2.7090783898 80 | 939,2.9531931908 81 | 1024,3.5877896025 82 | 1116,3.6779722946 83 | 1217,4.3078682686 84 | 1327,4.6144299678 85 | 1448,5.0839816350 86 | 1579,5.3015632066 87 | 1722,5.6268885114 88 | 1878,5.8209107716 89 | 2048,6.2596829924 90 | 2233,5.9811348181 91 | 2435,5.6518923737 92 | 2655,7.0360807776 93 | 2896,7.2112054057 94 | 3158,8.0330287265 95 | 3444,8.2290337210 96 | 3756,8.0293896669 97 | 4096,8.5125871285 98 | 4466,8.5370020832 99 | 4870,7.7743161522 100 | 5311,8.3979650843 101 | 5792,8.4688923679 102 | 6316,8.4772457343 103 | 6888,8.6937689402 104 | 7512,8.6359499182 105 | 8192,8.8169536875 106 | 8933,8.6668573644 107 | 9741,8.3756722311 108 | 10623,8.7015778540 109 | 11585,8.5536109363 110 | 12633,8.6088656910 111 | 13777,8.5709830209 112 | -------------------------------------------------------------------------------- /matmul_times/1080-float32.csv: -------------------------------------------------------------------------------- 1 | 1,0.0000000045 2 | 1,0.0000000035 3 | 1,0.0000000043 4 | 1,0.0000000034 5 | 1,0.0000000043 6 | 1,0.0000000049 7 | 1,0.0000000050 8 | 1,0.0000000056 9 | 2,0.0000000609 10 | 2,0.0000000714 11 | 2,0.0000000697 12 | 2,0.0000000659 13 | 2,0.0000000714 14 | 3,0.0000003000 15 | 3,0.0000003220 16 | 3,0.0000002666 17 | 4,0.0000006627 18 | 4,0.0000007880 19 | 4,0.0000008079 20 | 5,0.0000015459 21 | 5,0.0000012673 22 | 6,0.0000023556 23 | 6,0.0000025808 24 | 7,0.0000039604 25 | 8,0.0000058466 26 | 8,0.0000060740 27 | 9,0.0000092089 28 | 10,0.0000118796 29 | 11,0.0000163369 30 | 12,0.0000240841 31 | 13,0.0000299625 32 | 14,0.0000387380 33 | 16,0.0000520994 34 | 17,0.0000523390 35 | 19,0.0000795539 36 | 20,0.0000982683 37 | 22,0.0001214796 38 | 24,0.0001636920 39 | 26,0.0002092548 40 | 29,0.0002703317 41 | 32,0.0003729074 42 | 34,0.0004653672 43 | 38,0.0006168099 44 | 41,0.0008476423 45 | 45,0.0010729708 46 | 49,0.0014324164 47 | 53,0.0017386005 48 | 58,0.0028449365 49 | 64,0.0031177020 50 | 69,0.0038735519 51 | 76,0.0062974793 52 | 82,0.0064873469 53 | 90,0.0098341099 54 | 98,0.0116686863 55 | 107,0.0157851631 56 | 117,0.0202608931 57 | 128,0.0292374706 58 | 139,0.0367351869 59 | 152,0.0526993296 60 | 165,0.0644459866 61 | 181,0.0702350321 62 | 197,0.0879495323 63 | 215,0.1242069765 64 | 234,0.1359054587 65 | 256,0.1859243927 66 | 279,0.2025442452 67 | 304,0.2457022567 68 | 331,0.3658846812 69 | 362,0.4848338147 70 | 394,0.5143358856 71 | 430,0.7882646333 72 | 469,0.8928990397 73 | 512,1.1656835586 74 | 558,1.3195744775 75 | 608,1.6413668097 76 | 663,1.8156536501 77 | 
724,2.2290742492 78 | 789,2.6347425512 79 | 861,2.7744974657 80 | 939,3.3441175965 81 | 1024,3.7699864220 82 | 1116,4.2669058057 83 | 1217,4.2239304343 84 | 1327,4.2999952491 85 | 1448,5.0765567637 86 | 1579,4.8243019385 87 | 1722,5.5287772808 88 | 1878,5.8710045088 89 | 2048,6.2494979996 90 | 2233,6.0713500257 91 | 2435,5.6761027623 92 | 2655,6.5709195721 93 | 2896,7.4728911051 94 | 3158,7.9778022427 95 | 3444,8.1693656027 96 | 3756,7.9928773714 97 | 4096,8.5269678381 98 | 4466,8.3420676045 99 | 4870,7.4717510687 100 | 5311,8.1053401717 101 | 5792,8.1765165436 102 | 6316,8.2193813665 103 | 6888,8.2207526766 104 | 7512,8.2530108115 105 | 8192,8.6345897045 106 | 8933,8.3026164362 107 | 9741,7.9344509507 108 | -------------------------------------------------------------------------------- /matmul_times/g3-float16.csv: -------------------------------------------------------------------------------- 1 | 1,0.0000000054 2 | 1,0.0000000051 3 | 1,0.0000000065 4 | 1,0.0000000044 5 | 1,0.0000000048 6 | 1,0.0000000045 7 | 1,0.0000000042 8 | 1,0.0000000048 9 | 2,0.0000000598 10 | 2,0.0000000522 11 | 2,0.0000000602 12 | 2,0.0000000592 13 | 2,0.0000000569 14 | 3,0.0000002437 15 | 3,0.0000002260 16 | 3,0.0000002362 17 | 4,0.0000005616 18 | 4,0.0000005722 19 | 4,0.0000005844 20 | 5,0.0000012099 21 | 5,0.0000011887 22 | 6,0.0000018102 23 | 6,0.0000016259 24 | 7,0.0000028216 25 | 8,0.0000040517 26 | 8,0.0000043483 27 | 9,0.0000062360 28 | 10,0.0000095128 29 | 11,0.0000114769 30 | 12,0.0000153840 31 | 13,0.0000180906 32 | 14,0.0000243247 33 | 16,0.0000400715 34 | 17,0.0000377767 35 | 19,0.0000514943 36 | 20,0.0000683860 37 | 22,0.0000865112 38 | 24,0.0001051213 39 | 26,0.0001426751 40 | 29,0.0001913256 41 | 32,0.0002875058 42 | 34,0.0003051285 43 | 38,0.0004346936 44 | 41,0.0005442804 45 | 45,0.0007014911 46 | 49,0.0010251019 47 | 53,0.0012280063 48 | 58,0.0016934492 49 | 64,0.0020862005 50 | 69,0.0027503521 51 | 76,0.0031500031 52 | 82,0.0035116997 53 | 90,0.0046931443 54 | 98,0.0081784715 55 | 107,0.0083872862 56 | 117,0.0115189820 57 | 128,0.0190747343 58 | 139,0.0185188080 59 | 152,0.0249379696 60 | 165,0.0328195935 61 | 181,0.0387405693 62 | 197,0.0508121822 63 | 215,0.0742819229 64 | 234,0.0777197828 65 | 256,0.1125915606 66 | 279,0.1304913186 67 | 304,0.1583694332 68 | 331,0.1888342444 69 | 362,0.2387355070 70 | 394,0.2557542415 71 | 430,0.3165042259 72 | 469,0.4105311260 73 | 512,0.5243249028 74 | 558,0.5828114282 75 | 608,0.6143411728 76 | 663,0.6469807857 77 | 724,0.9105040736 78 | 789,0.9376811718 79 | 861,1.0619575012 80 | 939,1.2155576995 81 | 1024,1.4132319602 82 | 1116,1.4260724553 83 | 1217,1.4432986598 84 | 1327,1.4319563294 85 | 1448,1.4471410970 86 | 1579,1.5607382246 87 | 1722,1.6237686216 88 | 1878,1.9481190133 89 | 2048,2.3587374197 90 | 2233,2.4338772716 91 | 2435,2.5985527843 92 | 2655,3.1787637172 93 | 2896,3.4698303371 94 | 3158,3.8602679332 95 | 3444,4.0042434553 96 | 3756,3.8255689503 97 | 4096,4.0256793455 98 | 4466,4.0370775306 99 | 4870,3.8937734305 100 | 5311,3.9990913681 101 | 5792,3.9777389470 102 | 6316,4.0011357263 103 | 6888,4.0787678870 104 | 7512,4.0671405648 105 | 8192,4.1312549545 106 | 8933,4.0862087134 107 | 9741,4.0105048138 108 | 10623,4.1034223568 109 | 11585,4.0652498675 110 | 12633,4.0691972371 111 | -------------------------------------------------------------------------------- /matmul_times/g3-float32.csv: -------------------------------------------------------------------------------- 1 | 1,0.0000000073 2 | 1,0.0000000048 3 | 1,0.0000000054 4 | 
1,0.0000000052 5 | 1,0.0000000055 6 | 1,0.0000000052 7 | 1,0.0000000051 8 | 1,0.0000000048 9 | 2,0.0000000586 10 | 2,0.0000000613 11 | 2,0.0000000606 12 | 2,0.0000000604 13 | 2,0.0000000590 14 | 3,0.0000002199 15 | 3,0.0000002166 16 | 3,0.0000002331 17 | 4,0.0000005445 18 | 4,0.0000005953 19 | 4,0.0000005509 20 | 5,0.0000011035 21 | 5,0.0000010665 22 | 6,0.0000018245 23 | 6,0.0000020743 24 | 7,0.0000030141 25 | 8,0.0000048806 26 | 8,0.0000041701 27 | 9,0.0000067732 28 | 10,0.0000091322 29 | 11,0.0000126449 30 | 12,0.0000167117 31 | 13,0.0000194493 32 | 14,0.0000258426 33 | 16,0.0000528046 34 | 17,0.0000458252 35 | 19,0.0000690545 36 | 20,0.0000746672 37 | 22,0.0001094539 38 | 24,0.0001388742 39 | 26,0.0001667570 40 | 29,0.0002413090 41 | 32,0.0003086639 42 | 34,0.0003747490 43 | 38,0.0005190560 44 | 41,0.0005871084 45 | 45,0.0009346655 46 | 49,0.0010798767 47 | 53,0.0014085849 48 | 58,0.0022631995 49 | 64,0.0025011876 50 | 69,0.0032449305 51 | 76,0.0034306438 52 | 82,0.0041311552 53 | 90,0.0051279059 54 | 98,0.0074588679 55 | 107,0.0098178931 56 | 117,0.0124193482 57 | 128,0.0160830886 58 | 139,0.0211496424 59 | 152,0.0247961973 60 | 165,0.0338157899 61 | 181,0.0430062583 62 | 197,0.0542269884 63 | 215,0.0718169422 64 | 234,0.0988467147 65 | 256,0.1196925419 66 | 279,0.1455098916 67 | 304,0.1972818843 68 | 331,0.1772180971 69 | 362,0.2636590135 70 | 394,0.2754185967 71 | 430,0.3726414457 72 | 469,0.3835439101 73 | 512,0.6403236611 74 | 558,0.6271523764 75 | 608,0.6723487806 76 | 663,0.6716123262 77 | 724,0.8588339883 78 | 789,0.9945220783 79 | 861,1.1051620527 80 | 939,1.2015233788 81 | 1024,1.3994641071 82 | 1116,1.3509447411 83 | 1217,1.4638138307 84 | 1327,1.4528602068 85 | 1448,1.4676407356 86 | 1579,1.5869783426 87 | 1722,1.6656387593 88 | 1878,1.9194971156 89 | 2048,2.3645307724 90 | 2233,2.4380628450 91 | 2435,2.6216720574 92 | 2655,3.1950040147 93 | 2896,3.4910323127 94 | 3158,3.8479634522 95 | 3444,3.9335505780 96 | 3756,3.8047738191 97 | 4096,3.9912365038 98 | 4466,3.9395745634 99 | 4870,3.7676875717 100 | 5311,3.8672090765 101 | 5792,3.8548877523 102 | 6316,3.8473078459 103 | 6888,3.9200867856 104 | 7512,3.8976030316 105 | 8192,3.9648233326 106 | 8933,3.9030829731 107 | -------------------------------------------------------------------------------- /matmul_times/nvidia-p3-float16.csv: -------------------------------------------------------------------------------- 1 | 1,0.0000000052 2 | 1,0.0000000041 3 | 1,0.0000000046 4 | 1,0.0000000049 5 | 1,0.0000000042 6 | 1,0.0000000041 7 | 1,0.0000000046 8 | 1,0.0000000055 9 | 2,0.0000000554 10 | 2,0.0000000637 11 | 2,0.0000000545 12 | 2,0.0000000532 13 | 2,0.0000000507 14 | 3,0.0000002001 15 | 3,0.0000002108 16 | 3,0.0000002082 17 | 4,0.0000004921 18 | 4,0.0000004808 19 | 4,0.0000005280 20 | 5,0.0000010680 21 | 5,0.0000010738 22 | 6,0.0000017084 23 | 6,0.0000017649 24 | 7,0.0000027460 25 | 8,0.0000041874 26 | 8,0.0000042690 27 | 9,0.0000069834 28 | 10,0.0000086023 29 | 11,0.0000106921 30 | 12,0.0000133234 31 | 13,0.0000179914 32 | 14,0.0000246968 33 | 16,0.0000337639 34 | 17,0.0000395959 35 | 19,0.0000656157 36 | 20,0.0000653406 37 | 22,0.0000929182 38 | 24,0.0001354284 39 | 26,0.0001817980 40 | 29,0.0002643254 41 | 32,0.0002998829 42 | 34,0.0003157878 43 | 38,0.0004163927 44 | 41,0.0007430828 45 | 45,0.0007193491 46 | 49,0.0012699617 47 | 53,0.0014592136 48 | 58,0.0019013776 49 | 64,0.0028069479 50 | 69,0.0026474907 51 | 76,0.0037250988 52 | 82,0.0044074782 53 | 90,0.0076624283 54 | 98,0.0099851778 55 | 107,0.0103070034 56 | 
117,0.0138518935 57 | 128,0.0231608708 58 | 139,0.0216869965 59 | 152,0.0316261520 60 | 165,0.0363883348 61 | 181,0.0517542283 62 | 197,0.0623640412 63 | 215,0.0814378555 64 | 234,0.1148254428 65 | 256,0.1336743332 66 | 279,0.2045750757 67 | 304,0.2282127190 68 | 331,0.3014729031 69 | 362,0.3264810883 70 | 394,0.4374849399 71 | 430,0.6657965652 72 | 469,0.6715217545 73 | 512,1.1204232802 74 | 558,0.9380739764 75 | 608,1.8426026193 76 | 663,1.4936602735 77 | 724,2.0298141173 78 | 789,2.4388995886 79 | 861,2.8479440678 80 | 939,3.5406921338 81 | 1024,8.3014007112 82 | 1116,4.9172887222 83 | 1217,4.8020404036 84 | 1327,6.9848150236 85 | 1448,14.6789663308 86 | 1579,7.2316324211 87 | 1722,8.7278280040 88 | 1878,10.5813931247 89 | 2048,27.3441403434 90 | 2233,12.1147870981 91 | 2435,12.0358759694 92 | 2655,12.1954933158 93 | 2896,46.6051363046 94 | 3158,13.2800244576 95 | 3444,13.8267628176 96 | 3756,13.7995925164 97 | 4096,69.9589102386 98 | 4466,14.3317932735 99 | 4870,14.1166294853 100 | 5311,14.9101748975 101 | 5792,71.8273441365 102 | 6316,14.8697839164 103 | 6888,75.0021194804 104 | 7512,76.0847702634 105 | 8192,87.2323633474 106 | 8933,15.2443599021 107 | 9741,15.0255254543 108 | 10623,15.4011254535 109 | 11585,15.3233762417 110 | 12633,15.4141927233 111 | 13777,15.4542546400 112 | 15024,51.3086154117 113 | 16384,54.8225731495 114 | 17866,15.4363718738 115 | 19483,15.4177083800 116 | -------------------------------------------------------------------------------- /matmul_times/nvidia-p3-float32.csv: -------------------------------------------------------------------------------- 1 | 1,0.0000000064 2 | 1,0.0000000049 3 | 1,0.0000000051 4 | 1,0.0000000050 5 | 1,0.0000000052 6 | 1,0.0000000042 7 | 1,0.0000000046 8 | 1,0.0000000045 9 | 2,0.0000000510 10 | 2,0.0000000485 11 | 2,0.0000000479 12 | 2,0.0000000548 13 | 2,0.0000000471 14 | 3,0.0000002029 15 | 3,0.0000002306 16 | 3,0.0000001912 17 | 4,0.0000004770 18 | 4,0.0000004901 19 | 4,0.0000005156 20 | 5,0.0000009719 21 | 5,0.0000009661 22 | 6,0.0000017499 23 | 6,0.0000017244 24 | 7,0.0000031790 25 | 8,0.0000038894 26 | 8,0.0000038526 27 | 9,0.0000059240 28 | 10,0.0000106741 29 | 11,0.0000110602 30 | 12,0.0000139035 31 | 13,0.0000181564 32 | 14,0.0000236016 33 | 16,0.0000321635 34 | 17,0.0000413539 35 | 19,0.0000546057 36 | 20,0.0000616567 37 | 22,0.0001286502 38 | 24,0.0001390447 39 | 26,0.0001460748 40 | 29,0.0001951062 41 | 32,0.0002772766 42 | 34,0.0003086856 43 | 38,0.0004384349 44 | 41,0.0005533988 45 | 45,0.0007149317 46 | 49,0.0009276363 47 | 53,0.0013331269 48 | 58,0.0018695030 49 | 64,0.0022682527 50 | 69,0.0031001839 51 | 76,0.0035845403 52 | 82,0.0045948409 53 | 90,0.0074623255 54 | 98,0.0074607993 55 | 107,0.0097530265 56 | 117,0.0138274526 57 | 128,0.0186669288 58 | 139,0.0286824569 59 | 152,0.0268727477 60 | 165,0.0355386730 61 | 181,0.0473796592 62 | 197,0.0659762906 63 | 215,0.0864406612 64 | 234,0.0885813776 65 | 256,0.1523531400 66 | 279,0.1636012400 67 | 304,0.2584961682 68 | 331,0.2811011999 69 | 362,0.3519482168 70 | 394,0.4218602630 71 | 430,0.5220794553 72 | 469,0.5562290865 73 | 512,0.9788097312 74 | 558,0.8579025595 75 | 608,1.2380418081 76 | 663,1.5557990302 77 | 724,2.1672233818 78 | 789,2.5753768218 79 | 861,2.2584686231 80 | 939,3.0033014996 81 | 1024,3.8812342065 82 | 1116,4.1678976639 83 | 1217,4.4648922758 84 | 1327,5.3470564838 85 | 1448,6.3065089240 86 | 1579,6.8292859679 87 | 1722,7.5663780618 88 | 1878,8.2142042796 89 | 2048,8.9502136919 90 | 2233,10.0775378689 91 | 2435,9.4611509618 92 | 
2655,10.4873276759 93 | 2896,11.0233645833 94 | 3158,11.7347679907 95 | 3444,11.8439367523 96 | 3756,11.8604344909 97 | 4096,13.3133777524 98 | 4466,14.6612454411 99 | 4870,14.1017926913 100 | 5311,15.0166186211 101 | 5792,14.5565515674 102 | 6316,14.8739420297 103 | 6888,14.7140872238 104 | 7512,15.1461848986 105 | 8192,15.2734148530 106 | 8933,15.2323380953 107 | 9741,14.9980861870 108 | 10623,15.3962168307 109 | 11585,15.3180348029 110 | 12633,15.4027308920 111 | 13777,15.4373195183 112 | -------------------------------------------------------------------------------- /matmul_times/p2-float16.csv: -------------------------------------------------------------------------------- 1 | 1,0.0000000067 2 | 1,0.0000000072 3 | 1,0.0000000069 4 | 1,0.0000000069 5 | 1,0.0000000072 6 | 1,0.0000000072 7 | 1,0.0000000072 8 | 1,0.0000000069 9 | 2,0.0000000827 10 | 2,0.0000000814 11 | 2,0.0000000822 12 | 2,0.0000000868 13 | 2,0.0000000825 14 | 3,0.0000002982 15 | 3,0.0000002975 16 | 3,0.0000003082 17 | 4,0.0000007697 18 | 4,0.0000007342 19 | 4,0.0000007368 20 | 5,0.0000014666 21 | 5,0.0000015306 22 | 6,0.0000026006 23 | 6,0.0000027080 24 | 7,0.0000041761 25 | 8,0.0000063052 26 | 8,0.0000066057 27 | 9,0.0000089852 28 | 10,0.0000134527 29 | 11,0.0000164623 30 | 12,0.0000217909 31 | 13,0.0000285714 32 | 14,0.0000345181 33 | 16,0.0000524895 34 | 17,0.0000587160 35 | 19,0.0000842484 36 | 20,0.0000992910 37 | 22,0.0001303871 38 | 24,0.0001693906 39 | 26,0.0002135477 40 | 29,0.0002919552 41 | 32,0.0003960148 42 | 34,0.0004729200 43 | 38,0.0006482468 44 | 41,0.0008219702 45 | 45,0.0010681646 46 | 49,0.0013798360 47 | 53,0.0017229605 48 | 58,0.0022514637 49 | 64,0.0030171800 50 | 69,0.0037498750 51 | 76,0.0050644601 52 | 82,0.0060690625 53 | 90,0.0078262982 54 | 98,0.0099887460 55 | 107,0.0128728047 56 | 117,0.0163691077 57 | 128,0.0214060202 58 | 139,0.0268555938 59 | 152,0.0346719627 60 | 165,0.0429776979 61 | 181,0.0556467780 62 | 197,0.0694491538 63 | 215,0.0887498153 64 | 234,0.1106947986 65 | 256,0.1407321169 66 | 279,0.1764994256 67 | 304,0.2221952034 68 | 331,0.2682980740 69 | 362,0.3377952514 70 | 394,0.4052823128 71 | 430,0.5053214475 72 | 469,0.6015845170 73 | 512,0.7474339226 74 | 558,0.8127210424 75 | 608,0.9427789145 76 | 663,0.8139654736 77 | 724,0.9727766023 78 | 789,1.1184685269 79 | 861,1.2510366606 80 | 939,1.1916444116 81 | 1024,1.3306496804 82 | 1116,1.2824753318 83 | 1217,1.3783316294 84 | 1327,1.4173754597 85 | 1448,1.4487053520 86 | 1579,1.4887979924 87 | 1722,1.5300201882 88 | 1878,1.5429387891 89 | 2048,1.6414710251 90 | 2233,1.6066053016 91 | 2435,1.5576891764 92 | 2655,1.6047121464 93 | 2896,1.5857469506 94 | 3158,1.6055466919 95 | 3444,1.6241025204 96 | 3756,1.6241688208 97 | 4096,1.6478262379 98 | 4466,1.6268917333 99 | 4870,1.6020022079 100 | 5311,1.5891106124 101 | 5792,1.5323465126 102 | 6316,1.4899392695 103 | 6888,1.4858769476 104 | 7512,1.4761556421 105 | 8192,1.5168820074 106 | 8933,1.4726656810 107 | 9741,1.4652164240 108 | 10623,1.4744597313 109 | 11585,1.4591969299 110 | 12633,1.4463543210 111 | 13777,1.4481929981 112 | 15024,1.4724910144 113 | 16384,1.4893490224 114 | -------------------------------------------------------------------------------- /matmul_times/p2-float32.csv: -------------------------------------------------------------------------------- 1 | 1,0.0000000062 2 | 1,0.0000000068 3 | 1,0.0000000065 4 | 1,0.0000000064 5 | 1,0.0000000065 6 | 1,0.0000000064 7 | 1,0.0000000065 8 | 1,0.0000000065 9 | 2,0.0000000753 10 | 2,0.0000000749 11 | 2,0.0000000752 12 | 
2,0.0000000716 13 | 2,0.0000000746 14 | 3,0.0000002802 15 | 3,0.0000002800 16 | 3,0.0000002791 17 | 4,0.0000006948 18 | 4,0.0000006977 19 | 4,0.0000006917 20 | 5,0.0000013994 21 | 5,0.0000013979 22 | 6,0.0000024555 23 | 6,0.0000023712 24 | 7,0.0000037908 25 | 8,0.0000057672 26 | 8,0.0000059771 27 | 9,0.0000087234 28 | 10,0.0000117764 29 | 11,0.0000151661 30 | 12,0.0000202780 31 | 13,0.0000260046 32 | 14,0.0000323737 33 | 16,0.0000465056 34 | 17,0.0000536368 35 | 19,0.0000743973 36 | 20,0.0000867669 37 | 22,0.0001151412 38 | 24,0.0001512901 39 | 26,0.0001919364 40 | 29,0.0002657910 41 | 32,0.0003578811 42 | 34,0.0004240251 43 | 38,0.0005969277 44 | 41,0.0007429814 45 | 45,0.0009892200 46 | 49,0.0012556719 47 | 53,0.0016050206 48 | 58,0.0021076201 49 | 64,0.0028424239 50 | 69,0.0034972628 51 | 76,0.0046591594 52 | 82,0.0058126913 53 | 90,0.0079179318 54 | 98,0.0098811798 55 | 107,0.0127129541 56 | 117,0.0175084783 57 | 128,0.0224006348 58 | 139,0.0278861869 59 | 152,0.0368665318 60 | 165,0.0469158064 61 | 181,0.0609087915 62 | 197,0.0734990603 63 | 215,0.0931482866 64 | 234,0.1193392140 65 | 256,0.1542684148 66 | 279,0.1888398340 67 | 304,0.2380925611 68 | 331,0.2674025149 69 | 362,0.3328934757 70 | 394,0.3741520061 71 | 430,0.4676547051 72 | 469,0.5209803082 73 | 512,0.6465247933 74 | 558,0.7790807773 75 | 608,0.8420655397 76 | 663,1.0007404240 77 | 724,1.0769312388 78 | 789,1.2099749766 79 | 861,1.2660685126 80 | 939,1.2971475261 81 | 1024,1.5355688801 82 | 1116,1.5382477777 83 | 1217,1.5904825608 84 | 1327,1.6077547562 85 | 1448,1.7429501426 86 | 1579,1.8778230197 87 | 1722,1.8987380664 88 | 1878,1.8820152242 89 | 2048,1.9533219189 90 | 2233,2.3653780610 91 | 2435,2.3538986067 92 | 2655,3.0718712072 93 | 2896,3.2225252910 94 | 3158,3.1766847222 95 | 3444,3.2306806382 96 | 3756,3.2422227956 97 | 4096,3.2994986714 98 | 4466,3.3113207461 99 | 4870,3.1896135517 100 | 5311,3.2127539480 101 | 5792,3.3224748444 102 | 6316,3.0620875510 103 | 6888,3.1754668235 104 | 7512,3.1448125620 105 | 8192,3.1791185412 106 | 8933,2.9275908341 107 | 9741,2.8994841052 108 | 10623,2.8849807107 109 | 11585,2.8143021900 110 | -------------------------------------------------------------------------------- /mavelin/machine1.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tensorflow as tf 3 | 4 | cluster_spec = tf.train.ClusterSpec({ 5 | "a": { 0: "localhost:8000" }, 6 | "b": { 0: "localhost:8001" }, 7 | }) 8 | 9 | jobname = "a" 10 | taskid = 0 11 | server = tf.train.Server(cluster_spec, jobname, taskid) 12 | 13 | with tf.device("/job:a/task:0/cpu:0"): 14 | queue = tf.FIFOQueue( 15 | capacity=100, dtypes=[tf.int64], 16 | shapes=[[]], shared_name="a_queue", name="a_queue") 17 | 18 | if jobname == "a" and taskid == 0: 19 | enqueue_op = queue.enqueue(10) 20 | sess = tf.Session(server.target) 21 | while True: 22 | sess.run(enqueue_op) 23 | else: 24 | dequeue_op = queue.dequeue() 25 | sess = tf.Session(server.target) 26 | while True: 27 | print(sess.run(dequeue_op)) 28 | -------------------------------------------------------------------------------- /mavelin/machine3.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tensorflow as tf 3 | 4 | cluster_spec = tf.train.ClusterSpec({ 5 | "a": { 0: "localhost:8000" }, 6 | "b": { 1: "localhost:8001" }, 7 | }) 8 | 9 | DOFAIL=True 10 | 11 | jobname = "b" 12 | taskid = 1 13 | server = tf.train.Server(cluster_spec, jobname, taskid) 14 | 15 | with 
tf.device("/job:a/task:0/cpu:0"): 16 | queue = tf.FIFOQueue( 17 | capacity=100, dtypes=[tf.int64], 18 | shapes=[[]], shared_name="a_queue", name="a_queue") 19 | 20 | if jobname == "a" and taskid == 0: 21 | enqueue_op = queue.enqueue(10) 22 | sess = tf.Session(server.target) 23 | while True: 24 | sess.run(enqueue_op) 25 | else: 26 | with tf.device("/job:b/task:1"): 27 | out = queue.dequeue() 28 | queue_b = tf.FIFOQueue(capacity=100, dtypes=[tf.int64], shapes=[[]], name="b_queue") 29 | if DOFAIL: 30 | out = tf.cond(tf.equal(out, 10), lambda: queue_b.enqueue(out), lambda: tf.no_op()) 31 | g = tf.get_default_graph() 32 | from tensorflow.core.framework import attr_value_pb2 33 | op = g.get_operation_by_name('cond/b_queue_enqueue/Switch_1') 34 | op.node_def.attr["_class"].CopyFrom(attr_value_pb2.AttrValue( 35 | list=attr_value_pb2.AttrValue.ListValue(s=[]))) 36 | 37 | op = g.get_operation_by_name('cond/b_queue_enqueue/Switch') 38 | op.node_def.attr["_class"].CopyFrom(attr_value_pb2.AttrValue( 39 | list=attr_value_pb2.AttrValue.ListValue(s=[]))) 40 | 41 | with open('fail.pbtxt', 'w') as outf: 42 | outf.write(str(tf.get_default_graph().as_graph_def())) 43 | else: 44 | enq = queue_b.enqueue(out) 45 | no_op = tf.no_op() 46 | out = tf.cond(tf.equal(out, 10), lambda: enq, lambda: no_op) 47 | with open('pass.pbtxt', 'w') as outf: 48 | outf.write(str(tf.get_default_graph().as_graph_def())) 49 | 50 | 51 | sess = tf.Session(server.target) 52 | while True: 53 | print(sess.run(out)) 54 | -------------------------------------------------------------------------------- /notebook_util.py: -------------------------------------------------------------------------------- 1 | import subprocess, re, os, sys 2 | 3 | # GPU picking 4 | # http://stackoverflow.com/a/41638727/419116 5 | # Nvidia-smi GPU memory parsing. 
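# Intended usage, a sketch (module name assumed to match this file): do the GPU
# selection *before* importing TensorFlow, since CUDA_VISIBLE_DEVICES is only
# read at import time, e.g.
#   import notebook_util
#   notebook_util.setup_one_gpu()
#   import tensorflow as tf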
6 | # Tested on nvidia-smi 370.23 7 | 8 | def run_command(cmd): 9 | """Run command, return output as string.""" 10 | 11 | output = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True).communicate()[0] 12 | return output.decode("ascii") 13 | 14 | def list_available_gpus(): 15 | """Returns list of available GPU ids.""" 16 | 17 | output = run_command("nvidia-smi -L") 18 | # lines of the form GPU 0: TITAN X 19 | gpu_regex = re.compile(r"GPU (?P<gpu_id>\d+):") 20 | result = [] 21 | for line in output.strip().split("\n"): 22 | m = gpu_regex.match(line) 23 | assert m, "Couldn't parse "+line 24 | result.append(int(m.group("gpu_id"))) 25 | return result 26 | 27 | def gpu_memory_map(): 28 | """Returns map of GPU id to memory allocated on that GPU.""" 29 | 30 | output = run_command("nvidia-smi") 31 | gpu_output = output[output.find("GPU Memory"):] 32 | # lines of the form 33 | # | 0 8734 C python 11705MiB | 34 | memory_regex = re.compile(r"[|]\s+?(?P<gpu_id>\d+)\D+?(?P<pid>\d+).+[ ](?P<gpu_memory>\d+)MiB") 35 | rows = gpu_output.split("\n") 36 | result = {gpu_id: 0 for gpu_id in list_available_gpus()} 37 | for row in rows: 38 | m = memory_regex.search(row) 39 | if not m: 40 | continue 41 | gpu_id = int(m.group("gpu_id")) 42 | gpu_memory = int(m.group("gpu_memory")) 43 | result[gpu_id] += gpu_memory 44 | return result 45 | 46 | def pick_gpu_lowest_memory(): 47 | """Returns GPU with the least allocated memory""" 48 | 49 | memory_gpu_map = [(memory, gpu_id) for (gpu_id, memory) in gpu_memory_map().items()] 50 | best_memory, best_gpu = sorted(memory_gpu_map)[0] 51 | return best_gpu 52 | 53 | def setup_one_gpu(): 54 | assert 'tensorflow' not in sys.modules, "GPU setup must happen before importing TensorFlow" 55 | gpu_id = pick_gpu_lowest_memory() 56 | print("Picking GPU "+str(gpu_id)) 57 | os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" 58 | os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id) 59 | 60 | def setup_no_gpu(): 61 | if 'tensorflow' in sys.modules: 62 | print("Warning, GPU setup must happen before importing TensorFlow") 63 | os.environ["CUDA_VISIBLE_DEVICES"] = '' 64 | -------------------------------------------------------------------------------- /queue_mismatch.py: -------------------------------------------------------------------------------- 1 | # from http://stackoverflow.com/questions/41920371/tensorflow-multi-threaded-queuerunner?noredirect=1#comment71036438_41920371 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | import time 6 | 7 | batch_size = 4 8 | iters = 100 9 | a = tf.train.range_input_producer(10, shuffle=False, name="a", capacity=batch_size*iters).dequeue() 10 | b = tf.train.range_input_producer(10, shuffle=False, name="b", capacity=batch_size*iters).dequeue() 11 | c1, c2 = tf.train.batch([a,b], num_threads=batch_size, batch_size=batch_size, capacity=iters) 12 | config = tf.ConfigProto() 13 | config.operation_timeout_in_ms=5000 # terminate on long hangs 14 | #import pdb; pdb.set_trace() 15 | sess = tf.InteractiveSession(config=config) 16 | sess.run([tf.initialize_all_variables()]) 17 | 18 | coord = tf.train.Coordinator() 19 | threads = tf.train.start_queue_runners(sess, coord) 20 | 21 | 22 | time.sleep(1) 23 | coord.request_stop() 24 | coord.join(threads) 25 | #print("Queue runners: ") 26 | #for qr in tf.get_default_graph().get_collection(tf.GraphKeys.QUEUE_RUNNERS): 27 | # print("name: %s" %(qr.name)) 28 | # print("queue_name: %s" %(qr.queue.name)) 29 | # print("number of enqueue ops: %d"%(len(qr.enqueue_ops),)) 30 | 31 | results = [] 32 | for i in range(iters): 33 | d1,list1,list2 
= sess.run([tf.reduce_all(tf.equal(c1, c2)), c1, c2]) 34 | if not d1: 35 | print(list1) 36 | print(list2) 37 | results.append(d1) 38 | print("mismatches: %d/%d"%(iters-sum(results), iters)) 39 | 40 | 41 | coord.request_stop() 42 | -------------------------------------------------------------------------------- /queues_talk/slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaroslavvb/stuff/a8024ead315aa1b5d6976940b3a062178f0e499d/queues_talk/slides.pdf -------------------------------------------------------------------------------- /resnet_leak_report.py: -------------------------------------------------------------------------------- 1 | # test whether memory gets cleared on creating new sessions 2 | import sys, os, math, random 3 | os.environ['TF_CPP_MIN_LOG_LEVEL']='2' 4 | 5 | 6 | import tensorflow as tf 7 | import numpy as np 8 | 9 | if __name__=='__main__': 10 | for i in range(10): 11 | tf.reset_default_graph() 12 | sess = tf.InteractiveSession() 13 | 14 | size = 12000 15 | example_queue = tf.FIFOQueue(1, dtypes=[tf.float32], shapes=[[size]]) 16 | from tensorflow.python.ops import gen_random_ops 17 | image = tf.random_uniform([size]) 18 | example_enqueue_op = example_queue.enqueue([image]) 19 | sess.run(example_enqueue_op) 20 | sess.run(example_queue.close()) 21 | 22 | images = example_queue.dequeue_many(1) 23 | images = tf.concat([images]*size, axis=0) 24 | var = tf.Variable(tf.ones_like(images)) 25 | 26 | sess.run(tf.global_variables_initializer()) 27 | sess.run(tf.local_variables_initializer()) 28 | def relu(x): 29 | return tf.where(tf.less(x, 0.0), x, x, name='leaky_relu') 30 | cost = tf.reduce_sum(relu(images+var)) 31 | 32 | grads = tf.gradients(cost, var) 33 | _, memuse = sess.run([grads, tf.contrib.memory_stats.MaxBytesInUse()]) 34 | print("Run %d, GBs in use %.1f"%(i, memuse/10**9)) 35 | 36 | sess.close() 37 | del sess 38 | -------------------------------------------------------------------------------- /resnet_leak_report2.py: -------------------------------------------------------------------------------- 1 | # test whether memory gets cleared on creating new sessions 2 | import sys, os, math, random 3 | 4 | 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | if __name__=='__main__': 9 | try: 10 | 11 | from tensorflow.core.protobuf import rewriter_config_pb2 12 | rewrite_options = rewriter_config_pb2.RewriterConfig( 13 | disable_model_pruning=True, 14 | constant_folding=rewriter_config_pb2.RewriterConfig.OFF, 15 | memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL) 16 | optimizer_options = tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0) 17 | graph_options=tf.GraphOptions(optimizer_options=optimizer_options, 18 | rewrite_options=rewrite_options) 19 | config = tf.ConfigProto(graph_options=graph_options) 20 | sess = tf.Session(config=config) 21 | 22 | size = 12000 23 | num_runs = 10 24 | 25 | images = tf.random_uniform([size, size]) 26 | var = tf.Variable(tf.ones_like(images)) 27 | sess.run(var.initializer) 28 | for i in range(10): 29 | def relu(x): 30 | return tf.where(tf.less(x, 0.0), x, x, name='leaky_relu') 31 | cost = tf.reduce_sum(relu(images+var)) 32 | 33 | grads = tf.gradients(cost, var) 34 | _, memuse, memuse2 = sess.run([grads, tf.contrib.memory_stats.MaxBytesInUse(), tf.contrib.memory_stats.BytesInUse()]) 35 | print("Run %d, GBs in use %.2f, %.2f"%(i, memuse/10**9,memuse2/10**9)) 36 | except: 37 | pass 38 | finally: 39 | [memuse] = 
sess.run([tf.contrib.memory_stats.MaxBytesInUse()]) 40 | print("Memory GBs in use %.2f"%(memuse/10**9,)) 41 | 42 | 43 | # 576000000 44 | # 2017-09-21 14:53:23.483412: I tensorflow/core/framework/log_memory.cc:35] __LOG_MEMORY__ MemoryLogTensorOutput { step_id: 2 kernel_name: "gradients/leaky_relu_grad/zeros_like" tensor { dtype: DT_FLOAT shape { dim { size: 144000000 } } allocation_description { requested_bytes: 576000000 allocated_bytes: 576000000 allocator_name: "GPU_0_bfc" allocation_id: 6 ptr: 1109438113536 } } } 45 | -------------------------------------------------------------------------------- /resource_variable_test.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.python.ops import resource_variable_ops 3 | import portpicker 4 | 5 | port = portpicker.pick_unused_port() 6 | host = "127.0.0.1" 7 | job_name = "worker" 8 | cluster = {job_name: [host+":"+str(port)]} 9 | cluster_spec = tf.train.ClusterSpec(cluster).as_cluster_def() 10 | 11 | server = tf.train.Server(cluster_spec, job_name=job_name) 12 | sess = tf.Session(server.target) 13 | 14 | x = tf.get_variable("x", shape=[], dtype=tf.float32, 15 | initializer=tf.constant_initializer(2), use_resource=True) 16 | sess.run(tf.global_variables_initializer()) 17 | print(sess.run(x)) 18 | -------------------------------------------------------------------------------- /simple_train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import (DataLoader, SequentialSampler, 3 | TensorDataset) 4 | 5 | from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel 6 | 7 | 8 | def main(): 9 | # 3 examples 10 | train_dataset = 'small brown fox jumps over the lazy dog\n' \ 11 | 'small brown fox jumps over the lazy dog\n' \ 12 | 'small brown fox jumps over the lazy dog\n' 13 | tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt', 14 | special_tokens=[]) 15 | tokenized = [tokenizer.tokenize(t) for t in train_dataset.strip().split('\n')] 16 | 17 | encoded=[tokenizer.convert_tokens_to_ids(t) for t in tokenized] # 3x8 18 | dataset = TensorDataset(torch.tensor(encoded)) 19 | sampler = SequentialSampler(dataset) 20 | dataloader = DataLoader(dataset, sampler=sampler, batch_size=1) 21 | model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt') 22 | 23 | optimizer = torch.optim.SGD(model.parameters(), lr = 0.0001, momentum=0.9) 24 | 25 | batch = next(iter(dataloader)) 26 | batch=batch[0] # dataloader gives [batch] instead of batch...why? 27 | 28 | for i in range(20): 29 | loss = model(input_ids=batch, lm_labels=batch) 30 | print(loss.detach().numpy()) 31 | loss.backward() 32 | optimizer.step() 33 | optimizer.zero_grad() 34 | 35 | # Should produce this 36 | # 6.134997 37 | # 5.3747735 38 | # 5.164842 39 | # 4.8581843 40 | # 4.346232 41 | # 4.158811 42 | # 3.7503657 43 | # 3.29156 44 | # 2.8858535 45 | # 2.760832 46 | # 2.562772 47 | # 2.0645103 48 | # 1.6837901 49 | # 1.6822727 50 | # 1.5878279 51 | # 1.3873199 52 | # 1.158909 53 | # 0.92595655 54 | # 0.8487712 55 | # 0.82774204 56 | 57 | 58 | if __name__=='__main__': 59 | main() 60 | 61 | -------------------------------------------------------------------------------- /svd_benchmark.py: -------------------------------------------------------------------------------- 1 | # Fastest way to compute eigenvectors for 4k matrix? 
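# Usage, a sketch inferred from the sys.argv handling below:
#   python svd_benchmark.py gesdd   # one of: gesdd, gesvd, eigh, inv, inv2, linsolve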
2 | # 3 | # Xeon V3 benchmarks: 4 | # n=4096 eigs min: 27758.34, median: 28883.69 5 | # n=4096 gesdd min: 7241.70, median: 8477.95 6 | # n=4096 gesvd min=20487.48, median: 22057.64, 7 | # n=4096 inv min: 556.67, median: 579.25, 8 | # n=4096 linsolve: min: 534.40, median: 558.06, mean: 579.19 9 | # 10 | # Xeon V4: 11 | # n=4096 gesdd min: 5586.02, median: 6032.16 12 | # 13 | # 14 | # i7-5820K CPU @ 3.30GHz 15 | # n=4096 gesdd 7288.02, median: 7397.23, mean: 7478.78 16 | # n=4096 inv 520 msec 17 | # 18 | # after upgrading things 19 | # b'Intel(R) Math Kernel Library Version 2017.0.3 Product Build 20170413 for Intel(R) 64 architecture applications' 20 | # n=4096 inv 1427.54 21 | 22 | 23 | from scipy import linalg # for svd 24 | import numpy as np 25 | import time 26 | import sys 27 | 28 | methods = ['gesdd', 'gesvd', 'eigh', 'inv', 'inv2', 'linsolve'] 29 | 30 | if len(sys.argv)<2: 31 | method = methods[0] 32 | else: 33 | method = sys.argv[1] 34 | 35 | # from @eamartin 36 | def empty_aligned(n, align): 37 | """Get n bytes of memory wih alignment align.""" 38 | a = np.empty(n + (align - 1), dtype=np.float32) 39 | data_align = a.ctypes.data % align 40 | offset = 0 if data_align == 0 else (align - data_align) 41 | return a[offset : offset + n] 42 | 43 | assert method in methods 44 | 45 | n=4096 46 | #n=1024 47 | x_old = np.random.randn(n*n).reshape((n,n)).astype(dtype=np.float32) 48 | x = empty_aligned(n*n, 32).reshape((n, n)) 49 | x[:] = x_old 50 | x = x @ x.T 51 | 52 | x0 = np.random.randn(n).reshape((n,1)).astype(dtype=np.float32) 53 | 54 | start_time = time.time() 55 | times = [] 56 | 57 | print("n=%d %s "%(n, method)) 58 | for i in range(9): 59 | if method == 'gesdd': 60 | result = linalg.svd(x) 61 | elif method == 'gesvd': 62 | result = linalg.svd(x, lapack_driver='gesvd') 63 | elif method == 'eigh': 64 | result = linalg.eigh(x) 65 | elif method == 'inv': 66 | result = linalg.inv(x) 67 | elif method == 'inv2': 68 | result = linalg.inv(x, overwrite_a=True) 69 | elif method == 'linsolve': 70 | result = linalg.solve(x, x0) 71 | else: 72 | assert False 73 | new_time = time.time() 74 | elapsed_time = 1000*(new_time - start_time) 75 | print("%.2f msec" %(elapsed_time)) 76 | start_time = new_time 77 | times.append(elapsed_time) 78 | 79 | print("Times: min: %.2f, median: %.2f, mean: %.2f"%(np.min(times), np.median(times), np.mean(times))) 80 | 81 | 82 | # Other timings: svd 83 | # n=1000 Times: min: 126.04, median: 132.48 84 | # n=2000 Times: min: 573.03, median: 621.49 85 | # n=4096 Times: min: 5586.02, median: 6032.16 86 | # Other timings: inv 87 | # Times: min: 17.87, median: 23.41, mean: 27.90 88 | -------------------------------------------------------------------------------- /svd_noconverge.py: -------------------------------------------------------------------------------- 1 | import scipy 2 | import scipy.linalg as linalg 3 | import numpy as np 4 | import os 5 | import sys 6 | import ctypes 7 | import numpy as np 8 | 9 | def mklVersion(): 10 | ver = np.zeros(199, dtype=np.uint8) 11 | mkl = ctypes.cdll.LoadLibrary("libmkl_rt.so") 12 | mkl.MKL_Get_Version_String(ver.ctypes.data_as(ctypes.c_char_p), 198) 13 | return ver[ver != 0].tostring() 14 | 15 | # mklVersion() 16 | 17 | def download_if_needed(fn,target_length=0,bucket="yaroslavvb_stuff"): 18 | import urllib.request 19 | url="https://storage.googleapis.com/%s/%s"%(bucket, fn) 20 | response = urllib.request.urlopen(url) 21 | body = response.read() 22 | print("Read %d bytes from %s"%(len(body), url)) 23 | if target_length: 24 | assert 
len(body)==target_length 25 | 26 | open(fn, "wb").write(body) 27 | 28 | fn='badsvd0' 29 | download_if_needed(fn, 2458624) 30 | target0 = np.fromfile(fn, np.float32).reshape(784,784) 31 | 32 | success = True 33 | try: 34 | u0, s0, vt0 = linalg.svd(target0) 35 | except Exception as e: 36 | print("SVD failure") 37 | print(repr(e)) 38 | success = False 39 | else: 40 | print("SVD success") 41 | 42 | print("Scipy version: ", scipy.version.full_version) 43 | print("Numpy version: ", np.version.full_version) 44 | print("Python version: ", sys.version) 45 | print("Python binary: ", sys.executable) 46 | 47 | print("-"*80) 48 | print("MKL version:") 49 | print(mklVersion()) 50 | print("-"*80) 51 | print("Conda version:") 52 | os.system("conda list --explicit") 53 | print("-"*80) 54 | print("CPU version") 55 | for l in open("/proc/cpuinfo").read().split('\n'): 56 | if 'model name' in l: 57 | print(l) 58 | break 59 | 60 | if success: 61 | print("Success.") 62 | else: 63 | print("Failure.") 64 | 65 | # Upload notes: 66 | # export fullname=badsvd0 67 | # export bucket=yaroslavvb_stuff 68 | # gsutil cp $fullname gs://$bucket 69 | # gsutil acl set public-read gs://$bucket/$fullname 70 | # echo https://storage.googleapis.com/$bucket/$fullname 71 | -------------------------------------------------------------------------------- /tensorflow-memory-talk.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaroslavvb/stuff/a8024ead315aa1b5d6976940b3a062178f0e499d/tensorflow-memory-talk.pdf -------------------------------------------------------------------------------- /tiny_runs/qr_test.py: -------------------------------------------------------------------------------- 1 | # qr on 4096 x 4096 2 | # tf 6.89 3 | # np openblas 11.38 4 | # np mkl: 2.36 5 | 6 | import tensorflow as tf 7 | import time 8 | import numpy as np 9 | 10 | np.__config__.show() 11 | 12 | try: 13 | tf.reset_default_graph() 14 | n = 2048*2 15 | mat = tf.Variable(tf.random_uniform((n,n))) 16 | qr = tf.qr(mat) 17 | sess = tf.Session(config = tf.ConfigProto(graph_options=tf.GraphOptions(optimizer_options=tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)))) 18 | sess.run(tf.initialize_all_variables()) 19 | sess.run(qr[0].op) 20 | start_time = time.time() 21 | sess.run(qr[0].op) 22 | end_time = time.time() 23 | print("TF QR on %d by %d matrix in %.2f seconds"%(n, n, end_time-start_time)) 24 | except: 25 | print("No tf") 26 | 27 | a = np.random.randn(n, n) 28 | start_time = time.time() 29 | q, r = np.linalg.qr(a) 30 | end_time = time.time() 31 | print("numpy QR on %d by %d matrix in %.2f seconds"%(n, n, end_time-start_time)) 32 | --------------------------------------------------------------------------------