├── examples ├── __init__.py ├── saved_logs │ └── example_test_tube_data │ │ ├── demo_test_0 │ │ ├── version_0 │ │ │ ├── meta_tags.json │ │ │ ├── media │ │ │ │ └── jpg_0.jpg │ │ │ └── metrics.csv │ │ └── version_1 │ │ │ ├── meta_tags.json │ │ │ ├── media │ │ │ └── jpg_0.jpg │ │ │ └── metrics.csv │ │ └── demo_test_1 │ │ ├── version_0 │ │ ├── meta_tags.json │ │ ├── media │ │ │ └── jpg_0.jpg │ │ └── metrics.csv │ │ └── version_1 │ │ ├── meta_tags.json │ │ ├── media │ │ └── jpg_0.jpg │ │ └── metrics.csv ├── tensorflow_example.py ├── pytorch_hpc_example.py └── hpc_cpu_example.py ├── test_tube ├── hyper_opt_utils │ ├── __init__.py │ └── strategies.py ├── .DS_Store ├── __init__.py ├── hyperopt.py ├── argparse_hopt.py ├── hpc.py └── log.py ├── .DS_Store ├── imgs ├── viz_a.png └── test_tube_logo.png ├── docs ├── img │ └── viz_a.png ├── index.md ├── experiment_tracking │ └── experiment.md ├── hyperparameter_optimization │ └── HyperOptArgumentParser.md └── hpc │ └── SlurmCluster.md ├── site ├── img │ ├── viz_a.png │ └── favicon.ico ├── sitemap.xml.gz ├── fonts │ ├── fontawesome-webfont.eot │ ├── fontawesome-webfont.ttf │ └── fontawesome-webfont.woff ├── sitemap.xml ├── search │ ├── main.js │ └── worker.js ├── js │ ├── theme.js │ └── modernizr-2.8.3.min.js ├── css │ └── theme_extra.css ├── 404.html ├── search.html ├── index.html ├── experiment_tracking │ └── experiment │ │ └── index.html ├── hyperparameter_optimization │ └── HyperOptArgumentParser │ │ └── index.html └── hpc │ └── SlurmCluster │ └── index.html ├── requirements.txt ├── tests ├── log_test.py ├── argparse_hopt_test.py ├── hpc_test.py └── strategies_test.py ├── update.sh ├── mkdocs.yml ├── setup.cfg ├── .travis.yml ├── LICENSE ├── setup.py ├── .gitignore └── README.md /examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test_tube/hyper_opt_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/.DS_Store -------------------------------------------------------------------------------- /imgs/viz_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/imgs/viz_a.png -------------------------------------------------------------------------------- /docs/img/viz_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/docs/img/viz_a.png -------------------------------------------------------------------------------- /site/img/viz_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/site/img/viz_a.png -------------------------------------------------------------------------------- /site/sitemap.xml.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/site/sitemap.xml.gz -------------------------------------------------------------------------------- /test_tube/.DS_Store: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/test_tube/.DS_Store -------------------------------------------------------------------------------- /site/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/site/img/favicon.ico -------------------------------------------------------------------------------- /examples/saved_logs/example_test_tube_data/demo_test_0/version_0/meta_tags.json: -------------------------------------------------------------------------------- 1 | {"tag_b": "s", "tag_a": 2} -------------------------------------------------------------------------------- /examples/saved_logs/example_test_tube_data/demo_test_0/version_1/meta_tags.json: -------------------------------------------------------------------------------- 1 | {"tag_a": 2, "tag_b": "s"} -------------------------------------------------------------------------------- /examples/saved_logs/example_test_tube_data/demo_test_1/version_0/meta_tags.json: -------------------------------------------------------------------------------- 1 | {"tag_b": "s", "tag_a": 2} -------------------------------------------------------------------------------- /examples/saved_logs/example_test_tube_data/demo_test_1/version_1/meta_tags.json: -------------------------------------------------------------------------------- 1 | {"tag_a": 2, "tag_b": "s"} -------------------------------------------------------------------------------- /imgs/test_tube_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/imgs/test_tube_logo.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas>=0.20.3 2 | numpy>=1.13.3 3 | imageio>=2.3.0 4 | tensorboard>=1.15.0 5 | torch>=1.1.0 6 | future -------------------------------------------------------------------------------- /site/fonts/fontawesome-webfont.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/site/fonts/fontawesome-webfont.eot -------------------------------------------------------------------------------- /site/fonts/fontawesome-webfont.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/site/fonts/fontawesome-webfont.ttf -------------------------------------------------------------------------------- /site/fonts/fontawesome-webfont.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/site/fonts/fontawesome-webfont.woff -------------------------------------------------------------------------------- /tests/log_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_hello(): 5 | assert 4==4 6 | 7 | if __name__ == '__main__': 8 | pytest.main([__file__]) 9 | -------------------------------------------------------------------------------- /tests/argparse_hopt_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_hello(): 5 | assert 4==4 6 | 7 | if __name__ == '__main__': 8 | pytest.main([__file__]) 9 | 
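The two test modules above are placeholder stubs (`assert 4==4`). For illustration only, a fuller `argparse_hopt` test could drive `HyperOptArgumentParser` the same way `tests/hpc_test.py` and the docs do; the sketch below assumes `trials()` yields namespace-like trial objects as described in `docs/hyperparameter_optimization/HyperOptArgumentParser.md` and is not part of the repository's test suite:

```python
import pytest

from test_tube import HyperOptArgumentParser


def test_grid_search_trials():
    # Mirror the parser usage from the docs and tests/hpc_test.py.
    parser = HyperOptArgumentParser(strategy='grid_search')
    parser.opt_list('--nb_layers', default=2, type=int,
                    tunable=True, options=[2, 4, 8])
    hparams = parser.parse_args()

    # Defaults are untouched until a hyperparameter search runs.
    assert hparams.nb_layers == 2

    # Each generated trial should carry one of the listed option values.
    trials = list(hparams.trials(3))
    assert len(trials) == 3
    assert all(trial.nb_layers in (2, 4, 8) for trial in trials)


if __name__ == '__main__':
    pytest.main([__file__])
```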
-------------------------------------------------------------------------------- /examples/saved_logs/example_test_tube_data/demo_test_0/version_0/media/jpg_0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/examples/saved_logs/example_test_tube_data/demo_test_0/version_0/media/jpg_0.jpg -------------------------------------------------------------------------------- /examples/saved_logs/example_test_tube_data/demo_test_0/version_1/media/jpg_0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/examples/saved_logs/example_test_tube_data/demo_test_0/version_1/media/jpg_0.jpg -------------------------------------------------------------------------------- /examples/saved_logs/example_test_tube_data/demo_test_1/version_0/media/jpg_0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/examples/saved_logs/example_test_tube_data/demo_test_1/version_0/media/jpg_0.jpg -------------------------------------------------------------------------------- /examples/saved_logs/example_test_tube_data/demo_test_1/version_1/media/jpg_0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/examples/saved_logs/example_test_tube_data/demo_test_1/version_1/media/jpg_0.jpg -------------------------------------------------------------------------------- /test_tube/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Experiment logger module 3 | """ 4 | 5 | from .argparse_hopt import HyperOptArgumentParser 6 | from .hpc import SlurmCluster 7 | from .hyperopt import HyperParamOptimizer 8 | from .log import Experiment 9 | -------------------------------------------------------------------------------- /examples/saved_logs/example_test_tube_data/demo_test_0/version_0/metrics.csv: -------------------------------------------------------------------------------- 1 | created_at,fake_jpg,row_3,test 2 | 2017-10-13 02:07:28.005016,/Users/waf/test_tube_data/demo_test_0/version_0/media/jpg_0.jpg,,2 3 | 2017-10-13 02:07:28.005031,,3,2 4 | -------------------------------------------------------------------------------- /examples/saved_logs/example_test_tube_data/demo_test_0/version_1/metrics.csv: -------------------------------------------------------------------------------- 1 | created_at,fake_jpg,row_3,test 2 | 2017-10-13 02:07:37.395603,/Users/waf/Developer/log_suite/test_tube/test_tube/test_tube_data/demo_test_0/version_1/media/jpg_0.jpg,,2 3 | 2017-10-13 02:07:37.395635,,3,2 4 | -------------------------------------------------------------------------------- /examples/saved_logs/example_test_tube_data/demo_test_1/version_0/metrics.csv: -------------------------------------------------------------------------------- 1 | created_at,fake_jpg,row_3,test 2 | 2017-10-13 02:07:28.035057,/Users/waf/Developer/log_suite/test_tube/test_tube/test_tube_data/demo_test_1/version_0/media/jpg_0.jpg,,2 3 | 2017-10-13 02:07:28.035086,,3,2 4 | -------------------------------------------------------------------------------- /examples/saved_logs/example_test_tube_data/demo_test_1/version_1/metrics.csv: -------------------------------------------------------------------------------- 1 | 
created_at,fake_jpg,row_3,test 2 | 2017-10-13 02:07:37.443175,/Users/waf/Developer/log_suite/test_tube/test_tube/test_tube_data/demo_test_1/version_1/media/jpg_0.jpg,,2 3 | 2017-10-13 02:07:37.443252,,3,2 4 | -------------------------------------------------------------------------------- /update.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | version=$1 4 | 5 | git commit -am "release v$version" 6 | git tag $version -m "test_tube v$version" 7 | git push --tags origin master 8 | 9 | # push to pypi 10 | rm -rf ./dist/* 11 | python3 setup.py sdist 12 | twine upload dist/* 13 | 14 | 15 | 16 | # to update docs 17 | # cd to root dir 18 | # mkdocs gh-deploy 19 | 20 | 21 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Test tube Documentation 2 | theme: readthedocs 3 | docs_dir: docs 4 | repo_url: https://github.com/williamFalcon/test_tube 5 | site_dir: 'site' 6 | site_description: 'Documentation for Test Tube, the Python Deep Learning and Machine Learning experiment tracking and tuning framework.' 7 | 8 | dev_addr: '0.0.0.0:8000' 9 | #google_analytics: ['UA-aasd', 'sitename'] -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | 4 | [yapf] 5 | align_closing_bracket_with_visual_indent = True 6 | # Put braces on their own line. 7 | dedent_closing_brackets = True 8 | split_before_closing_bracket = True 9 | indent_width = 4 10 | coalesce_brackets = True 11 | allow_multiline_lambdas = True 12 | join_multiple_lines = True 13 | spaces_around_power_operator = False 14 | column_limit = 100 15 | -------------------------------------------------------------------------------- /site/sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | None 5 | 2019-08-03 6 | daily 7 | 8 | 9 | None 10 | 2019-08-03 11 | daily 12 | 13 | 14 | None 15 | 2019-08-03 16 | daily 17 | 18 | 19 | None 20 | 2019-08-03 21 | daily 22 | 23 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | # command to install dependencies 3 | cache: pip 4 | 5 | matrix: 6 | include: 7 | - python: 3.6 8 | dist: xenial # Ubuntu 16.04 9 | env: 10 | - MIN_REQUIREMENTS=1 11 | - python: 3.6 12 | dist: bionic # Ubuntu 18.04 13 | - python: 3.7 14 | dist: bionic # Ubuntu 18.04 15 | 16 | install: 17 | - if [[ "${MIN_REQUIREMENTS}" == "1" ]]; then 18 | python -c "req = open('requirements.txt').read().replace('>', '=') ; open('requirements-ci.txt', 'w').write(req)" ; 19 | pip install -r requirements-ci.txt ; 20 | fi 21 | - pip install -e . 
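# (The MIN_REQUIREMENTS build above rewrites '>' to '=' in requirements.txt, so the oldest pinned dependency versions are the ones installed and tested.)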
22 | 23 | 24 | # command to run tests 25 | script: 26 | - pytest # or py.test for Python versions 3.5 and below 27 | 28 | notifications: 29 | email: false 30 | -------------------------------------------------------------------------------- /tests/hpc_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from test_tube.argparse_hopt import HyperOptArgumentParser 4 | from test_tube.hpc import SlurmCluster 5 | 6 | 7 | def test_slurm_time_to_seconds(): 8 | parser = HyperOptArgumentParser() 9 | parsed = parser.parse_args() 10 | cluster = SlurmCluster(log_path='/home/travis', hyperparam_optimizer=parsed) 11 | 12 | assert cluster.slurm_time_to_seconds('15:00') == 900 13 | assert cluster.slurm_time_to_seconds('1-12:20:12') == 130812 14 | assert cluster.slurm_time_to_seconds('1:20:12') == 4812 15 | assert cluster.slurm_time_to_seconds('00:20:12') == 1212 16 | assert cluster.slurm_time_to_seconds('00:00:12') == 12 17 | assert cluster.slurm_time_to_seconds('12') == 12 18 | 19 | 20 | if __name__ == '__main__': 21 | pytest.main([__file__]) 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2017-2018 William Falcon 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | from setuptools import find_packages, setup 4 | 5 | version = '0.7.5' 6 | PATH_ROOT = os.path.dirname(__file__) 7 | 8 | 9 | def load_requirements(path_dir=PATH_ROOT, comment_char='#'): 10 | with open(os.path.join(path_dir, 'requirements.txt'), 'r') as file: 11 | lines = [ln.strip() for ln in file.readlines()] 12 | reqs = [] 13 | for ln in lines: 14 | # filer all comments 15 | if comment_char in ln: 16 | ln = ln[:ln.index(comment_char)] 17 | if ln: # if requirement is not empty 18 | reqs.append(ln) 19 | return reqs 20 | 21 | 22 | setup( 23 | name='test_tube', 24 | packages=find_packages(), 25 | version=version, 26 | description='Experiment logger and visualizer', 27 | author='William Falcon', 28 | install_requires=load_requirements(PATH_ROOT), 29 | author_email='will@hacstudios.com', 30 | url='https://github.com/williamFalcon/test_tube', 31 | download_url='https://github.com/williamFalcon/test_tube/archive/{}.tar.gz'.format(version), 32 | keywords=[ 33 | 'testing', 34 | 'machine learning', 35 | 'deep learning', 36 | 'prototyping', 37 | 'experimenting', 38 | 'modeling', 39 | ], 40 | ) 41 | -------------------------------------------------------------------------------- /tests/strategies_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from test_tube.hyper_opt_utils import strategies 4 | 5 | GRID_SEARCH = 'grid_search' 6 | RANDOM_SEARCH = 'random_search' 7 | 8 | FLAT_PARAMS = [ 9 | [ 10 | {'idx': 0, 'val': 0.0001, 'name': 'learning_rate'}, 11 | {'idx': 1, 'val': 0.001, 'name': 'learning_rate'}, 12 | {'idx': 2, 'val': 0.01, 'name': 'learning_rate'}, 13 | {'idx': 3, 'val': 0.1, 'name': 'learning_rate'} 14 | ], 15 | [ 16 | {'idx': 4, 'val': 0.99, 'name': 'decay'}, 17 | {'idx': 5, 'val': 0.999, 'name': 'decay'}, 18 | ] 19 | ] 20 | def test_unknown_strategy(): 21 | with pytest.raises(ValueError): 22 | strategies.generate_trials( 23 | 'unknown_strategy', FLAT_PARAMS, nb_trials=None) 24 | 25 | def test_grid_search_no_limit(): 26 | trials = strategies.generate_trials( 27 | GRID_SEARCH, FLAT_PARAMS, nb_trials=None) 28 | assert len(trials) == len(FLAT_PARAMS[0]) * len(FLAT_PARAMS[1]) 29 | 30 | def test_grid_search_limit(): 31 | trials = strategies.generate_trials( 32 | GRID_SEARCH, FLAT_PARAMS, nb_trials=5) 33 | assert len(trials) == 5 34 | 35 | 36 | def test_random_search(): 37 | trials = strategies.generate_trials( 38 | RANDOM_SEARCH, FLAT_PARAMS, nb_trials=5) 39 | assert len(trials) == 5 40 | 41 | def test_random_search_unbounded_error(): 42 | with pytest.raises(TypeError): 43 | trials = strategies.generate_trials( 44 | RANDOM_SEARCH, FLAT_PARAMS, nb_trials=None) 45 | 46 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | data/ 6 | test_tube_data/ 7 | *.experiment 8 | test.py 9 | example.json 10 | .pytest_cache/ 11 | talk/ 12 | .DS_Store 13 | 14 | # C extensions 15 | *.so 16 | 17 | src 18 | 19 | # Distribution / packaging 20 | .Python 21 | env/ 22 | build/ 23 | develop-eggs/ 24 | dist/ 25 | downloads/ 26 | eggs/ 27 | .eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | *.egg-info/ 34 | 
.installed.cfg 35 | *.egg 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *,cover 56 | .hypothesis/ 57 | 58 | .idea 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # IPython Notebook 82 | .ipynb_checkpoints 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # celery beat schedule file 88 | celerybeat-schedule 89 | 90 | # dotenv 91 | .env 92 | 93 | # virtualenv 94 | venv/ 95 | ENV/ 96 | 97 | # Spyder project settings 98 | .spyderproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | -------------------------------------------------------------------------------- /examples/tensorflow_example.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from test_tube import Experiment, HyperOptArgumentParser 4 | 5 | """ 6 | This script demonstrates how to do a hyperparameter search over 2 parameters in tensorflow 7 | on 4 simultaneous GPUs. Each trial will also save its own experiment logs. 8 | 9 | A single trial gets allocated on a single GPU until all trials have completed. 10 | This means for 10 trials and 4 GPUs, we'll run 4 in parallel twice and the last 2 trials in parallel. 
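Note: the graph code below uses the TensorFlow 1.x API (tf.placeholder / tf.Session); running it on TensorFlow 2.x would require the tf.compat.v1 compatibility layer.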
11 | """ 12 | 13 | 14 | # main training function (very simple) 15 | def train(hparams): 16 | # init exp and track all the parameters from the HyperOptArgumentParser 17 | exp = Experiment( 18 | name=hparams.test_tube_exp_name, 19 | save_dir=hparams.log_path, 20 | autosave=False, 21 | ) 22 | exp.argparse(hparams) 23 | 24 | # define tensorflow graph 25 | x = tf.placeholder(dtype=tf.int32, name='x') 26 | y = tf.placeholder(dtype=tf.int32, name='y') 27 | out = x * y 28 | 29 | sess = tf.Session() 30 | 31 | # Run the tf op 32 | for train_step in range(0, 100): 33 | output = sess.run(out, feed_dict={x: hparams.x_val, y: hparams.y_val}) 34 | exp.log({'fake_err': output}) 35 | 36 | # save exp when we're done 37 | exp.save() 38 | 39 | 40 | # set up our argparser and make the y_val tunable 41 | parser = HyperOptArgumentParser(strategy='random_search') 42 | parser.add_argument('--test_tube_exp_name', default='my_test') 43 | parser.add_argument('--log_path', default='/Users/waf/Desktop/test') 44 | parser.opt_list('--y_val', default=12, options=[1, 2, 3, 4], tunable=True) 45 | parser.opt_list('--x_val', default=12, options=[20, 12, 30, 45], tunable=True) 46 | hyperparams = parser.parse_args() 47 | 48 | 49 | # optimize on 4 gpus at the same time 50 | # each gpu will get 1 experiment with a set of hyperparams 51 | hyperparams.optimize_parallel_gpu(train, gpu_ids=['1', '0', '3', '2'], nb_trials=4, nb_workers=4) 52 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Test Tube: Easily log and tune Deep Learning experiments 2 | 3 | Test Tube allows you to easily log metadata and track your machine 4 | learning experiments. 5 | 6 | Use Test Tube if you need to: 7 | 8 | - Track many [Experiments](experiment_tracking/experiment.md) across 9 | models. 10 | - Visualize and compare different 11 | experiments without uploading anywhere. 12 | - [Optimize your 13 | hyperparameters](hyperparameter_optimization/HyperOptArgumentParser/) 14 | using grid search or random search. 15 | - Automatically track ALL parameters for a particular training run. 16 | 17 | Test Tube is compatible with: Python 2 and 3 18 | 19 | ## Getting started 20 | 21 | ------------------------------------------------------------------------ 22 | 23 | ### Create an [Experiment](experiment_tracking/experiment.md) 24 | 25 | ``` {.python} 26 | from test_tube import Experiment 27 | 28 | exp = Experiment(name='dense_model', 29 | debug=False, 30 | save_dir='/Desktop/test_tube') 31 | 32 | exp.tag({'learning_rate': 0.002, 'nb_layers': 2}) 33 | 34 | for step in training_steps: 35 | tng_err = model.eval(tng_x, tng_y) 36 | 37 | exp.log('tng_err': tng_err) 38 | 39 | # training complete! 
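# metrics are written to <save_dir>/<name>/version_<n>/metrics.csv (see the Visualize section below)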
40 | # all your logs and data are ready to be visualized at testtube.williamfalcon.com 41 | ``` 42 | 43 | ------------------------------------------------------------------------ 44 | 45 | ### Optimize your [hyperparameters](hyperparameter_optimization/HyperOptArgumentParser/) 46 | 47 | ``` {.python} 48 | from test_tube import HyperOptArgumentParser 49 | 50 | # subclass of argparse 51 | parser = HyperOptArgumentParser(strategy='random_search') 52 | parser.add_argument('--learning_rate', default=0.002, type=float, help='the learning rate') 53 | 54 | # let's enable optimizing over the number of layers in the network 55 | parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8]) 56 | 57 | # and tune the number of units in each layer 58 | parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=10) 59 | 60 | # compile (because it's argparse underneath) 61 | hparams = parser.parse_args() 62 | 63 | # run 20 trials of random search over the hyperparams 64 | for hparam_trial in hparams.trials(20): 65 | train_network(hparam_trial) 66 | ``` 67 | 68 | ------------------------------------------------------------------------ 69 | 70 | ### Visualize 71 | 72 | ``` {.python} 73 | import pandas as pd 74 | import matplotlib 75 | 76 | # each experiment is saved to a metrics.csv file which can be imported anywhere 77 | # images save to exp/version/images 78 | df = pd.read_csv('../some/dir/test_tube_data/dense_model/version_0/metrics.csv') 79 | df.tng_err.plot() 80 | ``` 81 | -------------------------------------------------------------------------------- /examples/pytorch_hpc_example.py: -------------------------------------------------------------------------------- 1 | """Example launcher for a hyperparameter search on SLURM. 2 | 3 | This example shows how to use gpus on SLURM with PyTorch. 4 | """ 5 | import torch 6 | 7 | from test_tube import Experiment, HyperOptArgumentParser, SlurmCluster 8 | 9 | 10 | def train(hparams, *args): 11 | """Train your awesome model. 12 | 13 | :param hparams: The arguments to run the model with. 14 | """ 15 | # Initialize experiments and track all the hyperparameters 16 | exp = Experiment( 17 | name=hparams.test_tube_exp_name, 18 | # Location to save the metrics. 19 | save_dir=hparams.log_path, 20 | autosave=False, 21 | ) 22 | exp.argparse(hparams) 23 | 24 | # Pretend to train. 25 | x = torch.rand((1, hparams.x_val)) 26 | for train_step in range(0, 100): 27 | y = torch.rand((hparams.x_val, 1)) 28 | out = x.mm(y) 29 | exp.log({'fake_err': out.item()}) 30 | 31 | # Save exp when . 32 | exp.save() 33 | 34 | 35 | if __name__ == '__main__': 36 | # Set up our argparser and make the y_val tunable. 37 | parser = HyperOptArgumentParser(strategy='random_search') 38 | parser.add_argument('--test_tube_exp_name', default='my_test') 39 | parser.add_argument('--log_path', default='/some/path/to/log') 40 | parser.opt_list('--y_val', 41 | default=12, options=[1, 2, 3, 4, 5, 6], tunable=True) 42 | parser.opt_list('--x_val', 43 | default=12, options=[20, 12, 30, 45], tunable=True) 44 | hyperparams = parser.parse_args() 45 | 46 | # Enable cluster training. 47 | cluster = SlurmCluster( 48 | hyperparam_optimizer=hyperparams, 49 | log_path=hyperparams.log_path, 50 | python_cmd='python3', 51 | test_tube_exp_name=hyperparams.test_tube_exp_name 52 | ) 53 | 54 | # Email results if your hpc supports it. 55 | cluster.notify_job_status( 56 | email='some@email.com', on_done=True, on_fail=True) 57 | 58 | # SLURM Module to load. 
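# (Module names are cluster-specific -- run `module avail` on your cluster's login node and adjust the list below to match.)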
59 | cluster.load_modules([ 60 | 'python-3', 61 | 'anaconda3' 62 | ]) 63 | 64 | # Add commands to the non-SLURM portion. 65 | cluster.add_command('source activate myCondaEnv') 66 | 67 | # Add custom SLURM commands which show up as: 68 | # #comment 69 | # #SBATCH --cmd=value 70 | # ############ 71 | # cluster.add_slurm_cmd( 72 | # cmd='cpus-per-task', value='1', comment='CPUS per task.') 73 | 74 | # Set job compute details (this will apply PER set of hyperparameters.) 75 | cluster.per_experiment_nb_gpus = 4 76 | cluster.per_experiment_nb_nodes = 2 77 | cluster.gpu_type = '1080ti' 78 | 79 | # Each hyperparameter combination will use 8 gpus. 80 | cluster.optimize_parallel_cluster_gpu( 81 | # Function to execute: 82 | train, 83 | # Number of hyperparameter combinations to search: 84 | nb_trials=24, 85 | # This is what will display in the slurm queue: 86 | job_name='first_tt_job') 87 | -------------------------------------------------------------------------------- /examples/hpc_cpu_example.py: -------------------------------------------------------------------------------- 1 | """Example launcher for a hyperparameter search on SLURM.""" 2 | from test_tube import Experiment, HyperOptArgumentParser, SlurmCluster 3 | 4 | 5 | def train(hparams, *args): 6 | """Train your awesome model. 7 | 8 | :param hparams: The arguments to run the model with. 9 | """ 10 | # Initialize experiments and track all the hyperparameters 11 | exp = Experiment( 12 | name=hparams.test_tube_exp_name, 13 | # Location to save the metrics. 14 | save_dir=hparams.log_path, 15 | # The experiment version is optional, but using the one 16 | # from SLURM means the exp will not collide with other 17 | # versions if SLURM runs multiple at once. 18 | version=hparams.hpc_exp_number, 19 | autosave=False, 20 | ) 21 | exp.argparse(hparams) 22 | 23 | # Pretend to train. 24 | x = hparams.x_val 25 | for train_step in range(0, 100): 26 | y = hparams.y_val 27 | out = x * y 28 | exp.log({'fake_err': out.item()}) # Log metrics. 29 | 30 | # Save exp when done. 31 | exp.save() 32 | 33 | 34 | if __name__ == '__main__': 35 | # Set up our argparser and make the y_val tunable. 36 | parser = HyperOptArgumentParser(strategy='random_search') 37 | parser.add_argument('--test_tube_exp_name', default='my_test') 38 | parser.add_argument('--log_path', default='/some/path/to/log') 39 | parser.opt_list('--y_val', 40 | default=12, options=[1, 2, 3, 4, 5, 6], tunable=True) 41 | parser.opt_list('--x_val', 42 | default=12, options=[20, 12, 30, 45], tunable=True) 43 | hyperparams = parser.parse_args() 44 | 45 | # Enable cluster training. 46 | cluster = SlurmCluster( 47 | hyperparam_optimizer=hyperparams, 48 | log_path=hyperparams.log_path, 49 | python_cmd='python3', 50 | test_tube_exp_name=hyperparams.test_tube_exp_name 51 | ) 52 | 53 | # Email results if your hpc supports it. 54 | cluster.notify_job_status( 55 | email='some@email.com', on_done=True, on_fail=True) 56 | 57 | # SLURM Module to load. 58 | cluster.load_modules([ 59 | 'python-3', 60 | 'anaconda3' 61 | ]) 62 | 63 | # Add commands to the non-SLURM portion. 64 | cluster.add_command('source activate myCondaEnv') 65 | 66 | # Add custom SLURM commands which show up as: 67 | # #comment 68 | # #SBATCH --cmd=value 69 | # ############ 70 | # cluster.add_slurm_cmd( 71 | # cmd='cpus-per-task', value='1', comment='CPUS per task.') 72 | 73 | # Set job compute details (this will apply PER set of hyperparameters.) 
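# With the settings below, each trial is allocated 10 nodes x 20 CPUs = 200 CPUs.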
74 | cluster.per_experiment_nb_cpus = 20 75 | cluster.per_experiment_nb_nodes = 10 76 | 77 | # Each hyperparameter combination will use 200 cpus. 78 | cluster.optimize_parallel_cluster_cpu( 79 | # Function to execute: 80 | train, 81 | # Number of hyperparameter combinations to search: 82 | nb_trials=24, 83 | job_name='first_tt_job', 84 | # This is what will display in the slurm queue: 85 | job_display_name='short_name') 86 | -------------------------------------------------------------------------------- /site/search/main.js: -------------------------------------------------------------------------------- 1 | function getSearchTermFromLocation() { 2 | var sPageURL = window.location.search.substring(1); 3 | var sURLVariables = sPageURL.split('&'); 4 | for (var i = 0; i < sURLVariables.length; i++) { 5 | var sParameterName = sURLVariables[i].split('='); 6 | if (sParameterName[0] == 'q') { 7 | return decodeURIComponent(sParameterName[1].replace(/\+/g, '%20')); 8 | } 9 | } 10 | } 11 | 12 | function joinUrl (base, path) { 13 | if (path.substring(0, 1) === "/") { 14 | // path starts with `/`. Thus it is absolute. 15 | return path; 16 | } 17 | if (base.substring(base.length-1) === "/") { 18 | // base ends with `/` 19 | return base + path; 20 | } 21 | return base + "/" + path; 22 | } 23 | 24 | function formatResult (location, title, summary) { 25 | return '

<article><h3><a href="' + joinUrl(base_url, location) + '">'+ title + '</a></h3><p>' + summary +'</p></article>

'; 26 | } 27 | 28 | function displayResults (results) { 29 | var search_results = document.getElementById("mkdocs-search-results"); 30 | while (search_results.firstChild) { 31 | search_results.removeChild(search_results.firstChild); 32 | } 33 | if (results.length > 0){ 34 | for (var i=0; i < results.length; i++){ 35 | var result = results[i]; 36 | var html = formatResult(result.location, result.title, result.summary); 37 | search_results.insertAdjacentHTML('beforeend', html); 38 | } 39 | } else { 40 | search_results.insertAdjacentHTML('beforeend', "

<p>No results found</p>

"); 41 | } 42 | } 43 | 44 | function doSearch () { 45 | var query = document.getElementById('mkdocs-search-query').value; 46 | if (query.length > 2) { 47 | if (!window.Worker) { 48 | displayResults(search(query)); 49 | } else { 50 | searchWorker.postMessage({query: query}); 51 | } 52 | } else { 53 | // Clear results for short queries 54 | displayResults([]); 55 | } 56 | } 57 | 58 | function initSearch () { 59 | var search_input = document.getElementById('mkdocs-search-query'); 60 | if (search_input) { 61 | search_input.addEventListener("keyup", doSearch); 62 | } 63 | var term = getSearchTermFromLocation(); 64 | if (term) { 65 | search_input.value = term; 66 | doSearch(); 67 | } 68 | } 69 | 70 | function onWorkerMessage (e) { 71 | if (e.data.allowSearch) { 72 | initSearch(); 73 | } else if (e.data.results) { 74 | var results = e.data.results; 75 | displayResults(results); 76 | } 77 | } 78 | 79 | if (!window.Worker) { 80 | console.log('Web Worker API not supported'); 81 | // load index in main thread 82 | $.getScript(joinUrl(base_url, "search/worker.js")).done(function () { 83 | console.log('Loaded worker'); 84 | init(); 85 | window.postMessage = function (msg) { 86 | onWorkerMessage({data: msg}); 87 | }; 88 | }).fail(function (jqxhr, settings, exception) { 89 | console.error('Could not load worker.js'); 90 | }); 91 | } else { 92 | // Wrap search in a web worker 93 | var searchWorker = new Worker(joinUrl(base_url, "search/worker.js")); 94 | searchWorker.postMessage({init: true}); 95 | searchWorker.onmessage = onWorkerMessage; 96 | } 97 | -------------------------------------------------------------------------------- /test_tube/hyper_opt_utils/strategies.py: -------------------------------------------------------------------------------- 1 | """Hyperparameter search strategies.""" 2 | import itertools 3 | import json 4 | import random 5 | 6 | 7 | def generate_trials(strategy, flat_params, nb_trials=None): 8 | r"""Generates the parameter combinations to search. 9 | 10 | Two search strategies are implemented: 11 | 1. `grid_search`: creates a search space that consists of the 12 | product of all flat_params. If `nb_trials` is specified 13 | the first `nb_trials` combinations are searched. 14 | 2. `random_search`: Creates random combinations of the 15 | hyperparameters. Can be used for a more efficient search. 16 | See (Bergstra and Bengio, 2012) for more details. 17 | 18 | :param strategy: The hyperparameter search to strategy. Can be 19 | one of: {`grid_search`, `random`}. 20 | :param flat_params: The hyperparameter arguments to iterate over. 21 | :param nb_trials: The number of hyperparameter combinations to try. 22 | Generates the parameter combinations for each requested trial 23 | :param strategy: 24 | :param flat_params: 25 | :param nb_trials: The number of trials to un. 26 | :return: 27 | """ 28 | if strategy == 'grid_search': 29 | trials = generate_grid_search_trials(flat_params, nb_trials) 30 | return trials 31 | elif strategy == 'random_search': 32 | trials = generate_random_search_trials(flat_params, nb_trials) 33 | return trials 34 | else: 35 | raise ValueError( 36 | ('Unknown strategy "{}". Must be one of ' 37 | '{{grid_search, random_search}}').format(strategy)) 38 | 39 | 40 | def generate_grid_search_trials(flat_params, nb_trials): 41 | """ 42 | Standard grid search. Takes the product of `flat_params` 43 | to generate the search space. 44 | 45 | :param params: The hyperparameters options to search. 
46 | :param nb_trials: Returns the first `nb_trials` from the 47 | combinations space. If this is None, all combinations 48 | are returned. 49 | :return: A dict containing the hyperparameters. 50 | """ 51 | trials = list(itertools.product(*flat_params)) 52 | if nb_trials: 53 | trials = trials[0:nb_trials] 54 | return trials 55 | 56 | 57 | def generate_random_search_trials(params, nb_trials): 58 | """ 59 | Generates random combination of hyperparameters to try. 60 | See (Bergstra and Bengio, 2012) for more details. 61 | 62 | :param params: The hyperparameters options to search. 63 | :param nb_trials: The number of trials to run. 64 | :return: A dict containing the hyperparameters. 65 | """ 66 | if nb_trials is None: 67 | raise TypeError( 68 | '`random_search` strategy requires nb_trails to be an int.') 69 | results = [] 70 | 71 | # ensures we have unique results 72 | seen_trials = set() 73 | 74 | # shuffle each param list 75 | potential_trials = 1 76 | for param in params: 77 | random.shuffle(param) 78 | potential_trials *= len(param) 79 | 80 | # we can't sample more trials than are possible 81 | max_iters = min(potential_trials, nb_trials) 82 | 83 | # then for the nb of trials requested, create a new param tuple 84 | # by picking a random integer at each param level 85 | while len(results) < max_iters: 86 | trial = [] 87 | for param in params: 88 | sampled_param = random.sample(param, 1)[0] 89 | trial.append(sampled_param) 90 | 91 | # verify this is a unique trial so we 92 | # don't duplicate work 93 | trial_str = json.dumps(trial) 94 | if trial_str not in seen_trials: 95 | seen_trials.add(trial_str) 96 | results.append(trial) 97 | 98 | return results 99 | -------------------------------------------------------------------------------- /site/js/theme.js: -------------------------------------------------------------------------------- 1 | $( document ).ready(function() { 2 | // Shift nav in mobile when clicking the menu. 3 | $(document).on('click', "[data-toggle='wy-nav-top']", function() { 4 | $("[data-toggle='wy-nav-shift']").toggleClass("shift"); 5 | $("[data-toggle='rst-versions']").toggleClass("shift"); 6 | }); 7 | 8 | // Close menu when you click a link. 9 | $(document).on('click', ".wy-menu-vertical .current ul li a", function() { 10 | $("[data-toggle='wy-nav-shift']").removeClass("shift"); 11 | $("[data-toggle='rst-versions']").toggleClass("shift"); 12 | }); 13 | 14 | // Keyboard navigation 15 | document.addEventListener("keydown", function(e) { 16 | var key = e.which || e.keyCode || window.event && window.event.keyCode; 17 | var page; 18 | switch (key) { 19 | case 78: // n 20 | page = $('[role="navigation"] a:contains(Next):first').prop('href'); 21 | break; 22 | case 80: // p 23 | page = $('[role="navigation"] a:contains(Previous):first').prop('href'); 24 | break; 25 | case 13: // enter 26 | if (e.target === document.getElementById('mkdocs-search-query')) { 27 | e.preventDefault(); 28 | } 29 | break; 30 | default: break; 31 | } 32 | if ($(e.target).is(':input')) { 33 | return true; 34 | } else if (page) { 35 | window.location.href = page; 36 | } 37 | }); 38 | 39 | $(document).on('click', "[data-toggle='rst-current-version']", function() { 40 | $("[data-toggle='rst-versions']").toggleClass("shift-up"); 41 | }); 42 | 43 | // Make tables responsive 44 | $("table.docutils:not(.field-list)").wrap("
"); 45 | 46 | $('table').addClass('docutils'); 47 | }); 48 | 49 | window.SphinxRtdTheme = (function (jquery) { 50 | var stickyNav = (function () { 51 | var navBar, 52 | win, 53 | stickyNavCssClass = 'stickynav', 54 | applyStickNav = function () { 55 | if (navBar.height() <= win.height()) { 56 | navBar.addClass(stickyNavCssClass); 57 | } else { 58 | navBar.removeClass(stickyNavCssClass); 59 | } 60 | }, 61 | enable = function () { 62 | applyStickNav(); 63 | win.on('resize', applyStickNav); 64 | }, 65 | init = function () { 66 | navBar = jquery('nav.wy-nav-side:first'); 67 | win = jquery(window); 68 | }; 69 | jquery(init); 70 | return { 71 | enable : enable 72 | }; 73 | }()); 74 | return { 75 | StickyNav : stickyNav 76 | }; 77 | }($)); 78 | 79 | // The code below is a copy of @seanmadsen code posted Jan 10, 2017 on issue 803. 80 | // https://github.com/mkdocs/mkdocs/issues/803 81 | // This just incorporates the auto scroll into the theme itself without 82 | // the need for additional custom.js file. 83 | // 84 | $(function() { 85 | $.fn.isFullyWithinViewport = function(){ 86 | var viewport = {}; 87 | viewport.top = $(window).scrollTop(); 88 | viewport.bottom = viewport.top + $(window).height(); 89 | var bounds = {}; 90 | bounds.top = this.offset().top; 91 | bounds.bottom = bounds.top + this.outerHeight(); 92 | return ( ! ( 93 | (bounds.top <= viewport.top) || 94 | (bounds.bottom >= viewport.bottom) 95 | ) ); 96 | }; 97 | if( $('li.toctree-l1.current').length && !$('li.toctree-l1.current').isFullyWithinViewport() ) { 98 | $('.wy-nav-side') 99 | .scrollTop( 100 | $('li.toctree-l1.current').offset().top - 101 | $('.wy-nav-side').offset().top - 102 | 60 103 | ); 104 | } 105 | }); 106 | -------------------------------------------------------------------------------- /site/search/worker.js: -------------------------------------------------------------------------------- 1 | var base_path = 'function' === typeof importScripts ? '.' 
: '/search/'; 2 | var allowSearch = false; 3 | var index; 4 | var documents = {}; 5 | var lang = ['en']; 6 | var data; 7 | 8 | function getScript(script, callback) { 9 | console.log('Loading script: ' + script); 10 | $.getScript(base_path + script).done(function () { 11 | callback(); 12 | }).fail(function (jqxhr, settings, exception) { 13 | console.log('Error: ' + exception); 14 | }); 15 | } 16 | 17 | function getScriptsInOrder(scripts, callback) { 18 | if (scripts.length === 0) { 19 | callback(); 20 | return; 21 | } 22 | getScript(scripts[0], function() { 23 | getScriptsInOrder(scripts.slice(1), callback); 24 | }); 25 | } 26 | 27 | function loadScripts(urls, callback) { 28 | if( 'function' === typeof importScripts ) { 29 | importScripts.apply(null, urls); 30 | callback(); 31 | } else { 32 | getScriptsInOrder(urls, callback); 33 | } 34 | } 35 | 36 | function onJSONLoaded () { 37 | data = JSON.parse(this.responseText); 38 | var scriptsToLoad = ['lunr.js']; 39 | if (data.config && data.config.lang && data.config.lang.length) { 40 | lang = data.config.lang; 41 | } 42 | if (lang.length > 1 || lang[0] !== "en") { 43 | scriptsToLoad.push('lunr.stemmer.support.js'); 44 | if (lang.length > 1) { 45 | scriptsToLoad.push('lunr.multi.js'); 46 | } 47 | for (var i=0; i < lang.length; i++) { 48 | if (lang[i] != 'en') { 49 | scriptsToLoad.push(['lunr', lang[i], 'js'].join('.')); 50 | } 51 | } 52 | } 53 | loadScripts(scriptsToLoad, onScriptsLoaded); 54 | } 55 | 56 | function onScriptsLoaded () { 57 | console.log('All search scripts loaded, building Lunr index...'); 58 | if (data.config && data.config.separator && data.config.separator.length) { 59 | lunr.tokenizer.separator = new RegExp(data.config.separator); 60 | } 61 | if (data.index) { 62 | index = lunr.Index.load(data.index); 63 | data.docs.forEach(function (doc) { 64 | documents[doc.location] = doc; 65 | }); 66 | console.log('Lunr pre-built index loaded, search ready'); 67 | } else { 68 | index = lunr(function () { 69 | if (lang.length === 1 && lang[0] !== "en" && lunr[lang[0]]) { 70 | this.use(lunr[lang[0]]); 71 | } else if (lang.length > 1) { 72 | this.use(lunr.multiLanguage.apply(null, lang)); // spread operator not supported in all browsers: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/Spread_operator#Browser_compatibility 73 | } 74 | this.field('title'); 75 | this.field('text'); 76 | this.ref('location'); 77 | 78 | for (var i=0; i < data.docs.length; i++) { 79 | var doc = data.docs[i]; 80 | this.add(doc); 81 | documents[doc.location] = doc; 82 | } 83 | }); 84 | console.log('Lunr index built, search ready'); 85 | } 86 | allowSearch = true; 87 | postMessage({allowSearch: allowSearch}); 88 | } 89 | 90 | function init () { 91 | var oReq = new XMLHttpRequest(); 92 | oReq.addEventListener("load", onJSONLoaded); 93 | var index_path = base_path + '/search_index.json'; 94 | if( 'function' === typeof importScripts ){ 95 | index_path = 'search_index.json'; 96 | } 97 | oReq.open("GET", index_path); 98 | oReq.send(); 99 | } 100 | 101 | function search (query) { 102 | if (!allowSearch) { 103 | console.error('Assets for search still loading'); 104 | return; 105 | } 106 | 107 | var resultDocuments = []; 108 | var results = index.search(query); 109 | for (var i=0; i < results.length; i++){ 110 | var result = results[i]; 111 | doc = documents[result.ref]; 112 | doc.summary = doc.text.substring(0, 200); 113 | resultDocuments.push(doc); 114 | } 115 | return resultDocuments; 116 | } 117 | 118 | if( 'function' === typeof importScripts ) 
{ 119 | onmessage = function (e) { 120 | if (e.data.init) { 121 | init(); 122 | } else if (e.data.query) { 123 | postMessage({ results: search(e.data.query) }); 124 | } else { 125 | console.error("Worker - Unrecognized message: " + e); 126 | } 127 | }; 128 | } 129 | -------------------------------------------------------------------------------- /docs/experiment_tracking/experiment.md: -------------------------------------------------------------------------------- 1 | # Experiment class API 2 | 3 | [[Github Code](https://github.com/williamFalcon/test-tube/blob/master/test_tube/log.py)] 4 | 5 | An Experiment holds metadata and the results of the training run, you 6 | can instantiate an `Experiment` via: 7 | 8 | ``` {.python} 9 | from test_tube import Experiment 10 | 11 | exp = Experiment(name='dense_model', 12 | debug=False, 13 | save_dir='/Desktop/test_tube') 14 | 15 | exp.tag({'learning_rate': 0.002, 'nb_layers': 2}) 16 | 17 | for step in training_steps: 18 | tng_err = model.eval(tng_x, tng_y) 19 | 20 | exp.log({'tng_err': tng_err}) 21 | 22 | # training complete! 23 | # all your logs and data are ready to be visualized at testtube.williamfalcon.com 24 | ``` 25 | 26 | ------------------------------------------------------------------------ 27 | 28 | ## init options 29 | 30 | ### version 31 | 32 | The same Experiment can have multiple versions. Test tube generates 33 | these automatically each time you run your model. To set your own 34 | version use: 35 | 36 | ``` {.python} 37 | exp = Experiment(name='dense_model',version=1) 38 | ``` 39 | 40 | ### debug 41 | 42 | If you're debugging and don't want to create a log file turn debug to 43 | True 44 | 45 | ``` {.python} 46 | exp = Experiment(name='dense_model',debug=True) 47 | ``` 48 | 49 | ### autosave 50 | 51 | If you only want to save at the end of training, turn autosave off: 52 | 53 | ``` {.python} 54 | exp = Experiment(name='dense_model', autosave=False) 55 | 56 | # run long training... 57 | 58 | # first time any logs are saved 59 | exp.save() 60 | ``` 61 | 62 | ### `create_git_tag` 63 | 64 | Ever wanted a flashback to your code when you ran an experiment? 
65 | Snapshot your code for this experiment using git tags: 66 | 67 | ``` {.python} 68 | exp = Experiment(name='dense_model', create_git_tag=True) 69 | ``` 70 | 71 | ------------------------------------------------------------------------ 72 | 73 | ## Methods 74 | 75 | ### tag 76 | 77 | ``` {.python} 78 | exp.tag({k: v}) 79 | ``` 80 | 81 | Adds an arbitrary dictionary of tags to the experiment 82 | 83 | **Example** 84 | 85 | ``` {.python} 86 | exp.tag({'dataset_name': 'imagenet_1', 'learning_rate': 0.0002}) 87 | ``` 88 | 89 | ### log 90 | 91 | ``` {.python} 92 | exp.log({k:v}) 93 | ``` 94 | 95 | Adds a row of data to the experiments 96 | 97 | **Example** 98 | 99 | ``` {.python} 100 | exp.log({'val_loss': 0.22, 'epoch_nb': 1, 'batch_nb': 12}) 101 | 102 | # you can also add other rows that have separate information 103 | exp.log({'tng_loss': 0.01}) 104 | 105 | # or even a numpy array image 106 | image = np.imread('image.png') 107 | exp.log({'fake_png': image}) 108 | ``` 109 | 110 | **Saving images Example** 111 | 112 | ``` {.python} 113 | # name must have either jpg, png or jpeg in it 114 | img = np.imread('a.jpg') 115 | exp.log('test_jpg': img, 'val_err': 0.2) 116 | 117 | # saves image to ../exp/version/media/test_0.jpg 118 | # csv has file path to that image in that cell 119 | ``` 120 | 121 | To save an image, add `jpg`, `png` or `jpeg` to the key corresponding 122 | with the image array. The image must be formatted the same as skimage's 123 | [imsave](http://scikit-image.org/docs/dev/api/skimage.io.html#skimage.io.imsave) 124 | function 125 | 126 | ### argparse 127 | 128 | ``` {.python} 129 | exp.argparse(hparams) 130 | ``` 131 | 132 | Transfers hyperparam information from Argparser or 133 | HyperOptArgumentParser 134 | 135 | **Example** 136 | 137 | ``` {.python} 138 | from test_tube import HyperOptArgumentParser 139 | 140 | # parse args 141 | parser = HyperOptArgumentParser() 142 | parser.add_argument('--learning_rate', default=0.002, type=float, help='the learning rate') 143 | hparams = parser.parse_args() 144 | 145 | # learning_rate is now a meta tag for your experiment 146 | exp.argparse(hparams) 147 | ``` 148 | 149 | ### save 150 | 151 | ``` {.python} 152 | exp.save() 153 | ``` 154 | 155 | Saves the exp to disk (including images) 156 | 157 | **Example** 158 | 159 | ``` {.python} 160 | exp = Experiment(name='dense_model', autosave=False) 161 | 162 | # run long training... 163 | 164 | # first time any logs are saved 165 | exp.save() 166 | ``` 167 | -------------------------------------------------------------------------------- /site/css/theme_extra.css: -------------------------------------------------------------------------------- 1 | /* 2 | * Sphinx doesn't have support for section dividers like we do in 3 | * MkDocs, this styles the section titles in the nav 4 | * 5 | * https://github.com/mkdocs/mkdocs/issues/175 6 | */ 7 | .wy-menu-vertical span { 8 | line-height: 18px; 9 | padding: 0.4045em 1.618em; 10 | display: block; 11 | position: relative; 12 | font-size: 90%; 13 | color: #838383; 14 | } 15 | 16 | .wy-menu-vertical .subnav a { 17 | padding: 0.4045em 2.427em; 18 | } 19 | 20 | /* 21 | * Long navigations run off the bottom of the screen as the nav 22 | * area doesn't scroll. 23 | * 24 | * https://github.com/mkdocs/mkdocs/pull/202 25 | * 26 | * Builds upon pull 202 https://github.com/mkdocs/mkdocs/pull/202 27 | * to make toc scrollbar end before navigations buttons to not be overlapping. 
28 | */ 29 | .wy-nav-side { 30 | height: calc(100% - 45px); 31 | overflow-y: auto; 32 | min-height: 0; 33 | } 34 | 35 | .rst-versions{ 36 | border-top: 0; 37 | height: 45px; 38 | } 39 | 40 | @media screen and (max-width: 768px) { 41 | .wy-nav-side { 42 | height: 100%; 43 | } 44 | } 45 | 46 | /* 47 | * readthedocs theme hides nav items when the window height is 48 | * too small to contain them. 49 | * 50 | * https://github.com/mkdocs/mkdocs/issues/#348 51 | */ 52 | .wy-menu-vertical ul { 53 | margin-bottom: 2em; 54 | } 55 | 56 | /* 57 | * Wrap inline code samples otherwise they shoot of the side and 58 | * can't be read at all. 59 | * 60 | * https://github.com/mkdocs/mkdocs/issues/313 61 | * https://github.com/mkdocs/mkdocs/issues/233 62 | * https://github.com/mkdocs/mkdocs/issues/834 63 | */ 64 | code { 65 | white-space: pre-wrap; 66 | word-wrap: break-word; 67 | padding: 2px 5px; 68 | } 69 | 70 | /** 71 | * Make code blocks display as blocks and give them the appropriate 72 | * font size and padding. 73 | * 74 | * https://github.com/mkdocs/mkdocs/issues/855 75 | * https://github.com/mkdocs/mkdocs/issues/834 76 | * https://github.com/mkdocs/mkdocs/issues/233 77 | */ 78 | pre code { 79 | white-space: pre; 80 | word-wrap: normal; 81 | display: block; 82 | padding: 12px; 83 | font-size: 12px; 84 | } 85 | 86 | /* 87 | * Fix link colors when the link text is inline code. 88 | * 89 | * https://github.com/mkdocs/mkdocs/issues/718 90 | */ 91 | a code { 92 | color: #2980B9; 93 | } 94 | a:hover code { 95 | color: #3091d1; 96 | } 97 | a:visited code { 98 | color: #9B59B6; 99 | } 100 | 101 | /* 102 | * The CSS classes from highlight.js seem to clash with the 103 | * ReadTheDocs theme causing some code to be incorrectly made 104 | * bold and italic. 105 | * 106 | * https://github.com/mkdocs/mkdocs/issues/411 107 | */ 108 | pre .cs, pre .c { 109 | font-weight: inherit; 110 | font-style: inherit; 111 | } 112 | 113 | /* 114 | * Fix some issues with the theme and non-highlighted code 115 | * samples. Without and highlighting styles attached the 116 | * formatting is broken. 117 | * 118 | * https://github.com/mkdocs/mkdocs/issues/319 119 | */ 120 | .no-highlight { 121 | display: block; 122 | padding: 0.5em; 123 | color: #333; 124 | } 125 | 126 | 127 | /* 128 | * Additions specific to the search functionality provided by MkDocs 129 | */ 130 | 131 | .search-results { 132 | margin-top: 23px; 133 | } 134 | 135 | .search-results article { 136 | border-top: 1px solid #E1E4E5; 137 | padding-top: 24px; 138 | } 139 | 140 | .search-results article:first-child { 141 | border-top: none; 142 | } 143 | 144 | form .search-query { 145 | width: 100%; 146 | border-radius: 50px; 147 | padding: 6px 12px; /* csslint allow: box-model */ 148 | border-color: #D1D4D5; 149 | } 150 | 151 | .wy-menu-vertical li ul { 152 | display: inherit; 153 | } 154 | 155 | .wy-menu-vertical li ul.subnav ul.subnav{ 156 | padding-left: 1em; 157 | } 158 | 159 | .wy-menu-vertical .subnav li.current > a { 160 | padding-left: 2.42em; 161 | } 162 | .wy-menu-vertical .subnav li.current > ul li a { 163 | padding-left: 3.23em; 164 | } 165 | 166 | /* 167 | * Improve inline code blocks within admonitions. 168 | * 169 | * https://github.com/mkdocs/mkdocs/issues/656 170 | */ 171 | .admonition code { 172 | color: #404040; 173 | border: 1px solid #c7c9cb; 174 | border: 1px solid rgba(0, 0, 0, 0.2); 175 | background: #f8fbfd; 176 | background: rgba(255, 255, 255, 0.7); 177 | } 178 | 179 | /* 180 | * Account for wide tables which go off the side. 
181 | * Override borders to avoid wierdness on narrow tables. 182 | * 183 | * https://github.com/mkdocs/mkdocs/issues/834 184 | * https://github.com/mkdocs/mkdocs/pull/1034 185 | */ 186 | .rst-content .section .docutils { 187 | width: 100%; 188 | overflow: auto; 189 | display: block; 190 | border: none; 191 | } 192 | 193 | td, th { 194 | border: 1px solid #e1e4e5 !important; /* csslint allow: important */ 195 | border-collapse: collapse; 196 | } 197 | 198 | -------------------------------------------------------------------------------- /site/404.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | Test tube Documentation 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 |
[mkdocs "readthedocs" theme page chrome: site navigation sidebar, breadcrumbs, a "404 / Page not found" notice, previous/next links, and a GitHub footer link]
-------------------------------------------------------------------------------- /site/search.html: --------------------------------------------------------------------------------
[mkdocs "readthedocs" theme search page titled "Test tube Documentation": site navigation sidebar, breadcrumbs, a "Search Results" section with a "Searching..." placeholder, previous/next links, and a GitHub footer link]
156 | 157 | 158 | 159 | 160 | 161 | 162 | -------------------------------------------------------------------------------- /docs/hyperparameter_optimization/HyperOptArgumentParser.md: -------------------------------------------------------------------------------- 1 | # HyperOptArgumentParser class API 2 | 3 | [[Github Code](https://github.com/williamFalcon/test-tube/blob/master/test_tube/argparse_hopt.py)] 4 | 5 | The HyperOptArgumentParser is a subclass of python's 6 | [argparse](https://docs.python.org/3/library/argparse.html), with added 7 | finctionality to change parameters on the fly as determined by a 8 | sampling strategy. 9 | 10 | You can instantiate an `HyperOptArgumentParser` via: 11 | 12 | ``` {.python} 13 | from test_tube import HyperOptArgumentParser 14 | 15 | # subclass of argparse 16 | parser = HyperOptArgumentParser(strategy='random_search') 17 | parser.add_argument('--learning_rate', default=0.002, type=float, help='the learning rate') 18 | 19 | # let's enable optimizing over the number of layers in the network 20 | parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8]) 21 | 22 | # and tune the number of units in each layer 23 | parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=10) 24 | 25 | # compile (because it's argparse underneath) 26 | hparams = parser.parse_args() 27 | 28 | # run 20 trials of random search over the hyperparams 29 | for hparam_trial in hparams.trials(20): 30 | train_network(hparam_trial) 31 | ``` 32 | 33 | ------------------------------------------------------------------------ 34 | 35 | ## init options 36 | 37 | ### `strategy` 38 | 39 | Use either [random 40 | search](http://www.jmlr.org/papers/volume13/bergstra12a/bergstra12a.pdf) 41 | or [grid 42 | search](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) 43 | for tuning: 44 | 45 | ``` {.python} 46 | parser = HyperOptArgumentParser(strategy='grid_search') 47 | ``` 48 | 49 | ------------------------------------------------------------------------ 50 | 51 | ## Methods 52 | 53 | All the functionality from argparse works but we've added the following 54 | functionality: 55 | 56 | ### `opt_list` 57 | 58 | ``` {.python} 59 | parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8]) 60 | ``` 61 | 62 | Enables searching over a list of values for this parameter. The tunable 63 | values ONLY replace the argparse values when running a hyperparameter 64 | optimization search. This is on purpose so your code doesn't have to 65 | change when you want to tune it. 66 | 67 | **Example** 68 | 69 | ``` {.python} 70 | parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8]) 71 | hparams = parser.parse_args() 72 | # hparams.nb_layers = 2 73 | 74 | for trial in hparams.trials(2): 75 | # trial.nb_layers is now a value in [2, 4, 8] 76 | # but hparams.nb_layers is still 2 77 | ``` 78 | 79 | ### `opt_range` 80 | 81 | ``` {.python} 82 | parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=8, log_base=None) 83 | ``` 84 | 85 | Enables searching over a range of values chosen randomly using the 86 | `nb_samples` given. The tunable values *only* replace the argparse 87 | values when running a hyperparameter optimization search. This is on 88 | purpose so your code doesn't have to change when you want to tune it. 
89 | 90 | If `log_base` is set to a positive number, it will randomly search over 91 | a log scale, where the log base is `log_base`. This is better for search 92 | over several orders of magnitude efficiently. 93 | 94 | **Example** 95 | 96 | ``` {.python} 97 | parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=8) 98 | hparams = parser.parse_args() 99 | # hparams.neurons = 50 100 | 101 | for trial in hparams.trials(2): 102 | # trial.nb_layers is now a value in [100, 200, 300, 400, 500, 600 700, 800] 103 | # but hparams.neurons is still 50 104 | ``` 105 | 106 | ### `json_config` 107 | 108 | ``` {.python} 109 | parser.json_config('--config', default='example.json') 110 | ``` 111 | 112 | Replaces default values in the parser with those read from the json file 113 | 114 | **Example** 115 | 116 | *example.json* 117 | 118 | ``` {.json} 119 | { 120 | "learning_rate": 200 121 | } 122 | ``` 123 | 124 | ``` {.python} 125 | parser.add_argument('--learning_rate', default=0.002, type=float, help='the learning rate') 126 | parser.json_config('--config', default='example.json') 127 | hparams = parser.parse_args() 128 | 129 | # hparams.learning_rate = 200 130 | ``` 131 | 132 | ### trials 133 | 134 | ``` {.python} 135 | trial_generator = hparams.trials(2) 136 | ``` 137 | 138 | Computes the trials needed for these experiments and serves them via a 139 | generator 140 | 141 | **Example** 142 | 143 | ``` {.python} 144 | hparams = parser.parse_args() 145 | for trial_hparams in hparams.trials(2): 146 | # trial_hparams now has values sampled from the training routine 147 | ``` 148 | 149 | ### `optimize_parallel_gpu` 150 | 151 | ``` {.python} 152 | hparams = parser.parse_args() 153 | hparams.optimize_parallel_gpu(function_to_optimize, gpu_ids=['1', '0, 2']) 154 | ``` 155 | 156 | Parallelize the trials across `nb_workers` processes. Auto assign the 157 | correct gpus. Argument passed into the `function_to_optimize` is the 158 | `trial_params` argument and the gpu_ids. 159 | 160 | **Example** 161 | 162 | ``` {.python} 163 | # parallelize tuning on 2 gpus 164 | # this will place each trial in n into a given gpu 165 | def train_main(trial_params, gpu_ids): 166 | # train your model, etc here... 167 | 168 | hparams = parser.parse_args() 169 | hparams.optimize_parallel_gpu(train_main, gpu_ids=['1', '0, 2']) 170 | 171 | # at the end of the optimize_parallel function, all 20 trials will be completed 172 | # in this case by running 10 sets of 2 trials in parallel 173 | ``` 174 | 175 | ### `optimize_parallel_cpu` 176 | 177 | ``` {.python} 178 | hparams = parser.parse_args() 179 | hparams.optimize_parallel_cpu(function_to_optimize, nb_trials=20, nb_workers=2) 180 | ``` 181 | 182 | Parallelize the trials across `nb_workers` cpus. Argument passed into 183 | the `function_to_optimize` is the `trial_params` argument. 184 | 185 | **Example** 186 | 187 | ``` {.python} 188 | # parallelize tuning on 2 cpus 189 | # this will place each trial in n into a given gpu 190 | def train_main(trial_params): 191 | # train your model, etc here... 
192 | 193 | hparams = parser.parse_args() 194 | hparams.optimize_parallel_cpu(train_main, nb_trials=20, nb_workers=2) 195 | 196 | # at the end of the optimize_parallel function, all 20 trials will be completed 197 | # in this case by running 10 sets of 2 trials in parallel 198 | ``` 199 | -------------------------------------------------------------------------------- /test_tube/hyperopt.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import json 3 | import random 4 | 5 | 6 | class HyperParamOptimizer(object): 7 | 8 | def __init__(self, method='grid_search', enabled=True, experiment=None): 9 | """ 10 | :param method: 'grid_search', 'random_search' 11 | :param enabled: 12 | """ 13 | self.method = method 14 | self.enabled = enabled 15 | self.experiment = experiment 16 | self.seen_params = {} 17 | self.current_iteration = 0 18 | 19 | # the params to use at each trial 20 | self.trials = None 21 | 22 | # total iterations we're doing 23 | self.nb_iterations = None 24 | 25 | # details about each param 26 | self.params = [] 27 | 28 | # ----------------------------- 29 | # PARAMETER CHOICES 30 | # ----------------------------- 31 | def tune_uniform(self, low, high, samples, default, name): 32 | # how this fx samples for the data 33 | def gen_samples(): 34 | vals = [random.uniform(low, high) for i in range(samples)] 35 | return vals 36 | 37 | return self.__resolve_param(gen_samples, default, name) 38 | 39 | def tune_odds(self, low, high, default, name): 40 | start = low if low %2 != 0 else low + 1 41 | def gen_samples(): 42 | return range(start, high+1, 2) 43 | 44 | return self.__resolve_param(gen_samples, default, name) 45 | 46 | def tune_evens(self, low, high, default, name): 47 | start = low if low %2 == 0 else low + 1 48 | def gen_samples(): 49 | return range(start, high+1, 2) 50 | 51 | return self.__resolve_param(gen_samples, default, name) 52 | 53 | def tune_choice(self, options, default, name): 54 | def gen_samples(): 55 | return options 56 | 57 | return self.__resolve_param(gen_samples, default, name) 58 | 59 | def __resolve_param(self, gen_fx, default, name): 60 | # case when no action was requested 61 | if not self.enabled: 62 | return default 63 | 64 | # create the param when it's new 65 | # return the first value in this case 66 | if name not in self.seen_params: 67 | vals = gen_fx() 68 | param = {'vals': vals, 'name': name} 69 | self.seen_params[name] = {'idx': len(self.params)} 70 | self.params.append(param) 71 | return vals[0] 72 | 73 | # not the first iteration so return the ith element 74 | # in the possible values 75 | iteration_params = self.trials[self.current_iteration] 76 | param_i = self.seen_params[name]['idx'] 77 | param = iteration_params[param_i] 78 | return param['val'] 79 | 80 | # ----------------------------- 81 | # OPTIMIZATION 82 | # ----------------------------- 83 | def optimize(self, fx, nb_iterations=None): 84 | """ 85 | Primary entry point into the optimization 86 | :param fx: 87 | :param nb_iterations: 88 | :return: 89 | """ 90 | self.nb_iterations = nb_iterations 91 | 92 | # run first iteration 93 | result = fx(self) 94 | 95 | # log if requested 96 | if self.experiment is not None: 97 | result['hypo_iter_nb'] = self.current_iteration 98 | self.experiment.log(result) 99 | 100 | self.current_iteration += 1 101 | 102 | # generate the rest of the training seq 103 | # we couldn't do this before because we don't know 104 | # how many params the user needed 105 | self.__generate_trials() 106 | 107 | # run 
trials for the rest of the iterations 108 | # we either know the iterations or they're 109 | # calculated from the strategy used 110 | for i in range(1, len(self.trials)): 111 | result = fx(self) 112 | result['hypo_iter_nb'] = self.current_iteration 113 | 114 | # log if requested 115 | if self.experiment is not None: 116 | self.experiment.log(result) 117 | 118 | self.current_iteration += 1 119 | 120 | # ----------------------------- 121 | # INTERFACE WITH LOGGER 122 | # ----------------------------- 123 | def get_current_trial_meta(self): 124 | meta_results = [] 125 | 126 | # when we have trials, means we've already done 1 run 127 | # we can just get the params that are about to be run 128 | # otherwise we need to infer params from the current param list 129 | # this assumes the user feeds the opt into the experiment after 130 | # they're done setting up the params 131 | is_first_trial = self.trials is not None and len(self.trials) > 0 132 | if is_first_trial: 133 | trial_params = self.trials[self.current_iteration] 134 | for trial_param in trial_params: 135 | root_param = self.params[trial_param['idx']] 136 | meta_results.append({'hypo_' + root_param['name']: trial_param['val']}) 137 | 138 | # if we haven't done a pass through the data yet, 139 | # we need to infer from the params in the list 140 | else: 141 | for param in self.params: 142 | meta_results.append({'hypo_' + param['name']: param['vals'][0]}) 143 | 144 | # add shared meta 145 | meta_results.append({'hypo_iter_nb': self.current_iteration}) 146 | return meta_results 147 | 148 | # ----------------------------- 149 | # TRIALS HELPER 150 | # ----------------------------- 151 | def __generate_trials(self): 152 | """ 153 | Generates the parameter combinations for each requested trial 154 | :return: 155 | """ 156 | flat_params = self.__flatten_params(self.params) 157 | 158 | # permute for grid search 159 | if self.method == 'grid_search': 160 | self.trials = list(itertools.product(*flat_params)) 161 | 162 | if self.nb_iterations is not None: 163 | self.trials = self.trials[0: self.nb_iterations] 164 | 165 | if self.method == 'random_search': 166 | self.trials = self.__generate_random_search_trials(flat_params) 167 | 168 | def __flatten_params(self, params): 169 | """ 170 | Turns a list of parameters with values into a flat tuple list of lists 171 | so we can permute 172 | :param params: 173 | :return: 174 | """ 175 | flat_params = [] 176 | for i, param in enumerate(params): 177 | param_groups = [] 178 | for val in param['vals']: 179 | param_groups.append({'idx': i, 'val': val}) 180 | flat_params.append(param_groups) 181 | return flat_params 182 | 183 | def __generate_random_search_trials(self, params): 184 | results = [] 185 | 186 | # ensures we have unique results 187 | seen_trials = set() 188 | 189 | # shuffle each param list 190 | potential_trials = 1 191 | for p in params: 192 | random.shuffle(p) 193 | potential_trials *= len(p) 194 | 195 | # we can't sample more trials than are possible 196 | max_iters = min(potential_trials, self.nb_iterations) 197 | 198 | # then for the nb of trials requested, create a new param tuple 199 | # by picking a random integer at each param level 200 | while len(results) < max_iters: 201 | trial = [] 202 | for param in params: 203 | p = random.sample(param, 1)[0] 204 | trial.append(p) 205 | 206 | # verify this is a unique trial so we 207 | # don't duplicate work 208 | trial_str = json.dumps(trial) 209 | if trial_str not in seen_trials: 210 | seen_trials.add(trial_str) 211 | results.append(trial) 212 
| 213 | return results 214 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | react-router 4 | 5 |

6 |

7 | Test Tube 8 |

9 |

10 | Log, organize and parallelize hyperparameter search for Deep Learning experiments 11 |

12 |

13 | PyPI version 14 | 15 | 16 | 17 |

18 | 19 | ## Docs 20 | 21 | **[View the docs here](https://williamfalcon.github.io/test-tube/)** 22 | 23 | --- 24 | 25 | Test tube is a python library to track and parallelize hyperparameter 26 | search for Deep Learning and ML experiments. It's framework agnostic and 27 | built on top of the python argparse API for ease of use. 28 | 29 | ``` {.bash} 30 | pip install test_tube 31 | ``` 32 | 33 | --- 34 | 35 | ### Main test-tube uses 36 | 37 | - [Parallelize hyperparameter 38 | optimization](https://williamfalcon.github.io/test-tube/hyperparameter_optimization/HyperOptArgumentParser/) 39 | (across multiple gpus or cpus). 40 | - [Parallelize hyperparameter 41 | optimization](https://williamfalcon.github.io/test-tube/hyperparameter_optimization/HyperOptArgumentParser/) 42 | across HPC cluster using SLURM. 43 | - Log experiment hyperparameters and experiment data. 44 | [Experiments](https://williamfalcon.github.io/test-tube/experiment_tracking/experiment/) 45 | across models. 46 | - Visualize with [tensorboard](https://www.tensorflow.org/guide/summaries_and_tensorboard) 47 | 48 | Compatible with any Python ML library like Tensorflow, Keras, Pytorch, Caffe, Caffe2, Chainer, MXNet, Theano, Scikit-learn 49 | 50 | --- 51 | ### Examples 52 | The Experiment object is a subclass of PyTorch's SummaryWriter. 53 | 54 | **Log and visualize with Tensorboard** 55 | 56 | ```{.python} 57 | from test_tube import Experiment 58 | import torch 59 | import numpy as np 60 | exp = Experiment('/some/path') 61 | exp.tag({'learning_rate': 0.02, 'layers': 4}) 62 | 63 | # exp is a subclass of SummaryWriter, so tensorboard methods work directly 64 | features = torch.Tensor(100, 784) 65 | exp.add_embedding(features, metadata=list(range(100)), label_img=torch.rand(100, 1, 28, 28)) 66 | 67 | # simulate training 68 | for n_iter in range(2000): 69 |     exp.log({'testtt': n_iter * np.sin(n_iter)}) 70 | 71 | # save and close 72 | exp.save() 73 | exp.close() 74 | ``` 75 | 76 | ```{.bash} 77 | pip install tensorflow 78 | 79 | tensorboard --logdir /some/path 80 | ``` 81 | 82 | **Run grid search on SLURM GPU cluster** 83 | 84 | ``` {.python} 85 | from test_tube.hpc import SlurmCluster 86 | 87 | # hyperparameters is a test-tube hyper params object 88 | hyperparams = args.parse() 89 | 90 | # init cluster 91 | cluster = SlurmCluster( 92 |     hyperparam_optimizer=hyperparams, 93 |     log_path='/path/to/log/results/to', 94 |     python_cmd='python3' 95 | ) 96 | 97 | # let the cluster know where to email for a change in job status (ie: complete, fail, etc...) 98 | cluster.notify_job_status(email='some@email.com', on_done=True, on_fail=True) 99 | 100 | # set the job options. In this instance, we'll run 20 different models 101 | # each with its own set of hyperparameters giving each one 1 GPU (ie: taking up 20 GPUs) 102 | cluster.per_experiment_nb_gpus = 1 103 | cluster.per_experiment_nb_nodes = 1 104 | 105 | # run the models on the cluster 106 | cluster.optimize_parallel_cluster_gpu(train, nb_trials=20, job_name='first_tt_batch', job_display_name='my_batch') 107 | 108 | # we just ran 20 different hyperparameters on 20 GPUs in the HPC cluster!!
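# --- sketch, not in the original README: the `train` function handed to the cluster above
# --- must accept the trial's hyperparameters plus the cluster object, per the SlurmCluster
# --- docs later in this repo. It is shown here only to document the signature; in a real
# --- script it would be defined before the optimize_parallel_cluster_gpu(...) call.
def train(hparams, cluster, return_dict):
    # build and fit your model with this trial's hparams here
    pass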
109 | ``` 110 | 111 | **Optimize hyperparameters across GPUs** 112 | 113 | ``` {.python} 114 | from test_tube import HyperOptArgumentParser 115 | 116 | # subclass of argparse 117 | parser = HyperOptArgumentParser(strategy='random_search') 118 | parser.add_argument('--learning_rate', default=0.002, type=float, help='the learning rate') 119 | 120 | # let's enable optimizing over the number of layers in the network 121 | parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8]) 122 | 123 | # and tune the number of units in each layer 124 | parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=10) 125 | 126 | # compile (because it's argparse underneath) 127 | hparams = parser.parse_args() 128 | 129 | # optimize across 4 gpus 130 | # use 2 gpus together and the other two separately 131 | hparams.optimize_parallel_gpu(MyModel.fit, gpu_ids=['1', '2,3', '0'], max_nb_trials=192) 132 | ``` 133 | 134 | Or... across CPUs 135 | 136 | ``` {.python} 137 | hparams.optimize_parallel_cpu(MyModel.fit, nb_trials=192, nb_workers=12) 138 | ``` 139 | 140 | You can also optimize on a *log* scale to allow better search over 141 | magnitudes of hyperparameter values, with a chosen base (disabled by 142 | default). Keep in mind that the range you search over must be strictly 143 | positive. 144 | 145 | ``` {.python} 146 | from test_tube import HyperOptArgumentParser 147 | 148 | # subclass of argparse 149 | parser = HyperOptArgumentParser(strategy='random_search') 150 | 151 | # Randomly searches over the (log-transformed) range [100,800). 152 | 153 | parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=10, log_base=10) 154 | 155 | 156 | # compile (because it's argparse underneath) 157 | hparams = parser.parse_args() 158 | 159 | # run 20 trials of random search over the hyperparams 160 | for hparam_trial in hparams.trials(20): 161 |     train_network(hparam_trial) 162 | ``` 163 | 164 | ### Convert your argparse params into searchable params by changing 1 line 165 | 166 | ``` {.python} 167 | import argparse 168 | from test_tube import HyperOptArgumentParser 169 | 170 | # these lines are equivalent 171 | parser = argparse.ArgumentParser(description='Process some integers.') 172 | parser = HyperOptArgumentParser(description='Process some integers.', strategy='grid_search') 173 | 174 | # do normal argparse stuff 175 | ... 176 | ``` 177 | 178 | ### Log images inline with metrics 179 | 180 | ``` {.python} 181 | # name must have either jpg, png or jpeg in it 182 | img = imageio.imread('a.jpg')  # any reader that returns a numpy array works 183 | exp.log({'test_jpg': img, 'val_err': 0.2}) 184 | 185 | # saves image to ../exp/version/media/test_0.jpg 186 | # csv has file path to that image in that cell 187 | ``` 188 | 189 | ## Demos 190 | 191 | - [Hyperparameter optimization for PyTorch across 20 cluster GPUs](https://github.com/williamFalcon/test-tube/blob/master/examples/pytorch_hpc_example.py) 192 | - [Hyperparameter optimization across 20 cluster CPUs](https://github.com/williamFalcon/test-tube/blob/master/examples/hpc_cpu_example.py) 193 | - [Experiments and hyperparameter optimization for tensorflow across 4 GPUs simultaneously](https://github.com/williamFalcon/test-tube/blob/master/examples/tensorflow_example.py) 194 | 195 | ## How to contribute 196 | 197 | Feel free to fix bugs and make improvements! 1.
Check out the [current 198 | bugs here](https://github.com/williamFalcon/test-tube/issues) or 199 | [feature 200 | requests](https://github.com/williamFalcon/test-tube/projects/1). 2. To 201 | work on a bug or feature, head over to our [project 202 | page](https://github.com/williamFalcon/test-tube/projects/1) and assign 203 | yourself the bug. 3. We'll add contributor names periodically as people 204 | improve the library! 205 | 206 | ## Bibtex 207 | 208 | To cite the framework use: 209 | 210 | @misc{Falcon2017, 211 | author = {Falcon, W.A.}, 212 | title = {Test Tube}, 213 | year = {2017}, 214 | publisher = {GitHub}, 215 | journal = {GitHub repository}, 216 | howpublished = {\url{https://github.com/williamfalcon/test-tube}} 217 | } 218 | 219 | ## License 220 | In addition to the terms outlined in the license, this software is U.S. Patent Pending. 221 | -------------------------------------------------------------------------------- /site/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | Test Tube: Easily log and tune Deep Learning experiments - Test tube Documentation 12 | 13 | 14 | 15 | 16 | 17 | 18 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 |
35 | 36 | 37 | 105 | 106 |
107 | 108 | 109 | 113 | 114 | 115 |
116 |
117 |
118 |
    119 |
  • Docs »
  • 120 | 121 | 122 | 123 |
  • Test Tube: Easily log and tune Deep Learning experiments
  • 124 |
  • 125 | 126 | Edit on GitHub 128 | 129 |
  • 130 |
131 |
132 |
133 |
134 |
135 | 136 |

Test Tube: Easily log and tune Deep Learning experiments

137 |

Test Tube allows you to easily log metadata and track your machine 138 | learning experiments.

139 |

Use Test Tube if you need to:

140 |
    141 |
  • Track many Experiments across 142 | models.
  • 143 |
  • Visualize and compare different 144 | experiments without uploading anywhere.
  • 145 |
  • Optimize your 146 | hyperparameters 147 | using grid search or random search.
  • 148 |
  • Automatically track ALL parameters for a particular training run.
  • 149 |
150 |

Test Tube is compatible with: Python 2 and 3

151 |

Getting started

152 |
153 |

Create an Experiment

154 |
from test_tube import Experiment
155 | 
156 | exp = Experiment(name='dense_model',
157 |                  debug=False,
158 |                  save_dir='/Desktop/test_tube')
159 | 
160 | exp.tag({'learning_rate': 0.002, 'nb_layers': 2})
161 | 
162 | for step in training_steps:
163 |     tng_err = model.eval(tng_x, tng_y)
164 | 
165 |     exp.log({'tng_err': tng_err})
166 | 
167 | # training complete!
168 | # all your logs and data are ready to be visualized at testtube.williamfalcon.com
169 | 
170 | 171 |
172 |
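A small sketch (not part of the original docs) connecting this Experiment snippet to the hyperparameter parser shown in the next section: arguments parsed with a HyperOptArgumentParser can be attached to an Experiment with exp.argparse, so every run records its settings automatically. The names and save path below are illustrative.

``` {.python}
from test_tube import Experiment, HyperOptArgumentParser

parser = HyperOptArgumentParser(strategy='random_search')
parser.add_argument('--learning_rate', default=0.002, type=float)
hparams = parser.parse_args()

exp = Experiment(name='dense_model', save_dir='/Desktop/test_tube')

# every parsed argument becomes a meta tag of this experiment version
exp.argparse(hparams)
exp.save()
```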

Optimize your hyperparameters

173 |
from test_tube import HyperOptArgumentParser
174 | 
175 | # subclass of argparse
176 | parser = HyperOptArgumentParser(strategy='random_search')
177 | parser.add_argument('--learning_rate', default=0.002, type=float, help='the learning rate')
178 | 
179 | # let's enable optimizing over the number of layers in the network
180 | parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8])
181 | 
182 | # and tune the number of units in each layer
183 | parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=10)
184 | 
185 | # compile (because it's argparse underneath)
186 | hparams = parser.parse_args()
187 | 
188 | # run 20 trials of random search over the hyperparams
189 | for hparam_trial in hparams.trials(20):
190 |     train_network(hparam_trial)
191 | 
192 | 193 |
194 |

Visualize

195 |
import pandas as pd
196 | import matplotlib
197 | 
198 | # each experiment is saved to a metrics.csv file which can be imported anywhere
199 | # images save to exp/version/media
200 | df = pd.read_csv('../some/dir/test_tube_data/dense_model/version_0/metrics.csv')
201 | df.tng_err.plot()
202 | 
203 | 204 |
205 |
206 | 225 | 226 |
227 |
228 | 229 |
230 | 231 |
232 | 233 |
234 | 235 | 236 | GitHub 237 | 238 | 239 | 240 | Next » 241 | 242 | 243 |
244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 255 | -------------------------------------------------------------------------------- /site/js/modernizr-2.8.3.min.js: -------------------------------------------------------------------------------- 1 | window.Modernizr=function(e,t,n){function r(e){b.cssText=e}function o(e,t){return r(S.join(e+";")+(t||""))}function a(e,t){return typeof e===t}function i(e,t){return!!~(""+e).indexOf(t)}function c(e,t){for(var r in e){var o=e[r];if(!i(o,"-")&&b[o]!==n)return"pfx"==t?o:!0}return!1}function s(e,t,r){for(var o in e){var i=t[e[o]];if(i!==n)return r===!1?e[o]:a(i,"function")?i.bind(r||t):i}return!1}function u(e,t,n){var r=e.charAt(0).toUpperCase()+e.slice(1),o=(e+" "+k.join(r+" ")+r).split(" ");return a(t,"string")||a(t,"undefined")?c(o,t):(o=(e+" "+T.join(r+" ")+r).split(" "),s(o,t,n))}function l(){p.input=function(n){for(var r=0,o=n.length;o>r;r++)j[n[r]]=!!(n[r]in E);return j.list&&(j.list=!(!t.createElement("datalist")||!e.HTMLDataListElement)),j}("autocomplete autofocus list placeholder max min multiple pattern required step".split(" ")),p.inputtypes=function(e){for(var r,o,a,i=0,c=e.length;c>i;i++)E.setAttribute("type",o=e[i]),r="text"!==E.type,r&&(E.value=x,E.style.cssText="position:absolute;visibility:hidden;",/^range$/.test(o)&&E.style.WebkitAppearance!==n?(g.appendChild(E),a=t.defaultView,r=a.getComputedStyle&&"textfield"!==a.getComputedStyle(E,null).WebkitAppearance&&0!==E.offsetHeight,g.removeChild(E)):/^(search|tel)$/.test(o)||(r=/^(url|email)$/.test(o)?E.checkValidity&&E.checkValidity()===!1:E.value!=x)),P[e[i]]=!!r;return P}("search tel url email datetime date month week time datetime-local number range color".split(" "))}var d,f,m="2.8.3",p={},h=!0,g=t.documentElement,v="modernizr",y=t.createElement(v),b=y.style,E=t.createElement("input"),x=":)",w={}.toString,S=" -webkit- -moz- -o- -ms- ".split(" "),C="Webkit Moz O ms",k=C.split(" "),T=C.toLowerCase().split(" "),N={svg:"http://www.w3.org/2000/svg"},M={},P={},j={},$=[],D=$.slice,F=function(e,n,r,o){var a,i,c,s,u=t.createElement("div"),l=t.body,d=l||t.createElement("body");if(parseInt(r,10))for(;r--;)c=t.createElement("div"),c.id=o?o[r]:v+(r+1),u.appendChild(c);return a=["­",'"].join(""),u.id=v,(l?u:d).innerHTML+=a,d.appendChild(u),l||(d.style.background="",d.style.overflow="hidden",s=g.style.overflow,g.style.overflow="hidden",g.appendChild(d)),i=n(u,e),l?u.parentNode.removeChild(u):(d.parentNode.removeChild(d),g.style.overflow=s),!!i},z=function(t){var n=e.matchMedia||e.msMatchMedia;if(n)return n(t)&&n(t).matches||!1;var r;return F("@media "+t+" { #"+v+" { position: absolute; } }",function(t){r="absolute"==(e.getComputedStyle?getComputedStyle(t,null):t.currentStyle).position}),r},A=function(){function e(e,o){o=o||t.createElement(r[e]||"div"),e="on"+e;var i=e in o;return i||(o.setAttribute||(o=t.createElement("div")),o.setAttribute&&o.removeAttribute&&(o.setAttribute(e,""),i=a(o[e],"function"),a(o[e],"undefined")||(o[e]=n),o.removeAttribute(e))),o=null,i}var r={select:"input",change:"input",submit:"form",reset:"form",error:"img",load:"img",abort:"img"};return e}(),L={}.hasOwnProperty;f=a(L,"undefined")||a(L.call,"undefined")?function(e,t){return t in e&&a(e.constructor.prototype[t],"undefined")}:function(e,t){return L.call(e,t)},Function.prototype.bind||(Function.prototype.bind=function(e){var t=this;if("function"!=typeof t)throw new TypeError;var n=D.call(arguments,1),r=function(){if(this instanceof r){var o=function(){};o.prototype=t.prototype;var a=new 
o,i=t.apply(a,n.concat(D.call(arguments)));return Object(i)===i?i:a}return t.apply(e,n.concat(D.call(arguments)))};return r}),M.flexbox=function(){return u("flexWrap")},M.flexboxlegacy=function(){return u("boxDirection")},M.canvas=function(){var e=t.createElement("canvas");return!(!e.getContext||!e.getContext("2d"))},M.canvastext=function(){return!(!p.canvas||!a(t.createElement("canvas").getContext("2d").fillText,"function"))},M.webgl=function(){return!!e.WebGLRenderingContext},M.touch=function(){var n;return"ontouchstart"in e||e.DocumentTouch&&t instanceof DocumentTouch?n=!0:F(["@media (",S.join("touch-enabled),("),v,")","{#modernizr{top:9px;position:absolute}}"].join(""),function(e){n=9===e.offsetTop}),n},M.geolocation=function(){return"geolocation"in navigator},M.postmessage=function(){return!!e.postMessage},M.websqldatabase=function(){return!!e.openDatabase},M.indexedDB=function(){return!!u("indexedDB",e)},M.hashchange=function(){return A("hashchange",e)&&(t.documentMode===n||t.documentMode>7)},M.history=function(){return!(!e.history||!history.pushState)},M.draganddrop=function(){var e=t.createElement("div");return"draggable"in e||"ondragstart"in e&&"ondrop"in e},M.websockets=function(){return"WebSocket"in e||"MozWebSocket"in e},M.rgba=function(){return r("background-color:rgba(150,255,150,.5)"),i(b.backgroundColor,"rgba")},M.hsla=function(){return r("background-color:hsla(120,40%,100%,.5)"),i(b.backgroundColor,"rgba")||i(b.backgroundColor,"hsla")},M.multiplebgs=function(){return r("background:url(https://),url(https://),red url(https://)"),/(url\s*\(.*?){3}/.test(b.background)},M.backgroundsize=function(){return u("backgroundSize")},M.borderimage=function(){return u("borderImage")},M.borderradius=function(){return u("borderRadius")},M.boxshadow=function(){return u("boxShadow")},M.textshadow=function(){return""===t.createElement("div").style.textShadow},M.opacity=function(){return o("opacity:.55"),/^0.55$/.test(b.opacity)},M.cssanimations=function(){return u("animationName")},M.csscolumns=function(){return u("columnCount")},M.cssgradients=function(){var e="background-image:",t="gradient(linear,left top,right bottom,from(#9f9),to(white));",n="linear-gradient(left top,#9f9, white);";return r((e+"-webkit- ".split(" ").join(t+e)+S.join(n+e)).slice(0,-e.length)),i(b.backgroundImage,"gradient")},M.cssreflections=function(){return u("boxReflect")},M.csstransforms=function(){return!!u("transform")},M.csstransforms3d=function(){var e=!!u("perspective");return e&&"webkitPerspective"in g.style&&F("@media (transform-3d),(-webkit-transform-3d){#modernizr{left:9px;position:absolute;height:3px;}}",function(t){e=9===t.offsetLeft&&3===t.offsetHeight}),e},M.csstransitions=function(){return u("transition")},M.fontface=function(){var e;return F('@font-face {font-family:"font";src:url("https://")}',function(n,r){var o=t.getElementById("smodernizr"),a=o.sheet||o.styleSheet,i=a?a.cssRules&&a.cssRules[0]?a.cssRules[0].cssText:a.cssText||"":"";e=/src/i.test(i)&&0===i.indexOf(r.split(" ")[0])}),e},M.generatedcontent=function(){var e;return F(["#",v,"{font:0/0 a}#",v,':after{content:"',x,'";visibility:hidden;font:3px/1 a}'].join(""),function(t){e=t.offsetHeight>=3}),e},M.video=function(){var e=t.createElement("video"),n=!1;try{(n=!!e.canPlayType)&&(n=new Boolean(n),n.ogg=e.canPlayType('video/ogg; codecs="theora"').replace(/^no$/,""),n.h264=e.canPlayType('video/mp4; codecs="avc1.42E01E"').replace(/^no$/,""),n.webm=e.canPlayType('video/webm; codecs="vp8, vorbis"').replace(/^no$/,""))}catch(r){}return 
n},M.audio=function(){var e=t.createElement("audio"),n=!1;try{(n=!!e.canPlayType)&&(n=new Boolean(n),n.ogg=e.canPlayType('audio/ogg; codecs="vorbis"').replace(/^no$/,""),n.mp3=e.canPlayType("audio/mpeg;").replace(/^no$/,""),n.wav=e.canPlayType('audio/wav; codecs="1"').replace(/^no$/,""),n.m4a=(e.canPlayType("audio/x-m4a;")||e.canPlayType("audio/aac;")).replace(/^no$/,""))}catch(r){}return n},M.localstorage=function(){try{return localStorage.setItem(v,v),localStorage.removeItem(v),!0}catch(e){return!1}},M.sessionstorage=function(){try{return sessionStorage.setItem(v,v),sessionStorage.removeItem(v),!0}catch(e){return!1}},M.webworkers=function(){return!!e.Worker},M.applicationcache=function(){return!!e.applicationCache},M.svg=function(){return!!t.createElementNS&&!!t.createElementNS(N.svg,"svg").createSVGRect},M.inlinesvg=function(){var e=t.createElement("div");return e.innerHTML="",(e.firstChild&&e.firstChild.namespaceURI)==N.svg},M.smil=function(){return!!t.createElementNS&&/SVGAnimate/.test(w.call(t.createElementNS(N.svg,"animate")))},M.svgclippaths=function(){return!!t.createElementNS&&/SVGClipPath/.test(w.call(t.createElementNS(N.svg,"clipPath")))};for(var H in M)f(M,H)&&(d=H.toLowerCase(),p[d]=M[H](),$.push((p[d]?"":"no-")+d));return p.input||l(),p.addTest=function(e,t){if("object"==typeof e)for(var r in e)f(e,r)&&p.addTest(r,e[r]);else{if(e=e.toLowerCase(),p[e]!==n)return p;t="function"==typeof t?t():t,"undefined"!=typeof h&&h&&(g.className+=" "+(t?"":"no-")+e),p[e]=t}return p},r(""),y=E=null,function(e,t){function n(e,t){var n=e.createElement("p"),r=e.getElementsByTagName("head")[0]||e.documentElement;return n.innerHTML="x",r.insertBefore(n.lastChild,r.firstChild)}function r(){var e=y.elements;return"string"==typeof e?e.split(" "):e}function o(e){var t=v[e[h]];return t||(t={},g++,e[h]=g,v[g]=t),t}function a(e,n,r){if(n||(n=t),l)return n.createElement(e);r||(r=o(n));var a;return a=r.cache[e]?r.cache[e].cloneNode():p.test(e)?(r.cache[e]=r.createElem(e)).cloneNode():r.createElem(e),!a.canHaveChildren||m.test(e)||a.tagUrn?a:r.frag.appendChild(a)}function i(e,n){if(e||(e=t),l)return e.createDocumentFragment();n=n||o(e);for(var a=n.frag.cloneNode(),i=0,c=r(),s=c.length;s>i;i++)a.createElement(c[i]);return a}function c(e,t){t.cache||(t.cache={},t.createElem=e.createElement,t.createFrag=e.createDocumentFragment,t.frag=t.createFrag()),e.createElement=function(n){return y.shivMethods?a(n,e,t):t.createElem(n)},e.createDocumentFragment=Function("h,f","return function(){var n=f.cloneNode(),c=n.createElement;h.shivMethods&&("+r().join().replace(/[\w\-]+/g,function(e){return t.createElem(e),t.frag.createElement(e),'c("'+e+'")'})+");return n}")(y,t.frag)}function s(e){e||(e=t);var r=o(e);return!y.shivCSS||u||r.hasCSS||(r.hasCSS=!!n(e,"article,aside,dialog,figcaption,figure,footer,header,hgroup,main,nav,section{display:block}mark{background:#FF0;color:#000}template{display:none}")),l||c(e,r),e}var u,l,d="3.7.0",f=e.html5||{},m=/^<|^(?:button|map|select|textarea|object|iframe|option|optgroup)$/i,p=/^(?:a|b|code|div|fieldset|h1|h2|h3|h4|h5|h6|i|label|li|ol|p|q|span|strong|style|table|tbody|td|th|tr|ul)$/i,h="_html5shiv",g=0,v={};!function(){try{var e=t.createElement("a");e.innerHTML="",u="hidden"in e,l=1==e.childNodes.length||function(){t.createElement("a");var e=t.createDocumentFragment();return"undefined"==typeof e.cloneNode||"undefined"==typeof e.createDocumentFragment||"undefined"==typeof e.createElement}()}catch(n){u=!0,l=!0}}();var y={elements:f.elements||"abbr article aside audio bdi canvas data 
datalist details dialog figcaption figure footer header hgroup main mark meter nav output progress section summary template time video",version:d,shivCSS:f.shivCSS!==!1,supportsUnknownElements:l,shivMethods:f.shivMethods!==!1,type:"default",shivDocument:s,createElement:a,createDocumentFragment:i};e.html5=y,s(t)}(this,t),p._version=m,p._prefixes=S,p._domPrefixes=T,p._cssomPrefixes=k,p.mq=z,p.hasEvent=A,p.testProp=function(e){return c([e])},p.testAllProps=u,p.testStyles=F,p.prefixed=function(e,t,n){return t?u(e,t,n):u(e,"pfx")},g.className=g.className.replace(/(^|\s)no-js(\s|$)/,"$1$2")+(h?" js "+$.join(" "):""),p}(this,this.document); -------------------------------------------------------------------------------- /site/experiment_tracking/experiment/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | Experiment class API - Test tube Documentation 12 | 13 | 14 | 15 | 16 | 17 | 18 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 |
35 | 36 | 37 | 107 | 108 |
109 | 110 | 111 | 115 | 116 | 117 |
118 |
119 |
120 |
    121 |
  • Docs »
  • 122 | 123 | 124 | 125 |
  • Experiment tracking »
  • 126 | 127 | 128 | 129 |
  • Experiment class API
  • 130 |
  • 131 | 132 | Edit on GitHub 134 | 135 |
  • 136 |
137 |
138 |
139 |
140 |
141 | 142 |

Experiment class API

143 |

[Github Code]

144 |

An Experiment holds metadata and the results of the training run. You 145 | can instantiate an Experiment via:

146 |
from test_tube import Experiment
147 | 
148 | exp = Experiment(name='dense_model',
149 |                  debug=False,
150 |                  save_dir='/Desktop/test_tube')
151 | 
152 | exp.tag({'learning_rate': 0.002, 'nb_layers': 2})
153 | 
154 | for step in training_steps:
155 |     tng_err = model.eval(tng_x, tng_y)
156 | 
157 |     exp.log({'tng_err': tng_err})
158 | 
159 | # training complete!
160 | # all your logs and data are ready to be visualized at testtube.williamfalcon.com
161 | 
162 | 163 |
164 |

init options

165 |

version

166 |

The same Experiment can have multiple versions. Test tube generates 167 | these automatically each time you run your model. To set your own 168 | version use:

169 |
exp = Experiment(name='dense_model',version=1)
170 | 
171 | 172 |

debug

173 |

If you're debugging and don't want to create a log file, set debug to 174 | True

175 |
exp = Experiment(name='dense_model',debug=True)
176 | 
177 | 178 |

autosave

179 |

If you only want to save at the end of training, turn autosave off:

180 |
exp = Experiment(name='dense_model', autosave=False)
181 | 
182 | # run long training...
183 | 
184 | # first time any logs are saved
185 | exp.save()
186 | 
187 | 188 |

create_git_tag

189 |

Ever wanted a flashback to your code when you ran an experiment? 190 | Snapshot your code for this experiment using git tags:

191 |
exp = Experiment(name='dense_model', create_git_tag=True)
192 | 
193 | 194 |
195 |

Methods

196 |

tag

197 |
exp.tag({k: v})
198 | 
199 | 200 |

Adds an arbitrary dictionary of tags to the experiment

201 |

Example

202 |
exp.tag({'dataset_name': 'imagenet_1', 'learning_rate': 0.0002})
203 | 
204 | 205 |

log

206 |
exp.log({k:v})
207 | 
208 | 209 |

Adds a row of data to the experiment

210 |

Example

211 |
exp.log({'val_loss': 0.22, 'epoch_nb': 1, 'batch_nb': 12})
212 | 
213 | # you can also add other rows that have separate information
214 | exp.log({'tng_loss': 0.01})
215 | 
216 | # or even a numpy array image
217 | image = imageio.imread('image.png')
218 | exp.log({'fake_png': image})
219 | 
220 | 221 |

Saving images Example

222 |
# name must have either jpg, png or jpeg in it
223 | img = imageio.imread('a.jpg')  # any reader that returns a numpy array works
224 | exp.log({'test_jpg': img, 'val_err': 0.2})
225 | 
226 | # saves image to ../exp/version/media/test_0.jpg
227 | # csv has file path to that image in that cell
228 | 
229 | 230 |

To save an image, add jpg, png or jpeg to the key corresponding 231 | to the image array. The image must be formatted the same as skimage's 232 | imsave 233 | function
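For completeness, a self-contained sketch of the snippet above (the array shape, key name and save path are illustrative, not from the original docs):

``` {.python}
import numpy as np
from test_tube import Experiment

exp = Experiment(name='dense_model', save_dir='/Desktop/test_tube')

# any HxWx3 array works; the 'png' in the key tells test tube to write it out as an image file
fake_img = np.random.randint(0, 255, size=(32, 32, 3), dtype=np.uint8)
exp.log({'sample_png': fake_img, 'val_err': 0.2})
exp.save()
```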

234 |

argparse

235 |
exp.argparse(hparams)
236 | 
237 | 238 |

Transfers hyperparameter information from an ArgumentParser or 239 | HyperOptArgumentParser

240 |

Example

241 |
from test_tube import HyperOptArgumentParser
242 | 
243 | # parse args
244 | parser = HyperOptArgumentParser()
245 | parser.add_argument('--learning_rate', default=0.002, type=float, help='the learning rate')
246 | hparams = parser.parse_args()
247 | 
248 | # learning_rate is now a meta tag for your experiment
249 | exp.argparse(hparams)
250 | 
251 | 252 |

save

253 |
exp.save()
254 | 
255 | 256 |

Saves the exp to disk (including images)

257 |

Example

258 |
exp = Experiment(name='dense_model', autosave=False)
259 | 
260 | # run long training...
261 | 
262 | # first time any logs are saved
263 | exp.save()
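As a rough guide (inferred from the metrics.csv and media paths used elsewhere in these docs, so treat the exact layout as an assumption), save() writes everything under save_dir/name/version_N, and the logged rows can be read back with pandas:

``` {.python}
import pandas as pd

# version_0 is the first run; later runs of the same experiment name get version_1, version_2, ...
df = pd.read_csv('/Desktop/test_tube/dense_model/version_0/metrics.csv')
print(df.head())

# images logged with a jpg/png/jpeg key are written under .../version_0/media/
```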
264 | 
265 | 266 |
267 |
268 | 289 | 290 |
291 |
292 | 293 |
294 | 295 |
296 | 297 |
298 | 299 | 300 | GitHub 301 | 302 | 303 | « Previous 304 | 305 | 306 | Next » 307 | 308 | 309 |
310 | 311 | 312 | 313 | 314 | 315 | 316 | -------------------------------------------------------------------------------- /docs/hpc/SlurmCluster.md: -------------------------------------------------------------------------------- 1 | # SlurmCluster class API 2 | 3 | [[Github Code](https://github.com/williamFalcon/test-tube/blob/master/test_tube/hpc.py)] 4 | 5 | The SlurmCluster class enables hyperparameter search parallelization on a cluster managed via [Slurm workload manager](https://slurm.schedmd.com/). 6 | 7 | At a high level, the SlurmCluster creates a submit script for each permutation of hyperparameters requested. If the job hits the walltime but has not completed, the SlurmManager will checkpoint the model and submit a new job to continue training using the saved weights. 8 | 9 | - Here's a [full GPU PyTorch example](https://github.com/williamFalcon/test-tube/blob/master/examples/pytorch_hpc_example.py). 10 | - Here's a [full CPU example](https://github.com/williamFalcon/test-tube/blob/master/examples/hpc_cpu_example.py). 11 | 12 | You can instantiate a `SlurmCluster` via: 13 | 14 | ``` {.python} 15 | from test_tube.hpc import SlurmCluster 16 | 17 | # hyperparameters is a test-tube hyper params object 18 | # see https://williamfalcon.github.io/test-tube/hyperparameter_optimization/HyperOptArgumentParser/ 19 | hyperparams = args.parse() 20 | 21 | # init cluster 22 | cluster = SlurmCluster( 23 | hyperparam_optimizer=hyperparams, 24 | log_path='/path/to/log/results/to', 25 | python_cmd='python3' 26 | ) 27 | 28 | # let the cluster know where to email for a change in job status (ie: complete, fail, etc...) 29 | cluster.notify_job_status(email='some@email.com', on_done=True, on_fail=True) 30 | 31 | # set the job options. In this instance, we'll run 20 different models 32 | # each with its own set of hyperparameters giving each one 1 GPU (ie: taking up 20 GPUs) 33 | cluster.per_experiment_nb_gpus = 1 34 | cluster.per_experiment_nb_nodes = 1 35 | 36 | # we'll request 10GB of memory per node 37 | cluster.memory_mb_per_node = 10000 38 | 39 | # set a walltime of 10 minues 40 | cluster.job_time = '10:00' 41 | 42 | # 1 minute before walltime is up, SlurmCluster will launch a continuation job and kill this job. 43 | # you must provide your own loading and saving function which the cluster object will call 44 | cluster.minutes_to_checkpoint_before_walltime = 1 45 | 46 | # run the models on the cluster 47 | cluster.optimize_parallel_cluster_gpu(train, nb_trials=20, job_name='first_tt_batch', job_display_name='my_batch') 48 | ``` 49 | 50 | ------------------------------------------------------------------------ 51 | 52 | ## init options 53 | 54 | ### `hyperparam_optimizer` 55 | 56 | A [HyperOptArgumentParser](https://williamfalcon.github.io/test-tube/hyperparameter_optimization/HyperOptArgumentParser/) object 57 | which contains all permutations of model hyperparameters to run. 58 | 59 | ### `log_path` 60 | 61 | Path to save the slurm scripts, error logs and out logs created. Usually this would be the experiments folder path where test tube saves [Experiment](https://williamfalcon.github.io/test-tube/experiment_tracking/experiment/) information. 62 | 63 | ### `python_cmd` 64 | 65 | This is the command that starts the python program. Normally it is: 66 | 67 | ``` {.python} 68 | # python 2 69 | python main.py 70 | 71 | # python 3 72 | python3 main.py 73 | ``` 74 | 75 | ### `enable_log_err` 76 | 77 | If true, saves slurm error logs to the path at *log_path*. 
If anything goes wrong in your job, you'll find the error here. 78 | 79 | ### `enable_log_out` 80 | 81 | If true, saves slurm output logs to the path at *log_path*. This file contains all outputs that would show up on the console normally. 82 | 83 | ### `test_tube_exp_name` 84 | 85 | When this is given, it structures the files in a nice format to fit with the folder structure of the [Experiment](https://williamfalcon.github.io/test-tube/experiment_tracking/experiment/) object's output. 86 | 87 | ## Properties 88 | 89 | `job_time` 90 | String. Walltime requested. Examples: 91 | ```{.python} 92 | # 1 hour and 10 minutes 93 | cluster.job_time = '1:10:00' 94 | 95 | # 1 day and 1 hour and 10 minutes 96 | cluster.job_time = '1-1:10:00' 97 | 98 | # 1 day and 1 hour and 10 minutes 99 | cluster.job_time = '25:10:00' 100 | 101 | # 10 minutes 102 | cluster.job_time = '10:00' 103 | 104 | # 10 seconds 105 | cluster.job_time = '10' 106 | ``` 107 | 108 | `minutes_to_checkpoint_before_walltime` 109 | Int. Minutes before walltime when a continuation job will be auto-submitted. 110 | ```{.python} 111 | cluster.job_time = '10:00' 112 | cluster.minutes_to_checkpoint_before_walltime = 2 113 | 114 | # New job will be submited to continue training after 8 minutes of the job running. 115 | ``` 116 | 117 | `per_experiment_nb_gpus` 118 | Int. Number of GPUs each job will get. 119 | ```{.python} 120 | # EACH job will get 2 GPUs (ie: if a model runs over two GPUs at the same time). 121 | cluster.per_experiment_nb_gpus = 2 122 | ``` 123 | 124 | `per_experiment_nb_cpus` 125 | Int. Number of CPUs each job will get. 126 | ```{.python} 127 | cluster.per_experiment_nb_cpus = 1 128 | ``` 129 | 130 | `per_experiment_nb_nodes` 131 | Int. Number of nodes each job will get. 132 | ```{.python} 133 | cluster.per_experiment_nb_nodes = 1 134 | ``` 135 | 136 | `gpu_type` 137 | String. Gpu type requested. Example: 138 | ```{.python} 139 | cluster.gpu_type = '1080ti' 140 | ``` 141 | 142 | ------------------------------------------------------------------------ 143 | 144 | ## Methods 145 | 146 | ### `set_checkpoint_save_function` 147 | 148 | ``` {.python} 149 | cluster.set_checkpoint_save_function(fx, kwargs) 150 | ``` 151 | 152 | Called if the model isn't finished training *minutes_to_checkpoint_before_walltime* before the walltime. If walltime = '15:00' and minutes_to_checkpoint_before_walltime = '1:00' the SlurmCluster will call your save function after 14 minutes of training. 153 | 154 | - ```fx``` A python function. 155 | - ```kwargs``` Dictionary where keys are the literal argument names to the function. Dictionary values are the values of the arguments. 156 | 157 | **Example** 158 | 159 | ``` {.python} 160 | def my_save_function(arg_1, arg_k): 161 | # ... save my model here 162 | 163 | cluster.set_checkpoint_save_function(my_save_function, kwargs={'arg_1': 'whatever', 'arg_k': 'you_want'}) 164 | 165 | ``` 166 | 167 | ### `set_checkpoint_load_function` 168 | 169 | ``` {.python} 170 | cluster.set_checkpoint_load_function(fx, kwargs) 171 | ``` 172 | 173 | Called internally when a job is auto-submitted by the SlurmCluster to give your program a chance to load the model weights or whatever you need to continue training. 174 | This will call your load function immediately whenever you call this method AND training is continuing. 175 | 176 | - ```fx``` A python function. 177 | - ```kwargs``` Dictionary where keys are the literal argument names to the function. Dictionary values are the values of the arguments. 
178 | 179 | **Example** 180 | 181 | ``` {.python} 182 | def my_load_function(arg_1, arg_k): 183 |     # ... restore my model here 184 | 185 | cluster.set_checkpoint_load_function(my_load_function, kwargs={'arg_1': 'whatever', 'arg_k': 'you_want'}) 186 | 187 | ``` 188 | 189 | ### `add_slurm_cmd` 190 | 191 | ``` {.python} 192 | cluster.add_slurm_cmd(cmd, value, comment) 193 | ``` 194 | 195 | Adds whatever Slurm command you need manually to the generated script. All possible commands are listed [here](https://slurm.schedmd.com/pdfs/summary.pdf). 196 | 197 | - ```cmd``` String with the bash command. 198 | - ```value``` String value for the command. Numericals need to be in single quotes ```'1'``` 199 | - ```comment``` String with the command comment. 200 | 201 | **Example** 202 | 203 | ``` {.python} 204 | cluster.add_slurm_cmd(cmd='cpus-per-task', value='1', comment='nb cpus per task') 205 | 206 | # the above command will add an entry like this to the slurm script 207 | 208 | # #nb cpus per task 209 | # #SBATCH --cpus-per-task=1 210 | # ############ 211 | 212 | ``` 213 | 214 | ### `add_command` 215 | 216 | ``` {.python} 217 | cluster.add_command(cmd) 218 | ``` 219 | 220 | Adds arbitrary bash commands to the script. Use this to activate conda environments, install packages, whatever else you would think about calling on bash. 221 | 222 | - ```cmd``` String with your bash command. 223 | 224 | **Example** 225 | 226 | 227 | ``` {.python} 228 | # load the anaconda package on the launch node 229 | cluster.add_command('module load anaconda') 230 | 231 | # activate the environment on the launch node 232 | cluster.add_command('source activate myCondaEnv') 233 | ``` 234 | 235 | ### `load_modules` 236 | 237 | ``` {.python} 238 | cluster.load_modules(modules) 239 | ``` 240 | 241 | Loads modules needed to run the job. Your Slurm documentation should have a list of available modules. You can also get those by running ```module avail```. 242 | - ```modules``` Array of module names. 243 | 244 | **Example** 245 | 246 | 247 | ``` {.python} 248 | cluster.load_modules([ 249 |     'python-3', 250 |     'anaconda3' 251 | ]) 252 | ``` 253 | 254 | ### `notify_job_status` 255 | 256 | ``` {.python} 257 | cluster.notify_job_status(email, on_done, on_fail) 258 | ``` 259 | 260 | Sets up email notifications about changes in job status (ie: when a job completes or fails). 261 | 262 | - ```email``` String. Email address to get notifications. 263 | - ```on_done``` Boolean. If true, you'll get an email when the job completes. 264 | - ```on_fail``` Boolean. If true, you'll get an email if the job fails. 265 | 266 | **Example** 267 | 268 | 269 | ``` {.python} 270 | cluster.notify_job_status(email='some@email.com', on_done=True, on_fail=True) 271 | ``` 272 | 273 | ### `optimize_parallel_cluster_gpu` 274 | 275 | ``` {.python} 276 | cluster.optimize_parallel_cluster_gpu(train_function, nb_trials, job_name, job_display_name=None) 277 | ``` 278 | 279 | Launches the hyperparameter search across the cluster nodes. 280 | - ```train_function``` The entry point to start your training routine. 281 | - ```nb_trials``` Number of trials to launch. This is the number of hyperparameter configurations to train over. 282 | - ```job_name``` Folder name where the slurm scripts will save to. This should be the same as your [Experiment](https://williamfalcon.github.io/test-tube/experiment_tracking/experiment/) name.
283 | - ```job_display_name``` Visible name when slurm lists running jobs (ie: through ```squeue -u user_name```). This should be really short (if using a test tube Experiment, it'll put the experiment version at the end). 284 | 285 | **Example** 286 | 287 | 288 | ``` {.python} 289 | def main(hparams, cluster, return_dict): 290 | # do your own generic training code here... 291 | # init model 292 | model = model_build(hparams) 293 | 294 | # set the load and save fxs 295 | cluster.set_checkpoint_save_function(fx, {}) 296 | cluster.set_checkpoint_load_function(fx, {}) 297 | 298 | # train ... 299 | 300 | 301 | cluster.optimize_parallel_cluster_gpu(main, nb_trials=20, job_name='my_job', job_display_name='mj') 302 | ``` 303 | 304 | Now if you get the job information, you'll see this: 305 | ``` {.bash} 306 | (conda_env) [user@node dir]$ squeue -u my_name 307 | JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) 308 | 104040 all mjv0 my_name R 58:22 1 nodeName 309 | 104041 all mjv1 my_name R 58:22 1 nodeName 310 | 104042 all mjv2 my_name R 58:22 1 nodeName 311 | 104043 all mjv3 my_name R 58:22 1 nodeName 312 | ``` 313 | 314 | ### `optimize_parallel_cluster_cpu` 315 | 316 | ``` {.python} 317 | cluster.optimize_parallel_cluster_cpu(train_function, nb_trials, job_name, job_display_name=None) 318 | ``` 319 | 320 | Launches the hyperparameter search across the cluster nodes using cpus. 321 | - ```train_function``` The entry point to start your training routine. 322 | - ```nb_trials``` Number of trials to launch. This is the number of hyperparameter configurations to train over. 323 | - ```job_name``` Folder name where the slurm scripts will save to. This should be the same as your [Experiment](https://williamfalcon.github.io/test-tube/experiment_tracking/experiment/) name. 324 | - ```job_display_name``` Visible name when slurm lists running jobs (ie: through ```squeue -u user_name```). This should be really short (if using a test tube Experiment, it'll put the experiment version at the end). 325 | 326 | **Example** 327 | 328 | 329 | ``` {.python} 330 | def main(hparams, cluster, return_dict): 331 | # do your own generic training code here... 332 | # init model 333 | model = model_build(hparams) 334 | 335 | # set the load and save fxs 336 | cluster.set_checkpoint_save_function(fx, {}) 337 | cluster.set_checkpoint_load_function(fx, {}) 338 | 339 | # train ... 340 | 341 | 342 | cluster.optimize_parallel_cluster_cpu(main, nb_trials=20, job_name='my_job', job_display_name='mj') 343 | ``` 344 | 345 | Now if you get the job information, you'll see this: 346 | ``` {.bash} 347 | (conda_env) [user@node dir]$ squeue -u my_name 348 | JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) 349 | 104040 all mjv0 my_name R 58:22 1 nodeName 350 | 104041 all mjv1 my_name R 58:22 1 nodeName 351 | 104042 all mjv2 my_name R 58:22 1 nodeName 352 | 104043 all mjv3 my_name R 58:22 1 nodeName 353 | ``` 354 | -------------------------------------------------------------------------------- /site/hyperparameter_optimization/HyperOptArgumentParser/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | HyperOptArgumentParser class API - Test tube Documentation 12 | 13 | 14 | 15 | 16 | 17 | 18 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 |
35 | 36 | 37 | 107 | 108 |
109 | 110 | 111 | 115 | 116 | 117 |
118 |
119 |
120 |
    121 |
  • Docs »
  • 122 | 123 | 124 | 125 |
  • Hyperparameter optimization »
  • 126 | 127 | 128 | 129 |
  • HyperOptArgumentParser class API
  • 130 |
  • 131 | 132 | Edit on GitHub 134 | 135 |
  • 136 |
137 |
138 |
139 |
140 |
141 | 142 |

HyperOptArgumentParser class API

143 |

[Github Code]

144 |

The HyperOptArgumentParser is a subclass of Python's 145 | argparse, with added 146 | functionality to change parameters on the fly as determined by a 147 | sampling strategy.

148 |

You can instantiate a HyperOptArgumentParser via:

149 |
from test_tube import HyperOptArgumentParser
150 | 
151 | # subclass of argparse
152 | parser = HyperOptArgumentParser(strategy='random_search')
153 | parser.add_argument('--learning_rate', default=0.002, type=float, help='the learning rate')
154 | 
155 | # let's enable optimizing over the number of layers in the network
156 | parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8])
157 | 
158 | # and tune the number of units in each layer
159 | parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=10)
160 | 
161 | # compile (because it's argparse underneath)
162 | hparams = parser.parse_args()
163 | 
164 | # run 20 trials of random search over the hyperparams
165 | for hparam_trial in hparams.trials(20):
166 |     train_network(hparam_trial)
167 | 
168 | 169 |
170 |

init options

171 |

strategy

172 |

Use either random 173 | search 174 | or grid 175 | search 176 | for tuning:

177 |
parser = HyperOptArgumentParser(strategy='grid_search')
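A rough sketch of the practical difference (the parameter names below are illustrative): with grid_search the trials walk every combination of the tunable options, while random_search samples that many combinations at random.

``` {.python}
from test_tube import HyperOptArgumentParser

parser = HyperOptArgumentParser(strategy='grid_search')
parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8])
parser.opt_list('--optimizer', default='adam', type=str, tunable=True, options=['adam', 'sgd'])
hparams = parser.parse_args()

# with grid_search these 6 trials cover all 3 x 2 combinations of the options above;
# with strategy='random_search' the same call would sample 6 combinations instead
for trial in hparams.trials(6):
    print(trial.nb_layers, trial.optimizer)
```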
178 | 
179 | 180 |
181 |

Methods

182 |

All the functionality from argparse works but we've added the following 183 | functionality:

184 |

opt_list

185 |
parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8])
186 | 
187 | 188 |

Enables searching over a list of values for this parameter. The tunable 189 | values ONLY replace the argparse values when running a hyperparameter 190 | optimization search. This is on purpose so your code doesn't have to 191 | change when you want to tune it.

192 |

Example

193 |
parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8])
194 | hparams = parser.parse_args()
195 | # hparams.nb_layers = 2
196 | 
197 | for trial in hparams.trials(2):
198 |     # trial.nb_layers is now a value in [2, 4, 8]
199 |     # but hparams.nb_layers is still 2
200 | 
201 | 202 |

opt_range

203 |
parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=8, log_base=None)
204 | 
205 | 206 |

Enables searching over a range of values chosen randomly using the 207 | nb_samples given. The tunable values only replace the argparse 208 | values when running a hyperparameter optimization search. This is on 209 | purpose so your code doesn't have to change when you want to tune it.

210 |

If log_base is set to a positive number, it will randomly search over 211 | a log scale, where the log base is log_base. This lets you search 212 | efficiently over several orders of magnitude.

213 |

Example

214 |
parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=8)
215 | hparams = parser.parse_args()
216 | # hparams.neurons = 50
217 | 
218 | for trial in hparams.trials(2):
219 |     # trial.neurons is now a value in [100, 200, 300, 400, 500, 600, 700, 800]
220 |     # but hparams.neurons is still 50
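The log_base option mentioned above can be combined with opt_range for ranges spanning several orders of magnitude; a short sketch mirroring the README's log-scale example (the bounds here are illustrative):

``` {.python}
from test_tube import HyperOptArgumentParser

parser = HyperOptArgumentParser(strategy='random_search')

# sample learning rates on a log10 scale between 1e-5 and 1e-1
# (a range searched this way must be strictly positive)
parser.opt_range('--learning_rate', default=0.001, type=float, tunable=True,
                 low=1e-5, high=1e-1, nb_samples=10, log_base=10)

hparams = parser.parse_args()
for trial in hparams.trials(10):
    print(trial.learning_rate)
```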
221 | 
222 | 223 |

json_config

224 |
parser.json_config('--config', default='example.json')
225 | 
226 | 227 |

Replaces default values in the parser with those read from the json file

228 |

Example

229 |

example.json

230 |
{
231 |     "learning_rate": 200
232 | }
233 | 
234 | 235 |
parser.add_argument('--learning_rate', default=0.002, type=float, help='the learning rate')
236 | parser.json_config('--config', default='example.json')
237 | hparams = parser.parse_args()
238 | 
239 | # hparams.learning_rate = 200
240 | 
241 | 242 |

trials

243 |
trial_generator = hparams.trials(2)
244 | 
245 | 246 |

Computes the trials needed for these experiments and serves them via a 247 | generator

248 |

Example

249 |
hparams = parser.parse_args()
250 | for trial_hparams in hparams.trials(2):
251 |     # trial_hparams now has values sampled from the training routine
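Judging from the parser source shipped in this repo (test_tube/argparse_hopt.py), there is also a generate_trials(nb_trials) helper that returns the sampled configurations as a plain list rather than a generator; a small sketch:

``` {.python}
from test_tube import HyperOptArgumentParser

parser = HyperOptArgumentParser(strategy='random_search')
parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8])
parser.opt_list('--optimizer', default='adam', type=str, tunable=True, options=['adam', 'sgd'])
hparams = parser.parse_args()

# a list of trial namespaces sampled according to the parser's strategy
trials = hparams.generate_trials(4)
print(len(trials), trials[0].nb_layers)
```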
252 | 
253 | 254 |

optimize_parallel_gpu

255 |
hparams = parser.parse_args()
256 | hparams.optimize_parallel_gpu(function_to_optimize, gpu_ids=['1', '0, 2'])
257 | 
258 | 259 |

Parallelize the trials across nb_workers processes. Auto assign the 260 | correct gpus. Argument passed into the function_to_optimize is the 261 | trial_params argument and the gpu_ids.

262 |

Example

263 |
# parallelize tuning on 2 gpus
264 | # this will place each of the n trials onto one of the given gpus
265 | def train_main(trial_params, gpu_ids):
266 |     # train your model, etc here...
267 | 
268 | hparams = parser.parse_args()
269 | hparams.optimize_parallel_gpu(train_main, gpu_ids=['1', '0, 2'])
270 | 
271 | # at the end of the optimize_parallel function, all 20 trials will be completed
272 | # in this case by running 10 sets of 2 trials in parallel
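Judging from the implementation in test_tube/argparse_hopt.py (included in this repo), one worker process is started per entry in gpu_ids and each worker exports its entry as CUDA_VISIBLE_DEVICES, so an entry like '0, 2' gives that worker two GPUs. A short sketch (the max_nb_trials value is illustrative):

``` {.python}
from test_tube import HyperOptArgumentParser

parser = HyperOptArgumentParser(strategy='random_search')
parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8])

def train_main(trial_params, gpu_ids):
    # gpu_ids is the entry this worker pulled from the queue, e.g. '1' or '0, 2'
    print('training on GPUs', gpu_ids, 'with', trial_params.nb_layers, 'layers')

hparams = parser.parse_args()

# 2 workers: one with GPU 1, one sharing GPUs 0 and 2
hparams.optimize_parallel_gpu(train_main, gpu_ids=['1', '0, 2'], max_nb_trials=4)
```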
273 | 
274 | 275 |

optimize_parallel_cpu

276 |
hparams = parser.parse_args()
277 | hparams.optimize_parallel_cpu(function_to_optimize, nb_trials=20, nb_workers=2)
278 | 
279 | 280 |

Parallelize the trials across nb_workers cpus. Argument passed into 281 | the function_to_optimize is the trial_params argument.

282 |

Example

283 |
# parallelize tuning on 2 cpus
284 | # this will place each of the n trials onto one of the available cpus
285 | def train_main(trial_params):
286 |     # train your model, etc here...
287 | 
288 | hparams = parser.parse_args()
289 | hparams.optimize_parallel_cpu(train_main, nb_trials=20, nb_workers=2)
290 | 
291 | # at the end of the optimize_parallel function, all 20 trials will be completed
292 | # in this case by running 10 sets of 2 trials in parallel
293 | 
294 | 295 |
296 |
297 | 316 | 317 |
318 |
319 | 320 |
321 | 322 |
323 | 324 |
325 | 326 | 327 | GitHub 328 | 329 | 330 | « Previous 331 | 332 | 333 | 334 |
335 | 336 | 337 | 338 | 339 | 340 | 341 | -------------------------------------------------------------------------------- /test_tube/argparse_hopt.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import math 4 | import os 5 | import random 6 | import re 7 | import traceback 8 | from argparse import ArgumentParser 9 | from copy import deepcopy 10 | from gettext import gettext as _ 11 | from multiprocessing import Pool, Queue 12 | from time import sleep 13 | 14 | import numpy as np 15 | 16 | from .hyper_opt_utils import strategies 17 | 18 | # needed to work with pytorch multiprocess 19 | try: 20 | import torch 21 | import multiprocessing 22 | # multiprocessing.set_start_method('spawn', force=True) 23 | except ModuleNotFoundError: 24 | pass 25 | 26 | 27 | def optimize_parallel_gpu_private(args): 28 | trial_params, train_function = args[0], args[1] 29 | 30 | # get set of gpu ids 31 | gpu_id_set = g_gpu_id_q.get(block=True) 32 | 33 | try: 34 | 35 | # enable the proper gpus 36 | os.environ["CUDA_VISIBLE_DEVICES"] = gpu_id_set 37 | 38 | # run training fx on the specific gpus 39 | results = train_function(trial_params, gpu_id_set) 40 | 41 | return [trial_params, results] 42 | 43 | except Exception as e: 44 | print('Caught exception in worker thread', e) 45 | 46 | # This prints the type, value, and stack trace of the 47 | # current exception being handled. 48 | traceback.print_exc() 49 | return [trial_params, None] 50 | 51 | finally: 52 | g_gpu_id_q.put(gpu_id_set) 53 | 54 | 55 | def optimize_parallel_cpu_private(args): 56 | trial_params, train_function = args[0], args[1] 57 | 58 | sleep(random.randint(0, 4)) 59 | 60 | # run training fx on the specific gpus 61 | results = train_function(trial_params) 62 | 63 | # True = completed 64 | return [trial_params, results] 65 | 66 | 67 | class HyperOptArgumentParser(ArgumentParser): 68 | """ 69 | Subclass of argparse ArgumentParser which adds optional calls to sample from lists or ranges 70 | Also enables running optimizations across parallel processes 71 | """ 72 | 73 | # these are commands injected by test tube from cluster operations 74 | TRIGGER_CMD = 'test_tube_from_cluster_hopt' 75 | SLURM_CMD_PATH = 'test_tube_slurm_cmd_path' 76 | SLURM_EXP_CMD = 'hpc_exp_number' 77 | SLURM_LOAD_CMD = 'test_tube_do_checkpoint_load' 78 | CMD_MAP = { 79 | TRIGGER_CMD: bool, 80 | SLURM_CMD_PATH: str, 81 | SLURM_EXP_CMD: int, 82 | SLURM_LOAD_CMD: bool 83 | } 84 | 85 | def __init__(self, strategy='grid_search', **kwargs): 86 | """ 87 | 88 | :param strategy: 'grid_search', 'random_search' 89 | :param enabled: 90 | :param experiment: 91 | :param kwargs: 92 | """ 93 | ArgumentParser.__init__(self, **kwargs) 94 | 95 | self.strategy = strategy 96 | self.trials = [] 97 | self.parsed_args = None 98 | self.opt_args = {} 99 | self.json_config_arg_name = None 100 | self.pool = None 101 | 102 | def __getstate__(self): 103 | # capture what is normally pickled 104 | state = self.__dict__.copy() 105 | 106 | # remove all functions from the namespace 107 | clean_state = {} 108 | for k, v in state.items(): 109 | if not hasattr(v, '__call__'): 110 | clean_state[k] = v 111 | 112 | # what we return here will be stored in the pickle 113 | return clean_state 114 | 115 | def __setstate__(self, newstate): 116 | # re-instate our __dict__ state from the pickled state 117 | self.__dict__.update(newstate) 118 | 119 | def add_argument(self, *args, **kwargs): 120 | super(HyperOptArgumentParser, self).add_argument(*args, 
**kwargs) 121 | 122 | def opt_list(self, *args, **kwargs): 123 | options = kwargs.pop("options", None) 124 | tunable = kwargs.pop("tunable", False) 125 | self.add_argument(*args, **kwargs) 126 | for i in range(len(args)): 127 | arg_name = args[i] 128 | self.opt_args[arg_name] = OptArg(obj_id=arg_name, opt_values=options, tunable=tunable) 129 | 130 | def opt_range( 131 | self, 132 | *args, 133 | **kwargs 134 | ): 135 | low = kwargs.pop("low", None) 136 | high = kwargs.pop("high", None) 137 | arg_type = kwargs["type"] 138 | nb_samples = kwargs.pop("nb_samples", 10) 139 | tunable = kwargs.pop("tunable", False) 140 | log_base = kwargs.pop("log_base", None) 141 | 142 | self.add_argument(*args, **kwargs) 143 | arg_name = args[-1] 144 | self.opt_args[arg_name] = OptArg( 145 | obj_id=arg_name, 146 | opt_values=[low, high], 147 | arg_type=arg_type, 148 | nb_samples=nb_samples, 149 | tunable=tunable, 150 | log_base=log_base, 151 | ) 152 | 153 | def json_config(self, *args, **kwargs): 154 | self.add_argument(*args, **kwargs) 155 | self.json_config_arg_name = re.sub('-', '', args[-1]) 156 | 157 | def __parse_args(self, args=None, namespace=None): 158 | # allow bypassing certain missing params which other parts of test tube may introduce 159 | args, argv = self.parse_known_args(args, namespace) 160 | args, argv = self.__whitelist_cluster_commands(args, argv) 161 | if argv: 162 | msg = _('unrecognized arguments: %s') 163 | self.error(msg % ' '.join(argv)) 164 | return args 165 | 166 | def __whitelist_cluster_commands(self, args, argv): 167 | parsed = {} 168 | 169 | # build a dict where key = arg, value = value of the arg or None if just a flag 170 | for i, arg_candidate in enumerate(argv): 171 | arg = None 172 | value = None 173 | 174 | # only look at --keys 175 | if '--' not in arg_candidate: 176 | continue 177 | 178 | # skip items not on the white list 179 | if arg_candidate[2:] not in HyperOptArgumentParser.CMD_MAP: 180 | continue 181 | 182 | arg = arg_candidate[2:] 183 | # pull out the value of the argument if given 184 | if i + 1 <= len(argv) - 1: 185 | if '--' not in argv[i + 1]: 186 | value = argv[i + 1] 187 | 188 | if arg is not None: 189 | parsed[arg] = value 190 | else: 191 | if arg is not None: 192 | parsed[arg] = value 193 | 194 | # add the whitelist cmds to the args 195 | all_values = set() 196 | for k, v in args.__dict__.items(): 197 | all_values.add(k) 198 | all_values.add(v) 199 | 200 | for arg, v in parsed.items(): 201 | v_parsed = self.__parse_primitive_arg_val(v) 202 | all_values.add(v) 203 | all_values.add(arg) 204 | args.__setattr__(arg, v_parsed) 205 | 206 | # make list with only the unknown args 207 | unk_args = [] 208 | for arg in argv: 209 | arg_candidate = re.sub('--', '', arg) 210 | is_bool = arg_candidate == 'True' or arg_candidate == 'False' 211 | if is_bool: continue 212 | 213 | if arg_candidate not in all_values: 214 | unk_args.append(arg) 215 | 216 | # when no bad args are left, return none to be consistent with super api 217 | if len(unk_args) == 0: 218 | unk_args = None 219 | 220 | # add hpc_exp_number if not passed in so we can never get None 221 | if HyperOptArgumentParser.SLURM_EXP_CMD not in args: 222 | args.__setattr__(HyperOptArgumentParser.SLURM_EXP_CMD, None) 223 | 224 | return args, unk_args 225 | 226 | def __parse_primitive_arg_val(self, val): 227 | if val is None: 228 | return True 229 | try: 230 | return int(val) 231 | except ValueError: 232 | try: 233 | return float(val) 234 | except ValueError: 235 | return val 236 | 237 | def parse_args(self, args=None, 
namespace=None): 238 | # call superclass arg first 239 | results = self.__parse_args(args, namespace) 240 | 241 | # extract vals 242 | old_args = vars(results) 243 | 244 | # override with json args if given 245 | if self.json_config_arg_name and old_args[self.json_config_arg_name]: 246 | for arg, v in self.__read_json_config(old_args[self.json_config_arg_name]).items(): 247 | old_args[arg] = v 248 | 249 | # track args 250 | self.parsed_args = deepcopy(old_args) 251 | # attach optimization fx 252 | old_args['trials'] = self.opt_trials 253 | old_args['optimize_parallel'] = self.optimize_parallel 254 | old_args['optimize_parallel_gpu'] = self.optimize_parallel_gpu 255 | old_args['optimize_parallel_cpu'] = self.optimize_parallel_cpu 256 | old_args['generate_trials'] = self.generate_trials 257 | old_args['optimize_trials_parallel_gpu'] = self.optimize_trials_parallel_gpu 258 | 259 | return TTNamespace(**old_args) 260 | 261 | def __read_json_config(self, file_path): 262 | with open(file_path) as json_data: 263 | json_args = json.load(json_data) 264 | return json_args 265 | 266 | def opt_trials(self, num): 267 | self.trials = strategies.generate_trials( 268 | strategy=self.strategy, 269 | flat_params=self.__flatten_params(self.opt_args), 270 | nb_trials=num, 271 | ) 272 | 273 | for trial in self.trials: 274 | ns = self.__namespace_from_trial(trial) 275 | yield ns 276 | 277 | def generate_trials(self, nb_trials): 278 | trials = strategies.generate_trials( 279 | strategy=self.strategy, 280 | flat_params=self.__flatten_params(self.opt_args), 281 | nb_trials=nb_trials, 282 | ) 283 | 284 | trials = [self.__namespace_from_trial(x) for x in trials] 285 | return trials 286 | 287 | def optimize_parallel_gpu( 288 | self, 289 | train_function, 290 | gpu_ids, 291 | max_nb_trials=None, 292 | ): 293 | """ 294 | Runs optimization across gpus with cuda drivers 295 | :param train_function: 296 | :param max_nb_trials: 297 | :param gpu_ids: List of strings like: ['0', '1, 3'] 298 | :return: 299 | """ 300 | self.trials = strategies.generate_trials( 301 | strategy=self.strategy, 302 | flat_params=self.__flatten_params(self.opt_args), 303 | nb_trials=max_nb_trials, 304 | ) 305 | 306 | self.trials = [(self.__namespace_from_trial(x), train_function) for x in self.trials] 307 | 308 | # build q of gpu ids so we can use them in each process 309 | # this is thread safe so each process can pull out a gpu id, run its task and put it back when done 310 | if self.pool is None: 311 | gpu_q = Queue() 312 | for gpu_id in gpu_ids: 313 | gpu_q.put(gpu_id) 314 | 315 | # called by the Pool when a process starts 316 | def init(local_gpu_q): 317 | global g_gpu_id_q 318 | g_gpu_id_q = local_gpu_q 319 | 320 | # init a pool with the nb of worker threads we want 321 | nb_workers = len(gpu_ids) 322 | self.pool = Pool(processes=nb_workers, initializer=init, initargs=(gpu_q,)) 323 | 324 | # apply parallelization 325 | results = self.pool.map(optimize_parallel_gpu_private, self.trials) 326 | return results 327 | 328 | def optimize_trials_parallel_gpu( 329 | self, 330 | train_function, 331 | nb_trials, 332 | trials, 333 | gpu_ids, 334 | nb_workers=4, 335 | ): 336 | """ 337 | Runs optimization across gpus with cuda drivers 338 | :param train_function: 339 | :param nb_trials: 340 | :param gpu_ids: List of strings like: ['0', '1, 3'] 341 | :param nb_workers: 342 | :return: 343 | """ 344 | self.trials = trials 345 | self.trials = [(x, train_function) for x in self.trials] 346 | 347 | # build q of gpu ids so we can use them in each process 348 | # 
this is thread safe so each process can pull out a gpu id, run its task and put it back when done 349 | if self.pool is None: 350 | gpu_q = Queue() 351 | for gpu_id in gpu_ids: 352 | gpu_q.put(gpu_id) 353 | 354 | # called by the Pool when a process starts 355 | def init(local_gpu_q): 356 | global g_gpu_id_q 357 | g_gpu_id_q = local_gpu_q 358 | 359 | # init a pool with the nb of worker threads we want 360 | self.pool = Pool(processes=nb_workers, initializer=init, initargs=(gpu_q,)) 361 | 362 | # apply parallelization 363 | results = self.pool.map(optimize_parallel_gpu_private, self.trials) 364 | return results 365 | 366 | def optimize_parallel_cpu( 367 | self, 368 | train_function, 369 | nb_trials, 370 | nb_workers=4, 371 | ): 372 | """ 373 | Runs optimization across n cpus 374 | :param train_function: 375 | :param nb_trials: 376 | :param nb_workers: 377 | :return: 378 | """ 379 | self.trials = strategies.generate_trials( 380 | strategy=self.strategy, 381 | flat_params=self.__flatten_params(self.opt_args), 382 | nb_trials=nb_trials 383 | ) 384 | 385 | self.trials = [(self.__namespace_from_trial(x), train_function) for x in self.trials] 386 | 387 | # init a pool with the nb of worker threads we want 388 | if self.pool is None: 389 | self.pool = Pool(processes=nb_workers) 390 | 391 | # apply parallelization 392 | results = self.pool.map(optimize_parallel_cpu_private, self.trials) 393 | return results 394 | 395 | def optimize_parallel( 396 | self, 397 | train_function, 398 | nb_trials, 399 | nb_parallel=4, 400 | ): 401 | self.trials = strategies.generate_trials( 402 | strategy=self.strategy, 403 | flat_params=self.__flatten_params(self.opt_args), 404 | nb_trials=nb_trials 405 | ) 406 | 407 | # nb of runs through all parallel systems 408 | fork_batches = [ 409 | self.trials[i:i + nb_parallel] for i in range(0, len(self.trials), nb_parallel) 410 | ] 411 | 412 | for fork_batch in fork_batches: 413 | children = [] 414 | 415 | # run n parallel forks 416 | for parallel_nb, trial in enumerate(fork_batch): 417 | 418 | # q up the trial and convert to a namespace 419 | ns = self.__namespace_from_trial(trial) 420 | 421 | # split new fork 422 | pid = os.fork() 423 | 424 | # when the process is a parent 425 | if pid: 426 | children.append(pid) 427 | 428 | # when process is a child 429 | else: 430 | # slight delay to make sure we don't overwrite over test tube log versions 431 | sleep(parallel_nb * 0.5) 432 | train_function(ns, parallel_nb) 433 | os._exit(0) 434 | 435 | for i, child in enumerate(children): 436 | os.waitpid(child, 0) 437 | 438 | def __namespace_from_trial(self, trial): 439 | trial_dict = {d['name']: d['val'] for d in trial} 440 | for k, v in self.parsed_args.items(): 441 | if k not in trial_dict: 442 | trial_dict[k] = v 443 | 444 | return TTNamespace(**trial_dict) 445 | 446 | def __flatten_params(self, params): 447 | """ 448 | Turns a list of parameters with values into a flat tuple list of lists 449 | so we can permute 450 | :param params: 451 | :return: 452 | """ 453 | flat_params = [] 454 | for i, (opt_name, opt_arg) in enumerate(params.items()): 455 | if opt_arg.tunable: 456 | clean_name = opt_name.strip('-') 457 | clean_name = re.sub('-', '_', clean_name) 458 | param_groups = [] 459 | for val in opt_arg.opt_values: 460 | param_groups.append({'idx': i, 'val': val, 'name': clean_name}) 461 | flat_params.append(param_groups) 462 | return flat_params 463 | 464 | 465 | class TTNamespace(argparse.Namespace): 466 | 467 | def __str__(self): 468 | result = '-' * 100 + '\nHyperparameters:\n' 469 
| for k, v in self.__dict__.items(): 470 | result += '{0:20}: {1}\n'.format(k, v) 471 | return result 472 | 473 | def __getstate__(self): 474 | # capture what is normally pickled 475 | state = self.__dict__.copy() 476 | 477 | # remove all functions from the namespace 478 | clean_state = {} 479 | for k, v in state.items(): 480 | if not hasattr(v, '__call__'): 481 | clean_state[k] = v 482 | 483 | # what we return here will be stored in the pickle 484 | return clean_state 485 | 486 | def __setstate__(self, newstate): 487 | # re-instate our __dict__ state from the pickled state 488 | self.__dict__.update(newstate) 489 | 490 | 491 | class OptArg(object): 492 | def __init__( 493 | self, 494 | obj_id, 495 | opt_values, 496 | arg_type=None, 497 | nb_samples=None, 498 | tunable=False, 499 | log_base=None, 500 | ): 501 | self.opt_values = opt_values 502 | self.obj_id = obj_id 503 | self.tunable = tunable 504 | 505 | # convert range to list of values 506 | if nb_samples: 507 | low, high = opt_values 508 | 509 | if log_base is None: 510 | # random search on uniform scale 511 | if arg_type is int: 512 | self.opt_values = [int(_) for _ in np.random.choice(np.arange(low, high), nb_samples, replace=False)] 513 | elif arg_type is float: 514 | self.opt_values = np.random.uniform(low, high, nb_samples) 515 | else: 516 | # random search on log scale with specified base 517 | assert high >= low > 0, "`opt_values` must be positive to do log-scale search." 518 | 519 | log_low, log_high = math.log(low, log_base), math.log(high, log_base) 520 | 521 | self.opt_values = log_base ** np.random.uniform(log_low, log_high, nb_samples) 522 | 523 | -------------------------------------------------------------------------------- /test_tube/hpc.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | import signal 4 | import sys 5 | import time 6 | import traceback 7 | from subprocess import call 8 | 9 | from .argparse_hopt import HyperOptArgumentParser 10 | 11 | 12 | def exit(): 13 | time.sleep(1) 14 | os._exit(1) 15 | 16 | 17 | class AbstractCluster(object): 18 | 19 | RUN_CMD = 'sbatch' 20 | def __init__( 21 | self, 22 | hyperparam_optimizer=None, 23 | log_path=None, 24 | python_cmd='python3', 25 | enable_log_err=True, 26 | enable_log_out=True, 27 | ): 28 | self.hyperparam_optimizer = hyperparam_optimizer 29 | self.log_path = log_path 30 | 31 | self.enable_log_err = enable_log_err 32 | self.enable_log_out = enable_log_out 33 | self.slurm_files_log_path = None 34 | self.err_log_path = None 35 | self.out_log_path = None 36 | self.modules = [] 37 | self.script_name = os.path.realpath(sys.argv[0]) 38 | self.job_time = '15:00' 39 | self.minutes_to_checkpoint_before_walltime = 5 40 | self.per_experiment_nb_gpus = 1 41 | self.per_experiment_nb_cpus = 1 42 | self.per_experiment_nb_nodes = 1 43 | self.memory_mb_per_node = 2000 44 | self.email = None 45 | self.notify_on_end = False 46 | self.notify_on_fail = False 47 | self.job_name = None 48 | self.python_cmd = python_cmd 49 | self.gpu_type = None 50 | self.on_gpu = False 51 | self.call_load_checkpoint = False 52 | self.commands = [] 53 | self.slurm_commands = [] 54 | self.hpc_exp_number = 0 55 | 56 | # these are set via getters and setters so we can use a BaseManager which can be shared across processes 57 | self.checkpoint_save_function = None 58 | self.checkpoint_load_function = None 59 | 60 | # detect when this was called because a slurm object started a hopt. 
61 | # if true, remove the flag so tt logs don't show it 62 | if hyperparam_optimizer is not None: 63 | 64 | self.is_from_slurm_object = HyperOptArgumentParser.TRIGGER_CMD in vars(self.hyperparam_optimizer) and vars(self.hyperparam_optimizer)[HyperOptArgumentParser.TRIGGER_CMD] == True 65 | if self.is_from_slurm_object: 66 | self.hyperparam_optimizer.__delattr__(HyperOptArgumentParser.TRIGGER_CMD) 67 | 68 | self.call_load_checkpoint = HyperOptArgumentParser.SLURM_LOAD_CMD in vars(self.hyperparam_optimizer) 69 | if self.call_load_checkpoint: 70 | self.hyperparam_optimizer.__delattr__(HyperOptArgumentParser.SLURM_LOAD_CMD) 71 | 72 | self.hpc_exp_number = self.hyperparam_optimizer.hpc_exp_number 73 | 74 | def set_checkpoint_save_function(self, fx, kwargs): 75 | self.checkpoint_save_function = [fx, kwargs] 76 | 77 | def get_checkpoint_save_function(self): 78 | return self.checkpoint_save_function 79 | 80 | def set_checkpoint_load_function(self, fx, kwargs): 81 | # if we were passed in the load flag, then we call the load function as soon as it's added 82 | if self.call_load_checkpoint: 83 | fx(**kwargs) 84 | 85 | self.checkpoint_load_function = [fx, kwargs] 86 | 87 | def get_checkpoint_load_function(self): 88 | return self.checkpoint_load_function 89 | 90 | def add_slurm_cmd(self, cmd, value, comment): 91 | self.slurm_commands.append((cmd, value, comment)) 92 | 93 | def add_command(self, cmd): 94 | self.commands.append(cmd) 95 | 96 | def load_modules(self, modules): 97 | self.modules = modules 98 | 99 | def notify_job_status(self, email, on_done, on_fail): 100 | self.email = email 101 | self.notify_on_end = on_done 102 | self.notify_on_fail = on_fail 103 | 104 | def optimize_parallel_cluster(self, train_function, nb_trials, job_name): 105 | raise NotImplementedError 106 | 107 | def optimize_parallel_slurm(self, job_name, output_file, error_file, job_time, nb_gpus, nb_nodes, memory, notifications_email, gpu_types): 108 | pass 109 | 110 | 111 | class SlurmCluster(AbstractCluster): 112 | def __init__(self, *args, **kwargs): 113 | super(SlurmCluster, self).__init__(*args, **kwargs) 114 | 115 | def optimize_parallel_cluster_gpu( 116 | self, 117 | train_function, 118 | nb_trials, 119 | job_name, 120 | enable_auto_resubmit=False, 121 | job_display_name=None 122 | ): 123 | if job_display_name is None: 124 | job_display_name = job_name 125 | 126 | self.__optimize_parallel_cluster_internal(train_function, nb_trials, job_name, job_display_name, 127 | enable_auto_resubmit, on_gpu=True) 128 | 129 | def optimize_parallel_cluster_cpu( 130 | self, 131 | train_function, 132 | nb_trials, 133 | job_name, 134 | enable_auto_resubmit=False, 135 | job_display_name=None 136 | ): 137 | if job_display_name is None: 138 | job_display_name = job_name 139 | 140 | self.__optimize_parallel_cluster_internal(train_function, nb_trials, job_name, job_display_name, 141 | enable_auto_resubmit, on_gpu=False) 142 | 143 | def __optimize_parallel_cluster_internal( 144 | self, 145 | train_function, 146 | nb_trials, 147 | job_name, 148 | job_display_name, 149 | enable_auto_resubmit, 150 | on_gpu 151 | ): 152 | """ 153 | Runs optimization on the attached cluster 154 | :param train_function: 155 | :param nb_trials: 156 | :param job_name: 157 | :return: 158 | """ 159 | self.job_name = job_name 160 | self.job_display_name = job_display_name 161 | self.on_gpu = on_gpu 162 | self.enable_auto_resubmit = enable_auto_resubmit 163 | 164 | # layout logging structure 165 | self.__layout_logging_dir() 166 | 167 | if self.is_from_slurm_object: 168 | 
# Script is called by slurm: it's an actual experiment. 169 | self.__run_experiment(train_function) 170 | else: 171 | # Launcher script. Generate trials and launch jobs. 172 | 173 | # generate hopt trials 174 | trials = self.hyperparam_optimizer.generate_trials(nb_trials) 175 | 176 | # get the max test tube exp version so far if it's there 177 | scripts_path = os.path.join(self.log_path, 'slurm_out_logs') 178 | next_trial_version = self.__get_max_trial_version(scripts_path) 179 | 180 | # for each trial, generate a slurm command 181 | for i, trial_params in enumerate(trials): 182 | exp_i = i + next_trial_version 183 | self.schedule_experiment(trial_params, exp_i) 184 | 185 | def schedule_experiment(self, trial_params, exp_i): 186 | timestamp = datetime.datetime.now().strftime("%Y-%m-%d__%H-%M-%S") 187 | timestamp = 'trial_{}_{}'.format(exp_i, timestamp) 188 | 189 | # generate command 190 | slurm_cmd_script_path = os.path.join(self.slurm_files_log_path, '{}_slurm_cmd.sh'.format(timestamp)) 191 | slurm_cmd = self.__build_slurm_command(trial_params, slurm_cmd_script_path, timestamp, exp_i, self.on_gpu) 192 | self.__save_slurm_cmd(slurm_cmd, slurm_cmd_script_path) 193 | 194 | # run script to launch job 195 | print('\nlaunching exp...') 196 | result = call('{} {}'.format(AbstractCluster.RUN_CMD, slurm_cmd_script_path), shell=True) 197 | if result == 0: 198 | print('launched exp ', slurm_cmd_script_path) 199 | else: 200 | print('launch failed...') 201 | 202 | def slurm_time_to_seconds(self, job_time): 203 | seconds = 0 204 | time_component = job_time 205 | if '-' in job_time: 206 | days, time_component = job_time.split('-') 207 | seconds += int(days) * 24 * 60 * 60 208 | 209 | time_components = time_component.split(':') 210 | if len(time_components) == 3: 211 | hours, minutes, secs = time_components 212 | time_seconds = int(secs) + (int(minutes) * 60) + (int(hours) * 60 * 60) 213 | seconds += time_seconds 214 | 215 | elif len(time_components) == 2: 216 | minutes, secs = time_components 217 | time_seconds = int(secs) + (int(minutes) * 60) 218 | seconds += time_seconds 219 | 220 | elif len(time_components) == 1: 221 | secs = time_components[0] 222 | seconds += int(secs) 223 | 224 | return seconds 225 | 226 | def call_save(self): 227 | print('calling save') 228 | 229 | # if save function was passed, call it 230 | if self.get_checkpoint_save_function() is not None: 231 | save_fx, kwargs = self.get_checkpoint_save_function() 232 | save_fx(**kwargs) 233 | 234 | # if we're here, the job didn't finish and we were given a save function 235 | # if we were given a load function, then schedule the program again and pass in the load function 236 | if self.get_checkpoint_load_function() is not None: 237 | job_id = os.environ['SLURM_JOB_ID'] 238 | cmd = 'scontrol requeue {}'.format(job_id) 239 | 240 | print('\nrequeing job {}...'.format(job_id)) 241 | result = call(cmd, shell=True) 242 | if result == 0: 243 | print('requeued exp ', job_id) 244 | else: 245 | print('requeue failed...') 246 | 247 | # stop program 248 | os._exit(0) 249 | 250 | def sig_handler(self, signum, frame): 251 | print("caught signal", signum) 252 | self.call_save() 253 | # sys.exit(-1) 254 | 255 | # ------------------------ 256 | # HANDLE SLURM SIGNALS 257 | # ------------------------ 258 | def term_handler(self, signum, frame): 259 | print("bypassing sigterm") 260 | 261 | def __run_experiment(self, train_function): 262 | if self.enable_auto_resubmit: 263 | print('setting signal') 264 | signal.signal(signal.SIGUSR1, self.sig_handler) 265 | 
signal.signal(signal.SIGTERM, self.term_handler) 266 | 267 | try: 268 | # run training 269 | train_function(self.hyperparam_optimizer, self) 270 | 271 | except Exception as e: 272 | print('Caught exception in worker thread', e) 273 | 274 | # This prints the type, value, and stack trace of the 275 | # current exception being handled. 276 | traceback.print_exc() 277 | raise SystemExit 278 | 279 | def __save_slurm_cmd(self, slurm_cmd, slurm_cmd_script_path): 280 | with open(slurm_cmd_script_path, mode='w') as file: 281 | file.write(slurm_cmd) 282 | 283 | def __get_max_trial_version(self, path): 284 | files = os.listdir(path) 285 | version_files = [f for f in files if 'trial_' in f] 286 | if len(version_files) > 0: 287 | # regex out everything except file version for ve 288 | versions = [int(f_name.split('_')[1]) for f_name in version_files] 289 | max_version = max(versions) 290 | return max_version + 1 291 | else: 292 | return 0 293 | 294 | def __layout_logging_dir(self): 295 | """ 296 | Generates dir structure for logging errors and outputs 297 | :return: 298 | """ 299 | 300 | # format the logging folder path 301 | slurm_out_path = os.path.join(self.log_path, self.job_name) 302 | 303 | self.log_path = slurm_out_path 304 | 305 | # if we have a test tube name, make the folder and set as the logging destination 306 | if not os.path.exists(slurm_out_path): 307 | os.makedirs(slurm_out_path) 308 | 309 | # when err logging is enabled, build add the err logging folder 310 | if self.enable_log_err: 311 | err_path = os.path.join(slurm_out_path, 'slurm_err_logs') 312 | if not os.path.exists(err_path): 313 | os.makedirs(err_path) 314 | self.err_log_path = err_path 315 | 316 | # when out logging is enabled, build add the out logging folder 317 | if self.enable_log_out: 318 | out_path = os.path.join(slurm_out_path, 'slurm_out_logs') 319 | if not os.path.exists(out_path): 320 | os.makedirs(out_path) 321 | self.out_log_path = out_path 322 | 323 | # place where slurm files log to 324 | self.slurm_files_log_path = os.path.join(slurm_out_path, 'slurm_scripts') 325 | if not os.path.exists(self.slurm_files_log_path): 326 | os.makedirs(self.slurm_files_log_path) 327 | 328 | def __get_hopt_params(self, trial): 329 | """ 330 | Turns hopt trial into script params 331 | :param trial: 332 | :return: 333 | """ 334 | 335 | params = [] 336 | for k in trial.__dict__: 337 | v = trial.__dict__[k] 338 | 339 | # don't add None params 340 | if v is None or v is False: 341 | continue 342 | 343 | # put everything in quotes except bools 344 | if self.__should_escape(v): 345 | cmd = '--{} \"{}\"'.format(k, v) 346 | else: 347 | cmd = '--{} {}'.format(k, v) 348 | params.append(cmd) 349 | 350 | # this arg lets the hyperparameter optimizer do its thing 351 | params.append('--{}'.format(HyperOptArgumentParser.TRIGGER_CMD)) 352 | 353 | full_cmd = ' '.join(params) 354 | return full_cmd 355 | 356 | def __should_escape(self, v): 357 | v = str(v) 358 | return '[' in v or ';' in v or ' ' in v 359 | 360 | def __build_slurm_command(self, trial, slurm_cmd_script_path, timestamp, exp_i, on_gpu): 361 | sub_commands = [] 362 | 363 | command =[ 364 | '#!/bin/bash', 365 | '#', 366 | '# Auto-generated by test-tube (https://github.com/williamFalcon/test-tube)', 367 | '#################\n' 368 | ] 369 | sub_commands.extend(command) 370 | 371 | # add job name 372 | job_with_version = '{}v{}'.format(self.job_display_name, exp_i) 373 | command = [ 374 | '# set a job name', 375 | '#SBATCH --job-name={}'.format(job_with_version), 376 | 
'#################\n', 377 | ] 378 | sub_commands.extend(command) 379 | 380 | # add out output 381 | if self.enable_log_out: 382 | out_path = os.path.join(self.out_log_path, '{}_slurm_output_%j.out'.format(timestamp)) 383 | command = [ 384 | '# a file for job output, you can check job progress', 385 | '#SBATCH --output={}'.format(out_path), 386 | '#################\n', 387 | ] 388 | sub_commands.extend(command) 389 | 390 | # add err output 391 | if self.enable_log_err: 392 | err_path = os.path.join(self.err_log_path, '{}_slurm_output_%j.err'.format(timestamp)) 393 | command = [ 394 | '# a file for errors', 395 | '#SBATCH --error={}'.format(err_path), 396 | '#################\n', 397 | ] 398 | sub_commands.extend(command) 399 | 400 | # add job time 401 | command = [ 402 | '# time needed for job', 403 | '#SBATCH --time={}'.format(self.job_time), 404 | '#################\n' 405 | ] 406 | sub_commands.extend(command) 407 | 408 | # add nb of gpus 409 | if self.per_experiment_nb_gpus > 0 and on_gpu: 410 | command = [ 411 | '# gpus per node', 412 | '#SBATCH --gres=gpu:{}'.format(self.per_experiment_nb_gpus), 413 | '#################\n' 414 | ] 415 | if self.gpu_type is not None: 416 | command = [ 417 | '# gpus per node', 418 | '#SBATCH --gres=gpu:{}:{}'.format(self.gpu_type, self.per_experiment_nb_gpus), 419 | '#################\n' 420 | ] 421 | sub_commands.extend(command) 422 | 423 | # add nb of cpus if not looking at a gpu job 424 | if self.per_experiment_nb_cpus > 0: 425 | command = [ 426 | '# cpus per job', 427 | '#SBATCH --cpus-per-task={}'.format(self.per_experiment_nb_cpus), 428 | '#################\n' 429 | ] 430 | sub_commands.extend(command) 431 | 432 | # pick nb nodes 433 | command = [ 434 | '# number of requested nodes', 435 | '#SBATCH --nodes={}'.format(self.per_experiment_nb_nodes), 436 | '#################\n' 437 | ] 438 | sub_commands.extend(command) 439 | 440 | # pick memory per node 441 | command = [ 442 | '# memory per node', 443 | '#SBATCH --mem={}'.format(self.memory_mb_per_node), 444 | '#################\n' 445 | ] 446 | sub_commands.extend(command) 447 | 448 | # add signal command to catch job termination 449 | command = [ 450 | '# slurm will send a signal this far out before it kills the job', 451 | f'#SBATCH --signal=USR1@{self.minutes_to_checkpoint_before_walltime * 60}', 452 | '#################\n' 453 | ] 454 | 455 | sub_commands.extend(command) 456 | 457 | # Subscribe to email if requested 458 | mail_type = [] 459 | if self.notify_on_end: 460 | mail_type.append('END') 461 | if self.notify_on_fail: 462 | mail_type.append('FAIL') 463 | if len(mail_type) > 0: 464 | mail_type_query = [ 465 | '# Have SLURM send you an email when the job ends or fails', 466 | '#SBATCH --mail-type={}'.format(','.join(mail_type)) 467 | ] 468 | sub_commands.extend(mail_type_query) 469 | 470 | email_query = [ 471 | '#SBATCH --mail-user={}'.format(self.email), 472 | ] 473 | sub_commands.extend(email_query) 474 | 475 | # add custom sbatch commands 476 | sub_commands.append('\n') 477 | for (cmd, value, comment) in self.slurm_commands: 478 | comment = '# {}'.format(comment) 479 | cmd = '#SBATCH --{}={}'.format(cmd, value) 480 | spaces = '#################\n' 481 | sub_commands.extend([comment, cmd, spaces]) 482 | 483 | # load modules 484 | sub_commands.append('\n') 485 | for module in self.modules: 486 | cmd = 'module load {}'.format(module) 487 | sub_commands.append(cmd) 488 | 489 | # remove spaces before the hash 490 | sub_commands = [x.lstrip() for x in sub_commands] 491 | 492 | # add 
additional commands 493 | for cmd in self.commands: 494 | sub_commands.append(cmd) 495 | sub_commands.append('\n') 496 | 497 | # add run command 498 | trial_args = self.__get_hopt_params(trial) 499 | trial_args = '{} --{} {} --{} {}'.format(trial_args, 500 | HyperOptArgumentParser.SLURM_CMD_PATH, 501 | slurm_cmd_script_path, 502 | HyperOptArgumentParser.SLURM_EXP_CMD, 503 | exp_i) 504 | 505 | cmd = 'srun {} {} {}'.format(self.python_cmd, self.script_name, trial_args) 506 | sub_commands.append(cmd) 507 | 508 | # build full command with empty lines in between 509 | full_command = '\n'.join(sub_commands) 510 | return full_command 511 | -------------------------------------------------------------------------------- /site/hpc/SlurmCluster/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | SlurmCluster class API - Test tube Documentation 12 | 13 | 14 | 15 | 16 | 17 | 18 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 |

SlurmCluster class API

145 |

[Github Code]

146 |

The SlurmCluster class enables hyperparameter search parallelization on a cluster managed via Slurm workload manager.

147 |

At a high level, the SlurmCluster creates a submit script for each permutation of hyperparameters requested. If a job hits the walltime but has not completed, the SlurmCluster will checkpoint the model and submit a new job to continue training using the saved weights.

148 | 152 |

You can instantiate a SlurmCluster via:

153 |
from test_tube.hpc import SlurmCluster
154 | 
155 | # hyperparameters is a test-tube hyper params object
156 | # see https://williamfalcon.github.io/test-tube/hyperparameter_optimization/HyperOptArgumentParser/
157 | hyperparams = args.parse()
158 | 
159 | # init cluster
160 | cluster = SlurmCluster(
161 |     hyperparam_optimizer=hyperparams,
162 |     log_path='/path/to/log/results/to',
163 |     python_cmd='python3'
164 | )
165 | 
166 | # let the cluster know where to email for a change in job status (ie: complete, fail, etc...)
167 | cluster.notify_job_status(email='some@email.com', on_done=True, on_fail=True)
168 | 
169 | # set the job options. In this instance, we'll run 20 different models
170 | # each with its own set of hyperparameters giving each one 1 GPU (ie: taking up 20 GPUs)
171 | cluster.per_experiment_nb_gpus = 1
172 | cluster.per_experiment_nb_nodes = 1
173 | 
174 | # we'll request 10GB of memory per node
175 | cluster.memory_mb_per_node = 10000
176 | 
177 | # set a walltime of 10 minutes
178 | cluster.job_time = '10:00'
179 | 
180 | # 1 minute before walltime is up, SlurmCluster will launch a continuation job and kill this job.
181 | # you must provide your own loading and saving function which the cluster object will call
182 | cluster.minutes_to_checkpoint_before_walltime = 1
183 | 
184 | # run the models on the cluster
185 | cluster.optimize_parallel_cluster_gpu(train, nb_trials=20, job_name='first_tt_batch', job_display_name='my_batch')
186 | 
187 | 188 |
189 |

init options

190 |

hyperparam_optimizer

191 |

A HyperOptArgumentParser object 192 | which contains all permutations of model hyperparameters to run.

193 |
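
For reference, a minimal sketch of how the hyperparameter object passed in here is typically built (the argument names and values below are illustrative, not part of the API):

from test_tube.argparse_hopt import HyperOptArgumentParser

# strategy can be 'grid_search' or 'random_search'
parser = HyperOptArgumentParser(strategy='grid_search')

# a fixed (non-tunable) argument
parser.add_argument('--data_path', default='/tmp/data', type=str)

# tunable arguments: sampled from a list or from a range
parser.opt_list('--batch_size', default=32, type=int, options=[16, 32, 64], tunable=True)
parser.opt_range('--learning_rate', default=0.001, type=float, low=0.0001, high=0.1, nb_samples=10, tunable=True)

hyperparams = parser.parse_args()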

log_path

194 |

Path to save the slurm scripts, error logs and out logs created. Usually this would be the experiments folder path where test tube saves Experiment information.

195 |

python_cmd

196 |

This is the command that starts the python program. Normally it is:

197 |
# python 2
198 | python main.py   
199 | 
200 | # python 3   
201 | python3 main.py
202 | 
203 | 204 |

enable_log_err

205 |

If true, saves slurm error logs to the path at log_path. If anything goes wrong in your job, you'll find the error here.

206 |

enable_log_out

207 |

If true, saves slurm output logs to the path at log_path. This file contains all outputs that would show up on the console normally.

208 |

test_tube_exp_name

209 |

When given, the slurm scripts and logs are organized to match the folder structure of the corresponding Experiment object's output.

210 |

Properties

211 |

job_time
212 | String. Walltime requested. Examples:

213 |
# 1 hour and 10 minutes    
214 | cluster.job_time = '1:10:00'
215 | 
216 | # 1 day and 1 hour and 10 minutes    
217 | cluster.job_time = '1-1:10:00'
218 | 
219 | # 1 day and 1 hour and 10 minutes    
220 | cluster.job_time = '25:10:00'   
221 | 
222 | # 10 minutes    
223 | cluster.job_time = '10:00'   
224 | 
225 | # 10 seconds    
226 | cluster.job_time = '10'   
227 | 
228 | 229 |

minutes_to_checkpoint_before_walltime
230 | Int. Minutes before walltime when a continuation job will be auto-submitted.

231 |
cluster.job_time = '10:00'   
232 | cluster.minutes_to_checkpoint_before_walltime = 2
233 | 
234 | # A new job will be submitted to continue training after the job has been running for 8 minutes.
235 | 
236 | 237 |
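
Under the hood this property maps to an sbatch signal directive in the generated script; a sketch of what gets generated (based on test_tube/hpc.py):

# with minutes_to_checkpoint_before_walltime = 2 the generated script contains:
# #SBATCH --signal=USR1@120
#
# Slurm then sends SIGUSR1 120 seconds before the walltime; when auto-resubmission
# is enabled, test-tube's signal handler calls your checkpoint save function and
# requeues the job (scontrol requeue) so training continues from the saved state.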

per_experiment_nb_gpus
238 | Int. Number of GPUs each job will get.

239 |
# EACH job will get 2 GPUs (ie: if a model runs over two GPUs at the same time).   
240 | cluster.per_experiment_nb_gpus = 2  
241 | 
242 | 243 |

per_experiment_nb_cpus
244 | Int. Number of CPUs each job will get.

245 |
cluster.per_experiment_nb_cpus = 1 
246 | 
247 | 248 |

per_experiment_nb_nodes
249 | Int. Number of nodes each job will get.

250 |
cluster.per_experiment_nb_nodes = 1 
251 | 
252 | 253 |

gpu_type
254 | String. Gpu type requested. Example:

255 |
cluster.gpu_type = '1080ti'   
256 | 
257 | 258 |
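
Together with per_experiment_nb_gpus, the GPU type is translated into the gres request of the generated script (based on test_tube/hpc.py). For example, with per_experiment_nb_gpus = 2 and gpu_type = '1080ti':

# the generated script will request gpus like this:
# #SBATCH --gres=gpu:1080ti:2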
259 |

Methods

260 |

set_checkpoint_save_function

261 |
cluster.set_checkpoint_save_function(fx, kwargs)    
262 | 
263 | 264 |

Called if the model isn't finished training minutes_to_checkpoint_before_walltime minutes before the walltime. If walltime = '15:00' and minutes_to_checkpoint_before_walltime = 1, the SlurmCluster will call your save function after 14 minutes of training.

265 |
    266 |
  • fx A python function.
  • 267 |
  • kwargs Dictionary where keys are the literal argument names to the function. Dictionary values are the values of the arguments.
  • 268 |
269 |

Example

270 |
def my_save_function(arg_1, arg_k):  
271 |     # ... save my model here    
272 | 
273 | cluster.set_checkpoint_save_function(my_save_function, kwargs={'arg_1': 'whatever', 'arg_k': 'you_want'})    
274 | 
275 | 
276 | 277 |

set_checkpoint_load_function

278 |
cluster.set_checkpoint_load_function(fx, kwargs)    
279 | 
280 | 281 |

Called internally when a job is auto-submitted by the SlurmCluster, giving your program a chance to load the model weights (or anything else it needs) to continue training.
282 | If the current job is a continuation of a checkpointed run, your load function is called immediately when you register it with this method.

283 |
    284 |
  • fx A python function.
  • 285 |
  • kwargs Dictionary where keys are the literal argument names to the function. Dictionary values are the values of the arguments.
  • 286 |
287 |

Example

288 |
def my_load_function(arg_1, arg_k):  
289 |     # ... restore my model here    
290 | 
291 | cluster.set_checkpoint_load_function(my_load_function, kwargs={'arg_1': 'whatever', 'arg_k': 'you_want'})    
292 | 
293 | 
294 | 295 |
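
Putting the two methods together, here is a minimal sketch of a matching save/load pair. The checkpoint path and the use of torch.save / torch.load are illustrative assumptions, not part of the test-tube API; model and cluster come from your train function:

import torch

CKPT_PATH = '/path/to/checkpoints/weights.ckpt'  # illustrative path

def save_checkpoint(model):
    # called by the SlurmCluster shortly before the walltime is reached
    torch.save(model.state_dict(), CKPT_PATH)

def load_checkpoint(model):
    # runs immediately when registered, but only on auto-resubmitted (continuation) jobs
    model.load_state_dict(torch.load(CKPT_PATH))

# inside your train function, after the model is built
cluster.set_checkpoint_save_function(save_checkpoint, kwargs={'model': model})
cluster.set_checkpoint_load_function(load_checkpoint, kwargs={'model': model})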

add_slurm_cmd

296 |
cluster.add_slurm_cmd(cmd, value, comment)
297 | 
298 | 299 |

Manually adds any Slurm command you need to the generated submit script. All available commands are listed in the Slurm sbatch documentation.

300 |
    301 |
  • cmd String with the bash command.
  • 302 |
  • value String value for the command. Numerical values must still be passed as strings, e.g. '1'.
  • 303 |
  • comment String with the command comment.
  • 304 |
305 |

Example

306 |
cluster.add_slurm_cmd(cmd='cpus-per-task', value='1', comment='nb cpus per task')
307 | 
308 | # the above command will add an entry like this to the slurm script   
309 | 
310 | # #nb cpus per task
311 | # #SBATCH --cpus-per-task=1
312 | # ############
313 | 
314 | 
315 | 316 |

add_command

317 |
cluster.add_command(cmd)    
318 | 
319 | 320 |

Adds arbitrary bash commands to the script. Use this to activate conda environments, install packages, or run anything else you would normally call from bash.

321 |
    322 |
  • cmd String with your bash command.
  • 323 |
324 |

Example

325 |
# load the anaconda package on the launch node   
326 | cluster.add_command('module load anaconda')   
327 | 
328 | # activate the environment on the launch node   
329 | cluster.add_command('source activate myCondaEnv')   
330 | 
331 | 332 |

load_modules

333 |
cluster.load_modules(modules)  
334 | 
335 | 336 |

Loads modules needed to run the job. Your Slurm documentation should have a list of available modules. You can also get those by running module avail.
337 | - modules Array of module names.

338 |

Example

339 |
cluster.load_modules([
340 |     'python-3',
341 |     'anaconda3'
342 | ])   
343 | 
344 | 345 |

notify_job_status

346 |
cluster.notify_job_status(email, on_done, on_fail)  
347 | 
348 | 349 |

Sets the email address Slurm will use to notify you about changes in the job's status (completion or failure).

350 |
    351 |
  • email String. Email address to get notifications.
  • 352 |
  • on_done Boolean. If true, you'll get an email when the job completes.
  • 353 |
  • on_fail Boolean. If true, you'll get an email if the job fails.
  • 354 |
355 |

Example

356 |
cluster.notify_job_status(email='some@email.com', on_done=True, on_fail=True)   
357 | 
358 | 359 |

optimize_parallel_cluster_gpu

360 |
cluster.optimize_parallel_cluster_gpu(train_function, nb_trials, job_name, job_display_name=None)  
361 | 
362 | 363 |

Launches the hyperparameter search across the cluster nodes.
364 | - train_function The entry point to start your training routine.
365 | - nb_trials Number of trials to launch. This is the number of hyperparameter configurations to train over.
366 | - job_name Folder name where the slurm scripts will save to. This should be the same as your Experiment name.
367 | - job_display_name Visible name when slurm lists running jobs (ie: through squeue -u user_name). This should be really short (if using a test tube Experiment, it'll put the experiment version at the end).

368 |

Example

369 |
def main(hparams, cluster):   
370 |     # do your own generic training code here... 
371 |     # init model
372 |     model = model_build(hparams)    
373 | 
374 |     # set the load and save fxs
375 |     cluster.set_checkpoint_save_function(fx, {})
376 |     cluster.set_checkpoint_load_function(fx, {})
377 | 
378 |     # train ...
379 | 
380 | 
381 | cluster.optimize_parallel_cluster_gpu(main, nb_trials=20, job_name='my_job', job_display_name='mj')    
382 | 
383 | 384 |

Now if you get the job information, you'll see this:

385 |
(conda_env) [user@node dir]$ squeue -u my_name
386 |              JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
387 |             104040       all  mjv0   my_name  R      58:22      1 nodeName
388 |             104041       all  mjv1   my_name  R      58:22      1 nodeName
389 |             104042       all  mjv2   my_name  R      58:22      1 nodeName
390 |             104043       all  mjv3   my_name  R      58:22      1 nodeName
391 | 
392 | 393 |

optimize_parallel_cluster_cpu

394 |
cluster.optimize_parallel_cluster_cpu(train_function, nb_trials, job_name, job_display_name=None)  
395 | 
396 | 397 |

Launches the hyperparameter search across the cluster nodes using cpus.
398 | - train_function The entry point to start your training routine.
399 | - nb_trials Number of trials to launch. This is the number of hyperparameter configurations to train over.
400 | - job_name Folder name where the slurm scripts will save to. This should be the same as your Experiment name.
401 | - job_display_name Visible name when slurm lists running jobs (ie: through squeue -u user_name). This should be really short (if using a test tube Experiment, it'll put the experiment version at the end).

402 |

Example

403 |
def main(hparams, cluster):   
404 |     # do your own generic training code here... 
405 |     # init model
406 |     model = model_build(hparams)    
407 | 
408 |     # set the load and save fxs
409 |     cluster.set_checkpoint_save_function(fx, {})
410 |     cluster.set_checkpoint_load_function(fx, {})
411 | 
412 |     # train ...
413 | 
414 | 
415 | cluster.optimize_parallel_cluster_cpu(main, nb_trials=20, job_name='my_job', job_display_name='mj')    
416 | 
417 | 418 |

Now if you get the job information, you'll see this:

419 |
(conda_env) [user@node dir]$ squeue -u my_name
420 |              JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
421 |             104040       all  mjv0   my_name  R      58:22      1 nodeName
422 |             104041       all  mjv1   my_name  R      58:22      1 nodeName
423 |             104042       all  mjv2   my_name  R      58:22      1 nodeName
424 |             104043       all  mjv3   my_name  R      58:22      1 nodeName
425 | 
426 | 427 |
428 |
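
For completeness, a minimal end-to-end launcher sketch tying the pieces above together (paths, hyperparameter names and trial counts are illustrative):

from test_tube.argparse_hopt import HyperOptArgumentParser
from test_tube.hpc import SlurmCluster

def train(hparams, cluster):
    # build the model from hparams, register the checkpoint save/load
    # functions on `cluster`, then run the training loop
    ...

if __name__ == '__main__':
    parser = HyperOptArgumentParser(strategy='random_search')
    parser.opt_list('--learning_rate', default=0.001, type=float,
                    options=[0.001, 0.0001], tunable=True)
    hyperparams = parser.parse_args()

    cluster = SlurmCluster(
        hyperparam_optimizer=hyperparams,
        log_path='/path/to/log/results/to',
        python_cmd='python3'
    )
    cluster.per_experiment_nb_gpus = 1
    cluster.job_time = '10:00'

    # each trial becomes its own sbatch job; this same script is re-run on the cluster
    cluster.optimize_parallel_cluster_gpu(train, nb_trials=4, job_name='first_tt_batch',
                                          job_display_name='tt')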
471 | 472 | 473 | 474 | 475 | 476 | 477 | -------------------------------------------------------------------------------- /test_tube/log.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import json 3 | import os 4 | import shutil 5 | from datetime import datetime 6 | 7 | import numpy as np 8 | import pandas as pd 9 | from imageio import imwrite 10 | from tensorboard.compat.proto.event_pb2 import Event 11 | from tensorboard.compat.proto.event_pb2 import SessionLog 12 | from torch.utils.tensorboard import SummaryWriter, FileWriter 13 | 14 | # constants 15 | _ROOT = os.path.abspath(os.path.dirname(__file__)) 16 | 17 | # ----------------------------- 18 | # Experiment object 19 | # ----------------------------- 20 | 21 | 22 | class DDPExperiment(object): 23 | def __init__( 24 | self, 25 | exp 26 | ): 27 | """ 28 | Used as meta_data storage if the experiment needs to be pickled 29 | :param name: 30 | :param debug: 31 | :param version: 32 | :param save_dir: 33 | :param autosave: 34 | :param description: 35 | :param create_git_tag: 36 | :param args: 37 | :param kwargs: 38 | """ 39 | 40 | self.tag_markdown_saved = exp.tag_markdown_saved 41 | self.no_save_dir = exp.no_save_dir 42 | self.metrics = exp.metrics 43 | self.tags = exp.tags 44 | self.name = exp.name 45 | self.debug = exp.debug 46 | self.version = exp.version 47 | self.autosave = exp.autosave 48 | self.description = exp.description 49 | self.create_git_tag = exp.create_git_tag 50 | self.exp_hash = exp.exp_hash 51 | self.created_at = exp.created_at 52 | self.save_dir = exp.save_dir 53 | 54 | 55 | def get_non_ddp_exp(self): 56 | return Experiment( 57 | name=self.name, 58 | debug=self.debug, 59 | version=self.version, 60 | save_dir=self.save_dir, 61 | autosave=self.autosave, 62 | description=self.description, 63 | create_git_tag=self.create_git_tag 64 | ) 65 | 66 | class Experiment(SummaryWriter): 67 | 68 | def __init__( 69 | self, 70 | save_dir=None, 71 | name='default', 72 | debug=False, 73 | version=None, 74 | autosave=False, 75 | description=None, 76 | create_git_tag=False, 77 | rank=0, 78 | *args, **kwargs 79 | ): 80 | """ 81 | A new Experiment object defaults to 'default' unless a specific name is provided 82 | If a known name is already provided, then the file version is changed 83 | :param name: 84 | :param debug: 85 | """ 86 | 87 | # change where the save dir is if requested 88 | 89 | if save_dir is not None: 90 | global _ROOT 91 | _ROOT = save_dir 92 | 93 | self.save_dir = save_dir 94 | self.tag_markdown_saved = False 95 | self.no_save_dir = save_dir is None 96 | self.metrics = [] 97 | self.tags = {} 98 | self.name = name 99 | self.debug = debug 100 | self.version = version 101 | self.autosave = autosave 102 | self.description = description 103 | self.create_git_tag = create_git_tag 104 | self.exp_hash = '{}_v{}'.format(self.name, version) 105 | self.created_at = str(datetime.utcnow()) 106 | self.rank = rank 107 | self.process = os.getpid() 108 | 109 | # when debugging don't do anything else 110 | if debug: 111 | return 112 | 113 | # update version hash if we need to increase version on our own 114 | # we will increase the previous version, so do it now so the hash 115 | # is accurate 116 | if version is None: 117 | old_version = self.__get_last_experiment_version() 118 | self.exp_hash = '{}_v{}'.format(self.name, old_version + 1) 119 | self.version = old_version + 1 120 | 121 | # create a new log file 122 | self.__init_cache_file_if_needed() 123 | 124 | # when we have a 
version, load it 125 | if self.version is not None: 126 | 127 | # when no version and no file, create it 128 | if not os.path.exists(self.__get_log_name()): 129 | self.__create_exp_file(self.version) 130 | else: 131 | # otherwise load it 132 | try: 133 | self.__load() 134 | except Exception as e: 135 | self.debug = True 136 | else: 137 | # if no version given, increase the version to a new exp 138 | # create the file if not exists 139 | old_version = self.__get_last_experiment_version() 140 | self.version = old_version 141 | self.__create_exp_file(self.version + 1) 142 | 143 | # create a git tag if requested 144 | if self.create_git_tag: 145 | desc = description if description is not None else 'no description' 146 | tag_msg = 'Test tube exp: {} - {}'.format(self.name, desc) 147 | cmd = 'git tag -a tt_{} -m "{}"'.format(self.exp_hash, tag_msg) 148 | os.system(cmd) 149 | print('Test tube created git tag:', 'tt_{}'.format(self.exp_hash)) 150 | 151 | # set the tensorboardx log path to the /tf folder in the exp folder 152 | log_dir = self.get_tensorboardx_path(self.name, self.version) 153 | # this is a fix for pytorch 1.1 since it does not have this attribute 154 | for attr, val in [('purge_step', None), 155 | ('max_queue', 10), 156 | ('flush_secs', 120), 157 | ('filename_suffix', '')]: 158 | if not hasattr(self, attr): 159 | setattr(self, attr, val) 160 | super().__init__(log_dir=log_dir, *args, **kwargs) 161 | 162 | # register on exit fx so we always close the writer 163 | # atexit.register(self.on_exit) 164 | 165 | def get_meta_copy(self): 166 | """ 167 | Gets a meta-version only copy of this module 168 | :return: 169 | """ 170 | return DDPExperiment(self) 171 | 172 | def on_exit(self): 173 | pass 174 | 175 | 176 | def __clean_dir(self): 177 | files = os.listdir(self.save_dir) 178 | 179 | if self.rank == 0: 180 | return 181 | 182 | for f in files: 183 | if str(self.process) in f: 184 | os.remove(os.path.join(self.save_dir, f)) 185 | 186 | def argparse(self, argparser): 187 | parsed = vars(argparser) 188 | to_add = {} 189 | 190 | # don't store methods 191 | for k, v in parsed.items(): 192 | if not callable(v): 193 | to_add[k] = v 194 | 195 | self.tag(to_add) 196 | 197 | def add_meta_from_hyperopt(self, hypo): 198 | """ 199 | Transfers meta data about all the params from the 200 | hyperoptimizer to the log 201 | :param hypo: 202 | :return: 203 | """ 204 | meta = hypo.get_current_trial_meta() 205 | for tag in meta: 206 | self.tag(tag) 207 | 208 | # -------------------------------- 209 | # FILE IO UTILS 210 | # -------------------------------- 211 | def __init_cache_file_if_needed(self): 212 | """ 213 | Inits a file that we log historical experiments 214 | :return: 215 | """ 216 | try: 217 | exp_cache_file = self.get_data_path(self.name, self.version) 218 | if not os.path.isdir(exp_cache_file): 219 | os.makedirs(exp_cache_file, exist_ok=True) 220 | except Exception as e: 221 | # file already exists (likely written by another exp. 
In this case disable the experiment 222 | self.debug = True 223 | 224 | def __create_exp_file(self, version): 225 | """ 226 | Recreates the old file with this exp and version 227 | :param version: 228 | :return: 229 | """ 230 | 231 | try: 232 | exp_cache_file = self.get_data_path(self.name, self.version) 233 | # if no exp, then make it 234 | path = '{}/meta.experiment'.format(exp_cache_file) 235 | open(path, 'w').close() 236 | self.version = version 237 | 238 | # make the directory for the experiment media assets name 239 | os.makedirs(self.get_media_path(self.name, self.version), exist_ok=True) 240 | 241 | # make the directory for tensorboardx stuff 242 | os.makedirs(self.get_tensorboardx_path(self.name, self.version), exist_ok=True) 243 | except Exception as e: 244 | # file already exists (likely written by another exp. In this case disable the experiment 245 | self.debug = True 246 | 247 | 248 | def __get_last_experiment_version(self): 249 | try: 250 | exp_cache_file = os.sep.join(self.get_data_path(self.name, self.version).split(os.sep)[:-1]) 251 | return find_last_experiment_version(exp_cache_file) 252 | except Exception as e: 253 | return -1 254 | 255 | def __get_log_name(self): 256 | exp_cache_file = self.get_data_path(self.name, self.version) 257 | return '{}/meta.experiment'.format(exp_cache_file) 258 | 259 | def tag(self, tag_dict): 260 | """ 261 | Adds a tag to the experiment. 262 | Tags are metadata for the exp. 263 | 264 | >> e.tag({"model": "Convnet A"}) 265 | 266 | :param key: 267 | :param val: 268 | :return: 269 | """ 270 | if self.debug or self.rank > 0: return 271 | 272 | # parse tags 273 | for k, v in tag_dict.items(): 274 | self.tags[k] = v 275 | 276 | # save if needed 277 | if self.autosave == True: 278 | self.save() 279 | 280 | def log(self, metrics_dict, global_step=None, walltime=None): 281 | """ 282 | Adds a json dict of metrics. 
283 | 284 | >> e.log({"loss": 23, "coeff_a": 0.2}) 285 | 286 | :param metrics_dict: 287 | :tag optional tfx tag 288 | :return: 289 | """ 290 | if self.debug or self.rank > 0: return 291 | 292 | # handle tfx metrics 293 | if global_step is None: 294 | global_step = len(self.metrics) 295 | 296 | new_metrics_dict = metrics_dict.copy() 297 | for k, v in metrics_dict.items(): 298 | if isinstance(v, dict): 299 | self.add_scalars(main_tag=k, tag_scalar_dict=v, global_step=global_step, walltime=walltime) 300 | tmp_metrics_dict = new_metrics_dict.pop(k) 301 | new_metrics_dict.update(tmp_metrics_dict) 302 | else: 303 | self.add_scalar(tag=k, scalar_value=v, global_step=global_step, walltime=walltime) 304 | 305 | metrics_dict = new_metrics_dict 306 | 307 | # timestamp 308 | if 'created_at' not in metrics_dict: 309 | metrics_dict['created_at'] = str(datetime.utcnow()) 310 | 311 | self.__convert_numpy_types(metrics_dict) 312 | 313 | self.metrics.append(metrics_dict) 314 | 315 | if self.autosave: 316 | self.save() 317 | 318 | def __convert_numpy_types(self, metrics_dict): 319 | for k, v in metrics_dict.items(): 320 | if v.__class__.__name__ == 'float32': 321 | metrics_dict[k] = float(v) 322 | 323 | if v.__class__.__name__ == 'float64': 324 | metrics_dict[k] = float(v) 325 | 326 | def save(self): 327 | """ 328 | Saves current experiment progress 329 | :return: 330 | """ 331 | if self.debug or self.rank > 0: return 332 | 333 | # save images and replace the image array with the 334 | # file name 335 | self.__save_images(self.metrics) 336 | metrics_file_path = self.get_data_path(self.name, self.version) + '/metrics.csv' 337 | meta_tags_path = self.get_data_path(self.name, self.version) + '/meta_tags.csv' 338 | 339 | obj = { 340 | 'name': self.name, 341 | 'version': self.version, 342 | 'tags_path': meta_tags_path, 343 | 'metrics_path': metrics_file_path, 344 | 'autosave': self.autosave, 345 | 'description': self.description, 346 | 'created_at': self.created_at, 347 | 'exp_hash': self.exp_hash 348 | } 349 | 350 | # save the experiment meta file 351 | with atomic_write(self.__get_log_name()) as tmp_path: 352 | with open(tmp_path, 'w') as file: 353 | json.dump(obj, file, ensure_ascii=False) 354 | 355 | # save the metatags file 356 | df = pd.DataFrame({'key': list(self.tags.keys()), 'value': list(self.tags.values())}) 357 | with atomic_write(meta_tags_path) as tmp_path: 358 | df.to_csv(tmp_path, index=False) 359 | 360 | # save the metrics data 361 | df = pd.DataFrame(self.metrics) 362 | with atomic_write(metrics_file_path) as tmp_path: 363 | df.to_csv(tmp_path, index=False) 364 | 365 | # write new vals to disk 366 | self.flush() 367 | 368 | # until hparam plugin is fixed, generate hparams as text 369 | if not self.tag_markdown_saved and len(self.tags) > 0: 370 | self.tag_markdown_saved = True 371 | self.add_text('hparams', self.__generate_tfx_meta_log()) 372 | 373 | def __generate_tfx_meta_log(self): 374 | header = f'''###### {self.name}, version {self.version}\n---\n''' 375 | desc = '' 376 | if self.description is not None: 377 | desc = f'''#####*{self.description}*\n''' 378 | params = f'''##### Hyperparameters\n''' 379 | 380 | row_header = '''parameter|value\n-|-\n''' 381 | rows = [row_header] 382 | for k, v in self.tags.items(): 383 | row = f'''{k}|{v}\n''' 384 | rows.append(row) 385 | 386 | all_rows = [ 387 | header, 388 | desc, 389 | params 390 | ] 391 | all_rows.extend(rows) 392 | mkdown_log = ''.join(all_rows) 393 | return mkdown_log 394 | 395 | def __save_images(self, metrics): 396 | """ 397 | Save tags 
that have a png_ prefix (as images) 398 | and replace the meta tag with the file name 399 | :param metrics: 400 | :return: 401 | """ 402 | # iterate all metrics and find keys with a specific prefix 403 | for i, metric in enumerate(metrics): 404 | for k, v in metric.items(): 405 | # if the prefix is a png, save the image and replace the value with the path 406 | img_extension = None 407 | img_extension = 'png' if 'png_' in k else img_extension 408 | img_extension = 'jpg' if 'jpg' in k else img_extension 409 | img_extension = 'jpeg' if 'jpeg' in k else img_extension 410 | 411 | if img_extension is not None: 412 | # determine the file name 413 | img_name = '_'.join(k.split('_')[1:]) 414 | save_path = self.get_media_path(self.name, self.version) 415 | save_path = '{}/{}_{}.{}'.format(save_path, img_name, i, img_extension) 416 | 417 | # save image to disk 418 | if type(metric[k]) is not str: 419 | imwrite(save_path, metric[k]) 420 | 421 | # replace the image in the metric with the file path 422 | metric[k] = save_path 423 | 424 | def __load(self): 425 | # load .experiment file 426 | with open(self.__get_log_name(), 'r') as file: 427 | data = json.load(file) 428 | self.name = data['name'] 429 | self.version = data['version'] 430 | self.autosave = data['autosave'] 431 | self.created_at = data['created_at'] 432 | self.description = data['description'] 433 | self.exp_hash = data['exp_hash'] 434 | 435 | # load .tags file 436 | meta_tags_path = self.get_data_path(self.name, self.version) + '/meta_tags.csv' 437 | df = pd.read_csv(meta_tags_path) 438 | self.tags_list = df.to_dict(orient='records') 439 | self.tags = {} 440 | for d in self.tags_list: 441 | k, v = d['key'], d['value'] 442 | self.tags[k] = v 443 | 444 | # load metrics 445 | metrics_file_path = self.get_data_path(self.name, self.version) + '/metrics.csv' 446 | try: 447 | df = pd.read_csv(metrics_file_path) 448 | self.metrics = df.to_dict(orient='records') 449 | 450 | # remove nans 451 | for metric in self.metrics: 452 | to_delete = [] 453 | for k, v in metric.items(): 454 | try: 455 | if np.isnan(v): 456 | to_delete.append(k) 457 | except Exception as e: 458 | pass 459 | 460 | for k in to_delete: 461 | del metric[k] 462 | except Exception as e: 463 | # metrics was empty... 
464 | self.metrics = [] 465 | 466 | def get_data_path(self, exp_name, exp_version): 467 | """ 468 | Returns the path to the local package cache 469 | :param path: 470 | :return: 471 | """ 472 | if self.no_save_dir: 473 | return os.path.join(_ROOT, 'test_tube_data', exp_name, 'version_{}'.format(exp_version)) 474 | else: 475 | return os.path.join(_ROOT, exp_name, 'version_{}'.format(exp_version)) 476 | 477 | def get_media_path(self, exp_name, exp_version): 478 | """ 479 | Returns the path to the local package cache 480 | :param path: 481 | :return: 482 | """ 483 | return os.path.join(self.get_data_path(exp_name, exp_version), 'media') 484 | 485 | def get_tensorboardx_path(self, exp_name, exp_version): 486 | """ 487 | Returns the path to the local package cache 488 | :param path: 489 | :return: 490 | """ 491 | return os.path.join(self.get_data_path(exp_name, exp_version), 'tf') 492 | 493 | def get_tensorboardx_scalars_path(self, exp_name, exp_version): 494 | """ 495 | Returns the path to the local package cache 496 | :param path: 497 | :return: 498 | """ 499 | tfx_path = self.get_tensorboardx_path(exp_name, exp_version) 500 | return os.path.join(tfx_path, 'scalars.json') 501 | 502 | 503 | # ---------------------------- 504 | # OVERWRITES 505 | # ---------------------------- 506 | def _get_file_writer(self): 507 | """Returns the default FileWriter instance. Recreates it if closed.""" 508 | if self.rank > 0: 509 | return TTDummyFileWriter() 510 | 511 | if self.all_writers is None or self.file_writer is None: 512 | if self.purge_step is not None: 513 | most_recent_step = self.purge_step 514 | self.file_writer = FileWriter(self.log_dir, self.max_queue, 515 | self.flush_secs, self.filename_suffix) 516 | self.file_writer.debug = self.debug 517 | self.file_writer.rank = self.rank 518 | 519 | self.file_writer.add_event( 520 | Event(step=most_recent_step, file_version='brain.Event:2')) 521 | self.file_writer.add_event( 522 | Event(step=most_recent_step, session_log=SessionLog(status=SessionLog.START))) 523 | else: 524 | self.file_writer = FileWriter(self.log_dir, self.max_queue, 525 | self.flush_secs, self.filename_suffix) 526 | self.all_writers = {self.file_writer.get_logdir(): self.file_writer} 527 | return self.file_writer 528 | 529 | 530 | def __str__(self): 531 | return 'Exp: {}, v: {}'.format(self.name, self.version) 532 | 533 | def __hash__(self): 534 | return 'Exp: {}, v: {}'.format(self.name, self.version) 535 | 536 | def flush(self): 537 | if self.rank > 0: 538 | return 539 | 540 | if self.all_writers is None: 541 | return # ignore double close 542 | 543 | for writer in self.all_writers.values(): 544 | writer.flush() 545 | 546 | 547 | class TTDummyFileWriter(object): 548 | 549 | def add_summary(self, summary, global_step=None, walltime=None): 550 | """ 551 | Overwrite tf add summary so we can ignore when other non-zero processes call it 552 | Avoids overwriting logs from multiple processes 553 | :param summary: 554 | :param global_step: 555 | :param walltime: 556 | :return: 557 | """ 558 | return 559 | 560 | 561 | @contextlib.contextmanager 562 | def atomic_write(dst_path): 563 | """A context manager to simplify atomic writing. 564 | 565 | Usage: 566 | >>> with atomic_write(dst_path) as tmp_path: 567 | >>> # write to tmp_path 568 | >>> # Here tmp_path renamed to dst_path, if no exception happened. 
569 | """ 570 | tmp_path = str(dst_path) + '.tmp' 571 | try: 572 | yield tmp_path 573 | except: 574 | if os.path.exists(tmp_path): 575 | os.remove(tmp_path) 576 | raise 577 | else: 578 | # If everything is fine, move tmp file to the destination. 579 | shutil.move(tmp_path, str(dst_path)) 580 | 581 | 582 | def find_last_experiment_version(path): 583 | last_version = -1 584 | for f in os.listdir(path): 585 | if 'version_' in f: 586 | file_parts = f.split('_') 587 | version = int(file_parts[-1]) 588 | last_version = max(last_version, version) 589 | return last_version 590 | 591 | 592 | if __name__ == '__main__': 593 | from time import sleep 594 | e = Experiment(description='my description') 595 | e.tag({'lr': 0.02, 'layers': 4}) 596 | 597 | for n_iter in range(20): 598 | sleep(0.3) 599 | e.log({'loss/xsinx': n_iter * np.sin(n_iter)}) 600 | if n_iter % 10 == 0: 601 | print('saved') 602 | e.save() 603 | 604 | e.close() 605 | os._exit(1) 606 | 607 | --------------------------------------------------------------------------------