├── examples
│   ├── __init__.py
│   ├── saved_logs
│   │   └── example_test_tube_data
│   │       ├── demo_test_0
│   │       │   ├── version_0
│   │       │   │   ├── meta_tags.json
│   │       │   │   ├── media
│   │       │   │   │   └── jpg_0.jpg
│   │       │   │   └── metrics.csv
│   │       │   └── version_1
│   │       │       ├── meta_tags.json
│   │       │       ├── media
│   │       │       │   └── jpg_0.jpg
│   │       │       └── metrics.csv
│   │       └── demo_test_1
│   │           ├── version_0
│   │           │   ├── meta_tags.json
│   │           │   ├── media
│   │           │   │   └── jpg_0.jpg
│   │           │   └── metrics.csv
│   │           └── version_1
│   │               ├── meta_tags.json
│   │               ├── media
│   │               │   └── jpg_0.jpg
│   │               └── metrics.csv
│   ├── tensorflow_example.py
│   ├── pytorch_hpc_example.py
│   └── hpc_cpu_example.py
├── test_tube
│   ├── hyper_opt_utils
│   │   ├── __init__.py
│   │   └── strategies.py
│   ├── .DS_Store
│   ├── __init__.py
│   ├── hyperopt.py
│   ├── argparse_hopt.py
│   ├── hpc.py
│   └── log.py
├── .DS_Store
├── imgs
│   ├── viz_a.png
│   └── test_tube_logo.png
├── docs
│   ├── img
│   │   └── viz_a.png
│   ├── index.md
│   ├── experiment_tracking
│   │   └── experiment.md
│   ├── hyperparameter_optimization
│   │   └── HyperOptArgumentParser.md
│   └── hpc
│       └── SlurmCluster.md
├── site
│   ├── img
│   │   ├── viz_a.png
│   │   └── favicon.ico
│   ├── sitemap.xml.gz
│   ├── fonts
│   │   ├── fontawesome-webfont.eot
│   │   ├── fontawesome-webfont.ttf
│   │   └── fontawesome-webfont.woff
│   ├── sitemap.xml
│   ├── search
│   │   ├── main.js
│   │   └── worker.js
│   ├── js
│   │   ├── theme.js
│   │   └── modernizr-2.8.3.min.js
│   ├── css
│   │   └── theme_extra.css
│   ├── 404.html
│   ├── search.html
│   ├── index.html
│   ├── experiment_tracking
│   │   └── experiment
│   │       └── index.html
│   ├── hyperparameter_optimization
│   │   └── HyperOptArgumentParser
│   │       └── index.html
│   └── hpc
│       └── SlurmCluster
│           └── index.html
├── requirements.txt
├── tests
│   ├── log_test.py
│   ├── argparse_hopt_test.py
│   ├── hpc_test.py
│   └── strategies_test.py
├── update.sh
├── mkdocs.yml
├── setup.cfg
├── .travis.yml
├── LICENSE
├── setup.py
├── .gitignore
└── README.md
/examples/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test_tube/hyper_opt_utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/.DS_Store
--------------------------------------------------------------------------------
/imgs/viz_a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/imgs/viz_a.png
--------------------------------------------------------------------------------
/docs/img/viz_a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/docs/img/viz_a.png
--------------------------------------------------------------------------------
/site/img/viz_a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/site/img/viz_a.png
--------------------------------------------------------------------------------
/site/sitemap.xml.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/site/sitemap.xml.gz
--------------------------------------------------------------------------------
/test_tube/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/test_tube/.DS_Store
--------------------------------------------------------------------------------
/site/img/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/site/img/favicon.ico
--------------------------------------------------------------------------------
/examples/saved_logs/example_test_tube_data/demo_test_0/version_0/meta_tags.json:
--------------------------------------------------------------------------------
1 | {"tag_b": "s", "tag_a": 2}
--------------------------------------------------------------------------------
/examples/saved_logs/example_test_tube_data/demo_test_0/version_1/meta_tags.json:
--------------------------------------------------------------------------------
1 | {"tag_a": 2, "tag_b": "s"}
--------------------------------------------------------------------------------
/examples/saved_logs/example_test_tube_data/demo_test_1/version_0/meta_tags.json:
--------------------------------------------------------------------------------
1 | {"tag_b": "s", "tag_a": 2}
--------------------------------------------------------------------------------
/examples/saved_logs/example_test_tube_data/demo_test_1/version_1/meta_tags.json:
--------------------------------------------------------------------------------
1 | {"tag_a": 2, "tag_b": "s"}
--------------------------------------------------------------------------------
/imgs/test_tube_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/imgs/test_tube_logo.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas>=0.20.3
2 | numpy>=1.13.3
3 | imageio>=2.3.0
4 | tensorboard>=1.15.0
5 | torch>=1.1.0
6 | future
--------------------------------------------------------------------------------
/site/fonts/fontawesome-webfont.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/site/fonts/fontawesome-webfont.eot
--------------------------------------------------------------------------------
/site/fonts/fontawesome-webfont.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/site/fonts/fontawesome-webfont.ttf
--------------------------------------------------------------------------------
/site/fonts/fontawesome-webfont.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/site/fonts/fontawesome-webfont.woff
--------------------------------------------------------------------------------
/tests/log_test.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 |
4 | def test_hello():
5 | assert 4==4
6 |
7 | if __name__ == '__main__':
8 | pytest.main([__file__])
9 |
--------------------------------------------------------------------------------
/tests/argparse_hopt_test.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 |
4 | def test_hello():
5 | assert 4==4
6 |
7 | if __name__ == '__main__':
8 | pytest.main([__file__])
9 |
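10 |
11 | # Sketch of a behavioural check (not in the original placeholder suite), based on
12 | # the opt_list / trials usage documented in docs/hyperparameter_optimization; it is
13 | # appended here as an illustration rather than a definitive test.
14 | def test_opt_list_trials():
15 |     from test_tube import HyperOptArgumentParser
16 |
17 |     parser = HyperOptArgumentParser(strategy='random_search')
18 |     parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8])
19 |     hparams = parser.parse_args()
20 |     assert hparams.nb_layers == 2
21 |     for trial in hparams.trials(2):
22 |         assert trial.nb_layers in [2, 4, 8]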
--------------------------------------------------------------------------------
/examples/saved_logs/example_test_tube_data/demo_test_0/version_0/media/jpg_0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/examples/saved_logs/example_test_tube_data/demo_test_0/version_0/media/jpg_0.jpg
--------------------------------------------------------------------------------
/examples/saved_logs/example_test_tube_data/demo_test_0/version_1/media/jpg_0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/examples/saved_logs/example_test_tube_data/demo_test_0/version_1/media/jpg_0.jpg
--------------------------------------------------------------------------------
/examples/saved_logs/example_test_tube_data/demo_test_1/version_0/media/jpg_0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/examples/saved_logs/example_test_tube_data/demo_test_1/version_0/media/jpg_0.jpg
--------------------------------------------------------------------------------
/examples/saved_logs/example_test_tube_data/demo_test_1/version_1/media/jpg_0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/williamFalcon/test-tube/HEAD/examples/saved_logs/example_test_tube_data/demo_test_1/version_1/media/jpg_0.jpg
--------------------------------------------------------------------------------
/test_tube/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Experiment logger module
3 | """
4 |
5 | from .argparse_hopt import HyperOptArgumentParser
6 | from .hpc import SlurmCluster
7 | from .hyperopt import HyperParamOptimizer
8 | from .log import Experiment
9 |
--------------------------------------------------------------------------------
/examples/saved_logs/example_test_tube_data/demo_test_0/version_0/metrics.csv:
--------------------------------------------------------------------------------
1 | created_at,fake_jpg,row_3,test
2 | 2017-10-13 02:07:28.005016,/Users/waf/test_tube_data/demo_test_0/version_0/media/jpg_0.jpg,,2
3 | 2017-10-13 02:07:28.005031,,3,2
4 |
--------------------------------------------------------------------------------
/examples/saved_logs/example_test_tube_data/demo_test_0/version_1/metrics.csv:
--------------------------------------------------------------------------------
1 | created_at,fake_jpg,row_3,test
2 | 2017-10-13 02:07:37.395603,/Users/waf/Developer/log_suite/test_tube/test_tube/test_tube_data/demo_test_0/version_1/media/jpg_0.jpg,,2
3 | 2017-10-13 02:07:37.395635,,3,2
4 |
--------------------------------------------------------------------------------
/examples/saved_logs/example_test_tube_data/demo_test_1/version_0/metrics.csv:
--------------------------------------------------------------------------------
1 | created_at,fake_jpg,row_3,test
2 | 2017-10-13 02:07:28.035057,/Users/waf/Developer/log_suite/test_tube/test_tube/test_tube_data/demo_test_1/version_0/media/jpg_0.jpg,,2
3 | 2017-10-13 02:07:28.035086,,3,2
4 |
--------------------------------------------------------------------------------
/examples/saved_logs/example_test_tube_data/demo_test_1/version_1/metrics.csv:
--------------------------------------------------------------------------------
1 | created_at,fake_jpg,row_3,test
2 | 2017-10-13 02:07:37.443175,/Users/waf/Developer/log_suite/test_tube/test_tube/test_tube_data/demo_test_1/version_1/media/jpg_0.jpg,,2
3 | 2017-10-13 02:07:37.443252,,3,2
4 |
--------------------------------------------------------------------------------
/update.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | version=$1
4 |
5 | git commit -am "release v$version"
6 | git tag $version -m "test_tube v$version"
7 | git push --tags origin master
8 |
9 | # push to pypi
10 | rm -rf ./dist/*
11 | python3 setup.py sdist
12 | twine upload dist/*
13 |
14 |
15 |
16 | # to update docs
17 | # cd to root dir
18 | # mkdocs gh-deploy
19 |
20 |
21 |
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: Test tube Documentation
2 | theme: readthedocs
3 | docs_dir: docs
4 | repo_url: https://github.com/williamFalcon/test_tube
5 | site_dir: 'site'
6 | site_description: 'Documentation for Test Tube, the Python Deep Learning and Machine Learning experiment tracking and tuning framework.'
7 |
8 | dev_addr: '0.0.0.0:8000'
9 | #google_analytics: ['UA-aasd', 'sitename']
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 |
4 | [yapf]
5 | align_closing_bracket_with_visual_indent = True
6 | # Put braces on their own line.
7 | dedent_closing_brackets = True
8 | split_before_closing_bracket = True
9 | indent_width = 4
10 | coalesce_brackets = True
11 | allow_multiline_lambdas = True
12 | join_multiple_lines = True
13 | spaces_around_power_operator = False
14 | column_limit = 100
15 |
--------------------------------------------------------------------------------
/site/sitemap.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
3 |  <url>
4 |   <loc>None</loc>
5 |   <lastmod>2019-08-03</lastmod>
6 |   <changefreq>daily</changefreq>
7 |  </url>
8 |  <url>
9 |   <loc>None</loc>
10 |   <lastmod>2019-08-03</lastmod>
11 |   <changefreq>daily</changefreq>
12 |  </url>
13 |  <url>
14 |   <loc>None</loc>
15 |   <lastmod>2019-08-03</lastmod>
16 |   <changefreq>daily</changefreq>
17 |  </url>
18 |  <url>
19 |   <loc>None</loc>
20 |   <lastmod>2019-08-03</lastmod>
21 |   <changefreq>daily</changefreq>
22 |  </url>
23 | </urlset>
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | # command to install dependencies
3 | cache: pip
4 |
5 | matrix:
6 | include:
7 | - python: 3.6
8 | dist: xenial # Ubuntu 16.04
9 | env:
10 | - MIN_REQUIREMENTS=1
11 | - python: 3.6
12 | dist: bionic # Ubuntu 18.04
13 | - python: 3.7
14 | dist: bionic # Ubuntu 18.04
15 |
16 | install:
17 | - if [[ "${MIN_REQUIREMENTS}" == "1" ]]; then
18 | python -c "req = open('requirements.txt').read().replace('>', '=') ; open('requirements-ci.txt', 'w').write(req)" ;
19 | pip install -r requirements-ci.txt ;
20 | fi
21 | - pip install -e .
22 |
23 |
24 | # command to run tests
25 | script:
26 | - pytest # or py.test for Python versions 3.5 and below
27 |
28 | notifications:
29 | email: false
30 |
--------------------------------------------------------------------------------
/tests/hpc_test.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from test_tube.argparse_hopt import HyperOptArgumentParser
4 | from test_tube.hpc import SlurmCluster
5 |
6 |
7 | def test_slurm_time_to_seconds():
8 | parser = HyperOptArgumentParser()
9 | parsed = parser.parse_args()
10 | cluster = SlurmCluster(log_path='/home/travis', hyperparam_optimizer=parsed)
11 |
12 | assert cluster.slurm_time_to_seconds('15:00') == 900
13 | assert cluster.slurm_time_to_seconds('1-12:20:12') == 130812
14 | assert cluster.slurm_time_to_seconds('1:20:12') == 4812
15 | assert cluster.slurm_time_to_seconds('00:20:12') == 1212
16 | assert cluster.slurm_time_to_seconds('00:00:12') == 12
17 | assert cluster.slurm_time_to_seconds('12') == 12
18 |
19 |
20 | if __name__ == '__main__':
21 | pytest.main([__file__])
22 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License
2 |
3 | Copyright (c) 2017-2018 William Falcon
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os
3 | from setuptools import find_packages, setup
4 |
5 | version = '0.7.5'
6 | PATH_ROOT = os.path.dirname(__file__)
7 |
8 |
9 | def load_requirements(path_dir=PATH_ROOT, comment_char='#'):
10 | with open(os.path.join(path_dir, 'requirements.txt'), 'r') as file:
11 | lines = [ln.strip() for ln in file.readlines()]
12 | reqs = []
13 | for ln in lines:
14 | # filer all comments
15 | if comment_char in ln:
16 | ln = ln[:ln.index(comment_char)]
17 | if ln: # if requirement is not empty
18 | reqs.append(ln)
19 | return reqs
20 |
21 |
22 | setup(
23 | name='test_tube',
24 | packages=find_packages(),
25 | version=version,
26 | description='Experiment logger and visualizer',
27 | author='William Falcon',
28 | install_requires=load_requirements(PATH_ROOT),
29 | author_email='will@hacstudios.com',
30 | url='https://github.com/williamFalcon/test_tube',
31 | download_url='https://github.com/williamFalcon/test_tube/archive/{}.tar.gz'.format(version),
32 | keywords=[
33 | 'testing',
34 | 'machine learning',
35 | 'deep learning',
36 | 'prototyping',
37 | 'experimenting',
38 | 'modeling',
39 | ],
40 | )
41 |
--------------------------------------------------------------------------------
/tests/strategies_test.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from test_tube.hyper_opt_utils import strategies
4 |
5 | GRID_SEARCH = 'grid_search'
6 | RANDOM_SEARCH = 'random_search'
7 |
8 | FLAT_PARAMS = [
9 | [
10 | {'idx': 0, 'val': 0.0001, 'name': 'learning_rate'},
11 | {'idx': 1, 'val': 0.001, 'name': 'learning_rate'},
12 | {'idx': 2, 'val': 0.01, 'name': 'learning_rate'},
13 | {'idx': 3, 'val': 0.1, 'name': 'learning_rate'}
14 | ],
15 | [
16 | {'idx': 4, 'val': 0.99, 'name': 'decay'},
17 | {'idx': 5, 'val': 0.999, 'name': 'decay'},
18 | ]
19 | ]
20 | def test_unknown_strategy():
21 | with pytest.raises(ValueError):
22 | strategies.generate_trials(
23 | 'unknown_strategy', FLAT_PARAMS, nb_trials=None)
24 |
25 | def test_grid_search_no_limit():
26 | trials = strategies.generate_trials(
27 | GRID_SEARCH, FLAT_PARAMS, nb_trials=None)
28 | assert len(trials) == len(FLAT_PARAMS[0]) * len(FLAT_PARAMS[1])
29 |
30 | def test_grid_search_limit():
31 | trials = strategies.generate_trials(
32 | GRID_SEARCH, FLAT_PARAMS, nb_trials=5)
33 | assert len(trials) == 5
34 |
35 |
36 | def test_random_search():
37 | trials = strategies.generate_trials(
38 | RANDOM_SEARCH, FLAT_PARAMS, nb_trials=5)
39 | assert len(trials) == 5
40 |
41 | def test_random_search_unbounded_error():
42 | with pytest.raises(TypeError):
43 | trials = strategies.generate_trials(
44 | RANDOM_SEARCH, FLAT_PARAMS, nb_trials=None)
45 |
46 |
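47 | # A sketch of an extra check (not in the original suite): random search cannot
48 | # return more unique trials than the parameter space allows (4 * 2 = 8 here).
49 | def test_random_search_capped_at_space_size():
50 |     trials = strategies.generate_trials(
51 |         RANDOM_SEARCH, FLAT_PARAMS, nb_trials=100)
52 |     assert len(trials) == len(FLAT_PARAMS[0]) * len(FLAT_PARAMS[1])
53 |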
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | data/
6 | test_tube_data/
7 | *.experiment
8 | test.py
9 | example.json
10 | .pytest_cache/
11 | talk/
12 | .DS_Store
13 |
14 | # C extensions
15 | *.so
16 |
17 | src
18 |
19 | # Distribution / packaging
20 | .Python
21 | env/
22 | build/
23 | develop-eggs/
24 | dist/
25 | downloads/
26 | eggs/
27 | .eggs/
28 | lib/
29 | lib64/
30 | parts/
31 | sdist/
32 | var/
33 | *.egg-info/
34 | .installed.cfg
35 | *.egg
36 |
37 | # PyInstaller
38 | # Usually these files are written by a python script from a template
39 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
40 | *.manifest
41 | *.spec
42 |
43 | # Installer logs
44 | pip-log.txt
45 | pip-delete-this-directory.txt
46 |
47 | # Unit test / coverage reports
48 | htmlcov/
49 | .tox/
50 | .coverage
51 | .coverage.*
52 | .cache
53 | nosetests.xml
54 | coverage.xml
55 | *,cover
56 | .hypothesis/
57 |
58 | .idea
59 |
60 | # Translations
61 | *.mo
62 | *.pot
63 |
64 | # Django stuff:
65 | *.log
66 | local_settings.py
67 |
68 | # Flask stuff:
69 | instance/
70 | .webassets-cache
71 |
72 | # Scrapy stuff:
73 | .scrapy
74 |
75 | # Sphinx documentation
76 | docs/_build/
77 |
78 | # PyBuilder
79 | target/
80 |
81 | # IPython Notebook
82 | .ipynb_checkpoints
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # celery beat schedule file
88 | celerybeat-schedule
89 |
90 | # dotenv
91 | .env
92 |
93 | # virtualenv
94 | venv/
95 | ENV/
96 |
97 | # Spyder project settings
98 | .spyderproject
99 |
100 | # Rope project settings
101 | .ropeproject
102 |
--------------------------------------------------------------------------------
/examples/tensorflow_example.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | from test_tube import Experiment, HyperOptArgumentParser
4 |
5 | """
6 | This script demonstrates how to do a hyperparameter search over 2 parameters in tensorflow
7 | on 4 simultaneous GPUs. Each trial will also save its own experiment logs.
8 |
9 | A single trial gets allocated on a single GPU until all trials have completed.
10 | This means for 10 trials and 4 GPUs, we'll run 4 in parallel twice and the last 2 trials in parallel.
11 | """
12 |
13 |
14 | # main training function (very simple)
15 | def train(hparams):
16 | # init exp and track all the parameters from the HyperOptArgumentParser
17 | exp = Experiment(
18 | name=hparams.test_tube_exp_name,
19 | save_dir=hparams.log_path,
20 | autosave=False,
21 | )
22 | exp.argparse(hparams)
23 |
24 | # define tensorflow graph
25 | x = tf.placeholder(dtype=tf.int32, name='x')
26 | y = tf.placeholder(dtype=tf.int32, name='y')
27 | out = x * y
28 |
29 | sess = tf.Session()
30 |
31 | # Run the tf op
32 | for train_step in range(0, 100):
33 | output = sess.run(out, feed_dict={x: hparams.x_val, y: hparams.y_val})
34 | exp.log({'fake_err': output})
35 |
36 | # save exp when we're done
37 | exp.save()
38 |
39 |
40 | # set up our argparser and make the y_val tunable
41 | parser = HyperOptArgumentParser(strategy='random_search')
42 | parser.add_argument('--test_tube_exp_name', default='my_test')
43 | parser.add_argument('--log_path', default='/Users/waf/Desktop/test')
44 | parser.opt_list('--y_val', default=12, options=[1, 2, 3, 4], tunable=True)
45 | parser.opt_list('--x_val', default=12, options=[20, 12, 30, 45], tunable=True)
46 | hyperparams = parser.parse_args()
47 |
48 |
49 | # optimize on 4 gpus at the same time
50 | # each gpu will get 1 experiment with a set of hyperparams
51 | hyperparams.optimize_parallel_gpu(train, gpu_ids=['1', '0', '3', '2'], nb_trials=4, nb_workers=4)
52 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # Test Tube: Easily log and tune Deep Learning experiments
2 |
3 | Test Tube allows you to easily log metadata and track your machine
4 | learning experiments.
5 |
6 | Use Test Tube if you need to:
7 |
8 | - Track many [Experiments](experiment_tracking/experiment.md) across
9 | models.
10 | - Visualize and compare different
11 | experiments without uploading anywhere.
12 | - [Optimize your
13 | hyperparameters](hyperparameter_optimization/HyperOptArgumentParser/)
14 | using grid search or random search.
15 | - Automatically track ALL parameters for a particular training run.
16 |
17 | Test Tube is compatible with Python 2 and 3.
18 |
19 | ## Getting started
20 |
21 | ------------------------------------------------------------------------
22 |
23 | ### Create an [Experiment](experiment_tracking/experiment.md)
24 |
25 | ``` {.python}
26 | from test_tube import Experiment
27 |
28 | exp = Experiment(name='dense_model',
29 | debug=False,
30 | save_dir='/Desktop/test_tube')
31 |
32 | exp.tag({'learning_rate': 0.002, 'nb_layers': 2})
33 |
34 | for step in training_steps:
35 | tng_err = model.eval(tng_x, tng_y)
36 |
37 | exp.log({'tng_err': tng_err})
38 |
39 | # training complete!
40 | # all your logs and data are ready to be visualized at testtube.williamfalcon.com
41 | ```
42 |
43 | ------------------------------------------------------------------------
44 |
45 | ### Optimize your [hyperparameters](hyperparameter_optimization/HyperOptArgumentParser/)
46 |
47 | ``` {.python}
48 | from test_tube import HyperOptArgumentParser
49 |
50 | # subclass of argparse
51 | parser = HyperOptArgumentParser(strategy='random_search')
52 | parser.add_argument('--learning_rate', default=0.002, type=float, help='the learning rate')
53 |
54 | # let's enable optimizing over the number of layers in the network
55 | parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8])
56 |
57 | # and tune the number of units in each layer
58 | parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=10)
59 |
60 | # compile (because it's argparse underneath)
61 | hparams = parser.parse_args()
62 |
63 | # run 20 trials of random search over the hyperparams
64 | for hparam_trial in hparams.trials(20):
65 | train_network(hparam_trial)
66 | ```
67 |
68 | ------------------------------------------------------------------------
69 |
70 | ### Visualize
71 |
72 | ``` {.python}
73 | import pandas as pd
74 | import matplotlib
75 |
76 | # each experiment is saved to a metrics.csv file which can be imported anywhere
77 | # images save to exp/version/media
78 | df = pd.read_csv('../some/dir/test_tube_data/dense_model/version_0/metrics.csv')
79 | df.tng_err.plot()
80 | ```
80 | ```
81 |
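82 | The tags you set with `exp.tag` end up in a `meta_tags.json` file next to `metrics.csv`
83 | (see `examples/saved_logs` in this repo). As a rough sketch (the path below is made up),
84 | you can load both together:
85 |
86 | ``` {.python}
87 | import json
88 |
89 | import pandas as pd
90 |
91 | version_dir = '../some/dir/test_tube_data/dense_model/version_0'
92 |
93 | # tags saved via exp.tag(...)
94 | with open(version_dir + '/meta_tags.json') as f:
95 |     tags = json.load(f)  # e.g. {'learning_rate': 0.002, 'nb_layers': 2}
96 |
97 | # metrics logged via exp.log(...), one row per call
98 | df = pd.read_csv(version_dir + '/metrics.csv')
99 | print(tags, list(df.columns))
100 | ```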
--------------------------------------------------------------------------------
/examples/pytorch_hpc_example.py:
--------------------------------------------------------------------------------
1 | """Example launcher for a hyperparameter search on SLURM.
2 |
3 | This example shows how to use gpus on SLURM with PyTorch.
4 | """
5 | import torch
6 |
7 | from test_tube import Experiment, HyperOptArgumentParser, SlurmCluster
8 |
9 |
10 | def train(hparams, *args):
11 | """Train your awesome model.
12 |
13 | :param hparams: The arguments to run the model with.
14 | """
15 | # Initialize experiments and track all the hyperparameters
16 | exp = Experiment(
17 | name=hparams.test_tube_exp_name,
18 | # Location to save the metrics.
19 | save_dir=hparams.log_path,
20 | autosave=False,
21 | )
22 | exp.argparse(hparams)
23 |
24 | # Pretend to train.
25 | x = torch.rand((1, hparams.x_val))
26 | for train_step in range(0, 100):
27 | y = torch.rand((hparams.x_val, 1))
28 | out = x.mm(y)
29 | exp.log({'fake_err': out.item()})
30 |
31 | # Save exp when done.
32 | exp.save()
33 |
34 |
35 | if __name__ == '__main__':
36 | # Set up our argparser and make the y_val tunable.
37 | parser = HyperOptArgumentParser(strategy='random_search')
38 | parser.add_argument('--test_tube_exp_name', default='my_test')
39 | parser.add_argument('--log_path', default='/some/path/to/log')
40 | parser.opt_list('--y_val',
41 | default=12, options=[1, 2, 3, 4, 5, 6], tunable=True)
42 | parser.opt_list('--x_val',
43 | default=12, options=[20, 12, 30, 45], tunable=True)
44 | hyperparams = parser.parse_args()
45 |
46 | # Enable cluster training.
47 | cluster = SlurmCluster(
48 | hyperparam_optimizer=hyperparams,
49 | log_path=hyperparams.log_path,
50 | python_cmd='python3',
51 | test_tube_exp_name=hyperparams.test_tube_exp_name
52 | )
53 |
54 | # Email results if your hpc supports it.
55 | cluster.notify_job_status(
56 | email='some@email.com', on_done=True, on_fail=True)
57 |
58 | # SLURM modules to load.
59 | cluster.load_modules([
60 | 'python-3',
61 | 'anaconda3'
62 | ])
63 |
64 | # Add commands to the non-SLURM portion.
65 | cluster.add_command('source activate myCondaEnv')
66 |
67 | # Add custom SLURM commands which show up as:
68 | # #comment
69 | # #SBATCH --cmd=value
70 | # ############
71 | # cluster.add_slurm_cmd(
72 | # cmd='cpus-per-task', value='1', comment='CPUS per task.')
73 |
74 | # Set job compute details (this will apply PER set of hyperparameters.)
75 | cluster.per_experiment_nb_gpus = 4
76 | cluster.per_experiment_nb_nodes = 2
77 | cluster.gpu_type = '1080ti'
78 |
79 | # Each hyperparameter combination will use 8 gpus.
80 | cluster.optimize_parallel_cluster_gpu(
81 | # Function to execute:
82 | train,
83 | # Number of hyperparameter combinations to search:
84 | nb_trials=24,
85 | # This is what will display in the slurm queue:
86 | job_name='first_tt_job')
87 |
--------------------------------------------------------------------------------
/examples/hpc_cpu_example.py:
--------------------------------------------------------------------------------
1 | """Example launcher for a hyperparameter search on SLURM."""
2 | from test_tube import Experiment, HyperOptArgumentParser, SlurmCluster
3 |
4 |
5 | def train(hparams, *args):
6 | """Train your awesome model.
7 |
8 | :param hparams: The arguments to run the model with.
9 | """
10 | # Initialize experiments and track all the hyperparameters
11 | exp = Experiment(
12 | name=hparams.test_tube_exp_name,
13 | # Location to save the metrics.
14 | save_dir=hparams.log_path,
15 | # The experiment version is optional, but using the one
16 | # from SLURM means the exp will not collide with other
17 | # versions if SLURM runs multiple at once.
18 | version=hparams.hpc_exp_number,
19 | autosave=False,
20 | )
21 | exp.argparse(hparams)
22 |
23 | # Pretend to train.
24 | x = hparams.x_val
25 | for train_step in range(0, 100):
26 | y = hparams.y_val
27 | out = x * y
28 | exp.log({'fake_err': out})  # Log metrics (out is a plain number here).
29 |
30 | # Save exp when done.
31 | exp.save()
32 |
33 |
34 | if __name__ == '__main__':
35 | # Set up our argparser and make the y_val tunable.
36 | parser = HyperOptArgumentParser(strategy='random_search')
37 | parser.add_argument('--test_tube_exp_name', default='my_test')
38 | parser.add_argument('--log_path', default='/some/path/to/log')
39 | parser.opt_list('--y_val',
40 | default=12, options=[1, 2, 3, 4, 5, 6], tunable=True)
41 | parser.opt_list('--x_val',
42 | default=12, options=[20, 12, 30, 45], tunable=True)
43 | hyperparams = parser.parse_args()
44 |
45 | # Enable cluster training.
46 | cluster = SlurmCluster(
47 | hyperparam_optimizer=hyperparams,
48 | log_path=hyperparams.log_path,
49 | python_cmd='python3',
50 | test_tube_exp_name=hyperparams.test_tube_exp_name
51 | )
52 |
53 | # Email results if your hpc supports it.
54 | cluster.notify_job_status(
55 | email='some@email.com', on_done=True, on_fail=True)
56 |
57 | # SLURM modules to load.
58 | cluster.load_modules([
59 | 'python-3',
60 | 'anaconda3'
61 | ])
62 |
63 | # Add commands to the non-SLURM portion.
64 | cluster.add_command('source activate myCondaEnv')
65 |
66 | # Add custom SLURM commands which show up as:
67 | # #comment
68 | # #SBATCH --cmd=value
69 | # ############
70 | # cluster.add_slurm_cmd(
71 | # cmd='cpus-per-task', value='1', comment='CPUS per task.')
72 |
73 | # Set job compute details (this will apply PER set of hyperparameters.)
74 | cluster.per_experiment_nb_cpus = 20
75 | cluster.per_experiment_nb_nodes = 10
76 |
77 | # Each hyperparameter combination will use 200 cpus.
78 | cluster.optimize_parallel_cluster_cpu(
79 | # Function to execute:
80 | train,
81 | # Number of hyperparameter combinations to search:
82 | nb_trials=24,
83 | job_name='first_tt_job',
84 | # This is what will display in the slurm queue:
85 | job_display_name='short_name')
86 |
--------------------------------------------------------------------------------
/site/search/main.js:
--------------------------------------------------------------------------------
1 | function getSearchTermFromLocation() {
2 | var sPageURL = window.location.search.substring(1);
3 | var sURLVariables = sPageURL.split('&');
4 | for (var i = 0; i < sURLVariables.length; i++) {
5 | var sParameterName = sURLVariables[i].split('=');
6 | if (sParameterName[0] == 'q') {
7 | return decodeURIComponent(sParameterName[1].replace(/\+/g, '%20'));
8 | }
9 | }
10 | }
11 |
12 | function joinUrl (base, path) {
13 | if (path.substring(0, 1) === "/") {
14 | // path starts with `/`. Thus it is absolute.
15 | return path;
16 | }
17 | if (base.substring(base.length-1) === "/") {
18 | // base ends with `/`
19 | return base + path;
20 | }
21 | return base + "/" + path;
22 | }
23 |
24 | function formatResult (location, title, summary) {
25 | return '<article><h3><a href="' + joinUrl(base_url, location) + '">'+ title + '</a></h3><p>' + summary +'</p></article>';
26 | }
27 |
28 | function displayResults (results) {
29 | var search_results = document.getElementById("mkdocs-search-results");
30 | while (search_results.firstChild) {
31 | search_results.removeChild(search_results.firstChild);
32 | }
33 | if (results.length > 0){
34 | for (var i=0; i < results.length; i++){
35 | var result = results[i];
36 | var html = formatResult(result.location, result.title, result.summary);
37 | search_results.insertAdjacentHTML('beforeend', html);
38 | }
39 | } else {
40 | search_results.insertAdjacentHTML('beforeend', "<p>No results found</p>");
41 | }
42 | }
43 |
44 | function doSearch () {
45 | var query = document.getElementById('mkdocs-search-query').value;
46 | if (query.length > 2) {
47 | if (!window.Worker) {
48 | displayResults(search(query));
49 | } else {
50 | searchWorker.postMessage({query: query});
51 | }
52 | } else {
53 | // Clear results for short queries
54 | displayResults([]);
55 | }
56 | }
57 |
58 | function initSearch () {
59 | var search_input = document.getElementById('mkdocs-search-query');
60 | if (search_input) {
61 | search_input.addEventListener("keyup", doSearch);
62 | }
63 | var term = getSearchTermFromLocation();
64 | if (term) {
65 | search_input.value = term;
66 | doSearch();
67 | }
68 | }
69 |
70 | function onWorkerMessage (e) {
71 | if (e.data.allowSearch) {
72 | initSearch();
73 | } else if (e.data.results) {
74 | var results = e.data.results;
75 | displayResults(results);
76 | }
77 | }
78 |
79 | if (!window.Worker) {
80 | console.log('Web Worker API not supported');
81 | // load index in main thread
82 | $.getScript(joinUrl(base_url, "search/worker.js")).done(function () {
83 | console.log('Loaded worker');
84 | init();
85 | window.postMessage = function (msg) {
86 | onWorkerMessage({data: msg});
87 | };
88 | }).fail(function (jqxhr, settings, exception) {
89 | console.error('Could not load worker.js');
90 | });
91 | } else {
92 | // Wrap search in a web worker
93 | var searchWorker = new Worker(joinUrl(base_url, "search/worker.js"));
94 | searchWorker.postMessage({init: true});
95 | searchWorker.onmessage = onWorkerMessage;
96 | }
97 |
--------------------------------------------------------------------------------
/test_tube/hyper_opt_utils/strategies.py:
--------------------------------------------------------------------------------
1 | """Hyperparameter search strategies."""
2 | import itertools
3 | import json
4 | import random
5 |
6 |
7 | def generate_trials(strategy, flat_params, nb_trials=None):
8 | r"""Generates the parameter combinations to search.
9 |
10 | Two search strategies are implemented:
11 | 1. `grid_search`: creates a search space that consists of the
12 | product of all flat_params. If `nb_trials` is specified
13 | the first `nb_trials` combinations are searched.
14 | 2. `random_search`: Creates random combinations of the
15 | hyperparameters. Can be used for a more efficient search.
16 | See (Bergstra and Bengio, 2012) for more details.
17 |
18 | :param strategy: The hyperparameter search strategy to use. Can be
19 | one of: {`grid_search`, `random_search`}.
20 | :param flat_params: The hyperparameter arguments to iterate over.
21 | Each element is the list of options for a single hyperparameter.
22 | :param nb_trials: The number of hyperparameter combinations to try
23 | (i.e. the number of trials to run). Optional for `grid_search`,
24 | required for `random_search`.
25 | :return: A list of trials, where each trial is one combination of
26 | the hyperparameter options.
27 | """
28 | if strategy == 'grid_search':
29 | trials = generate_grid_search_trials(flat_params, nb_trials)
30 | return trials
31 | elif strategy == 'random_search':
32 | trials = generate_random_search_trials(flat_params, nb_trials)
33 | return trials
34 | else:
35 | raise ValueError(
36 | ('Unknown strategy "{}". Must be one of '
37 | '{{grid_search, random_search}}').format(strategy))
38 |
39 |
40 | def generate_grid_search_trials(flat_params, nb_trials):
41 | """
42 | Standard grid search. Takes the product of `flat_params`
43 | to generate the search space.
44 |
45 | :param flat_params: The hyperparameter options to search.
46 | :param nb_trials: Returns the first `nb_trials` from the
47 | combinations space. If this is None, all combinations
48 | are returned.
49 | :return: A list of the hyperparameter combinations to run.
50 | """
51 | trials = list(itertools.product(*flat_params))
52 | if nb_trials:
53 | trials = trials[0:nb_trials]
54 | return trials
55 |
56 |
57 | def generate_random_search_trials(params, nb_trials):
58 | """
59 | Generates random combination of hyperparameters to try.
60 | See (Bergstra and Bengio, 2012) for more details.
61 |
62 | :param params: The hyperparameter options to search.
63 | :param nb_trials: The number of trials to run.
64 | :return: A list of unique hyperparameter combinations to run.
65 | """
66 | if nb_trials is None:
67 | raise TypeError(
68 | '`random_search` strategy requires nb_trials to be an int.')
69 | results = []
70 |
71 | # ensures we have unique results
72 | seen_trials = set()
73 |
74 | # shuffle each param list
75 | potential_trials = 1
76 | for param in params:
77 | random.shuffle(param)
78 | potential_trials *= len(param)
79 |
80 | # we can't sample more trials than are possible
81 | max_iters = min(potential_trials, nb_trials)
82 |
83 | # then for the nb of trials requested, create a new param tuple
84 | # by picking a random integer at each param level
85 | while len(results) < max_iters:
86 | trial = []
87 | for param in params:
88 | sampled_param = random.sample(param, 1)[0]
89 | trial.append(sampled_param)
90 |
91 | # verify this is a unique trial so we
92 | # don't duplicate work
93 | trial_str = json.dumps(trial)
94 | if trial_str not in seen_trials:
95 | seen_trials.add(trial_str)
96 | results.append(trial)
97 |
98 | return results
99 |
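100 |
101 | if __name__ == '__main__':
102 |     # Minimal usage sketch (not part of the original module): expand a flat
103 |     # parameter space where each inner list holds one hyperparameter's options
104 |     # (the same shape used in tests/strategies_test.py).
105 |     lrs = [{'name': 'learning_rate', 'val': v} for v in (0.001, 0.01, 0.1)]
106 |     decays = [{'name': 'decay', 'val': v} for v in (0.99, 0.999)]
107 |     print(len(generate_trials('grid_search', [lrs, decays])))       # 6 = 3 * 2
108 |     print(len(generate_trials('random_search', [lrs, decays], 4)))  # 4 unique draws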
--------------------------------------------------------------------------------
/site/js/theme.js:
--------------------------------------------------------------------------------
1 | $( document ).ready(function() {
2 | // Shift nav in mobile when clicking the menu.
3 | $(document).on('click', "[data-toggle='wy-nav-top']", function() {
4 | $("[data-toggle='wy-nav-shift']").toggleClass("shift");
5 | $("[data-toggle='rst-versions']").toggleClass("shift");
6 | });
7 |
8 | // Close menu when you click a link.
9 | $(document).on('click', ".wy-menu-vertical .current ul li a", function() {
10 | $("[data-toggle='wy-nav-shift']").removeClass("shift");
11 | $("[data-toggle='rst-versions']").toggleClass("shift");
12 | });
13 |
14 | // Keyboard navigation
15 | document.addEventListener("keydown", function(e) {
16 | var key = e.which || e.keyCode || window.event && window.event.keyCode;
17 | var page;
18 | switch (key) {
19 | case 78: // n
20 | page = $('[role="navigation"] a:contains(Next):first').prop('href');
21 | break;
22 | case 80: // p
23 | page = $('[role="navigation"] a:contains(Previous):first').prop('href');
24 | break;
25 | case 13: // enter
26 | if (e.target === document.getElementById('mkdocs-search-query')) {
27 | e.preventDefault();
28 | }
29 | break;
30 | default: break;
31 | }
32 | if ($(e.target).is(':input')) {
33 | return true;
34 | } else if (page) {
35 | window.location.href = page;
36 | }
37 | });
38 |
39 | $(document).on('click', "[data-toggle='rst-current-version']", function() {
40 | $("[data-toggle='rst-versions']").toggleClass("shift-up");
41 | });
42 |
43 | // Make tables responsive
44 | $("table.docutils:not(.field-list)").wrap("<div class='wy-table-responsive'></div>");
45 |
46 | $('table').addClass('docutils');
47 | });
48 |
49 | window.SphinxRtdTheme = (function (jquery) {
50 | var stickyNav = (function () {
51 | var navBar,
52 | win,
53 | stickyNavCssClass = 'stickynav',
54 | applyStickNav = function () {
55 | if (navBar.height() <= win.height()) {
56 | navBar.addClass(stickyNavCssClass);
57 | } else {
58 | navBar.removeClass(stickyNavCssClass);
59 | }
60 | },
61 | enable = function () {
62 | applyStickNav();
63 | win.on('resize', applyStickNav);
64 | },
65 | init = function () {
66 | navBar = jquery('nav.wy-nav-side:first');
67 | win = jquery(window);
68 | };
69 | jquery(init);
70 | return {
71 | enable : enable
72 | };
73 | }());
74 | return {
75 | StickyNav : stickyNav
76 | };
77 | }($));
78 |
79 | // The code below is a copy of @seanmadsen code posted Jan 10, 2017 on issue 803.
80 | // https://github.com/mkdocs/mkdocs/issues/803
81 | // This just incorporates the auto scroll into the theme itself without
82 | // the need for additional custom.js file.
83 | //
84 | $(function() {
85 | $.fn.isFullyWithinViewport = function(){
86 | var viewport = {};
87 | viewport.top = $(window).scrollTop();
88 | viewport.bottom = viewport.top + $(window).height();
89 | var bounds = {};
90 | bounds.top = this.offset().top;
91 | bounds.bottom = bounds.top + this.outerHeight();
92 | return ( ! (
93 | (bounds.top <= viewport.top) ||
94 | (bounds.bottom >= viewport.bottom)
95 | ) );
96 | };
97 | if( $('li.toctree-l1.current').length && !$('li.toctree-l1.current').isFullyWithinViewport() ) {
98 | $('.wy-nav-side')
99 | .scrollTop(
100 | $('li.toctree-l1.current').offset().top -
101 | $('.wy-nav-side').offset().top -
102 | 60
103 | );
104 | }
105 | });
106 |
--------------------------------------------------------------------------------
/site/search/worker.js:
--------------------------------------------------------------------------------
1 | var base_path = 'function' === typeof importScripts ? '.' : '/search/';
2 | var allowSearch = false;
3 | var index;
4 | var documents = {};
5 | var lang = ['en'];
6 | var data;
7 |
8 | function getScript(script, callback) {
9 | console.log('Loading script: ' + script);
10 | $.getScript(base_path + script).done(function () {
11 | callback();
12 | }).fail(function (jqxhr, settings, exception) {
13 | console.log('Error: ' + exception);
14 | });
15 | }
16 |
17 | function getScriptsInOrder(scripts, callback) {
18 | if (scripts.length === 0) {
19 | callback();
20 | return;
21 | }
22 | getScript(scripts[0], function() {
23 | getScriptsInOrder(scripts.slice(1), callback);
24 | });
25 | }
26 |
27 | function loadScripts(urls, callback) {
28 | if( 'function' === typeof importScripts ) {
29 | importScripts.apply(null, urls);
30 | callback();
31 | } else {
32 | getScriptsInOrder(urls, callback);
33 | }
34 | }
35 |
36 | function onJSONLoaded () {
37 | data = JSON.parse(this.responseText);
38 | var scriptsToLoad = ['lunr.js'];
39 | if (data.config && data.config.lang && data.config.lang.length) {
40 | lang = data.config.lang;
41 | }
42 | if (lang.length > 1 || lang[0] !== "en") {
43 | scriptsToLoad.push('lunr.stemmer.support.js');
44 | if (lang.length > 1) {
45 | scriptsToLoad.push('lunr.multi.js');
46 | }
47 | for (var i=0; i < lang.length; i++) {
48 | if (lang[i] != 'en') {
49 | scriptsToLoad.push(['lunr', lang[i], 'js'].join('.'));
50 | }
51 | }
52 | }
53 | loadScripts(scriptsToLoad, onScriptsLoaded);
54 | }
55 |
56 | function onScriptsLoaded () {
57 | console.log('All search scripts loaded, building Lunr index...');
58 | if (data.config && data.config.separator && data.config.separator.length) {
59 | lunr.tokenizer.separator = new RegExp(data.config.separator);
60 | }
61 | if (data.index) {
62 | index = lunr.Index.load(data.index);
63 | data.docs.forEach(function (doc) {
64 | documents[doc.location] = doc;
65 | });
66 | console.log('Lunr pre-built index loaded, search ready');
67 | } else {
68 | index = lunr(function () {
69 | if (lang.length === 1 && lang[0] !== "en" && lunr[lang[0]]) {
70 | this.use(lunr[lang[0]]);
71 | } else if (lang.length > 1) {
72 | this.use(lunr.multiLanguage.apply(null, lang)); // spread operator not supported in all browsers: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/Spread_operator#Browser_compatibility
73 | }
74 | this.field('title');
75 | this.field('text');
76 | this.ref('location');
77 |
78 | for (var i=0; i < data.docs.length; i++) {
79 | var doc = data.docs[i];
80 | this.add(doc);
81 | documents[doc.location] = doc;
82 | }
83 | });
84 | console.log('Lunr index built, search ready');
85 | }
86 | allowSearch = true;
87 | postMessage({allowSearch: allowSearch});
88 | }
89 |
90 | function init () {
91 | var oReq = new XMLHttpRequest();
92 | oReq.addEventListener("load", onJSONLoaded);
93 | var index_path = base_path + '/search_index.json';
94 | if( 'function' === typeof importScripts ){
95 | index_path = 'search_index.json';
96 | }
97 | oReq.open("GET", index_path);
98 | oReq.send();
99 | }
100 |
101 | function search (query) {
102 | if (!allowSearch) {
103 | console.error('Assets for search still loading');
104 | return;
105 | }
106 |
107 | var resultDocuments = [];
108 | var results = index.search(query);
109 | for (var i=0; i < results.length; i++){
110 | var result = results[i];
111 | doc = documents[result.ref];
112 | doc.summary = doc.text.substring(0, 200);
113 | resultDocuments.push(doc);
114 | }
115 | return resultDocuments;
116 | }
117 |
118 | if( 'function' === typeof importScripts ) {
119 | onmessage = function (e) {
120 | if (e.data.init) {
121 | init();
122 | } else if (e.data.query) {
123 | postMessage({ results: search(e.data.query) });
124 | } else {
125 | console.error("Worker - Unrecognized message: " + e);
126 | }
127 | };
128 | }
129 |
--------------------------------------------------------------------------------
/docs/experiment_tracking/experiment.md:
--------------------------------------------------------------------------------
1 | # Experiment class API
2 |
3 | [[Github Code](https://github.com/williamFalcon/test-tube/blob/master/test_tube/log.py)]
4 |
5 | An Experiment holds metadata and the results of the training run. You
6 | can instantiate an `Experiment` via:
7 |
8 | ``` {.python}
9 | from test_tube import Experiment
10 |
11 | exp = Experiment(name='dense_model',
12 | debug=False,
13 | save_dir='/Desktop/test_tube')
14 |
15 | exp.tag({'learning_rate': 0.002, 'nb_layers': 2})
16 |
17 | for step in training_steps:
18 | tng_err = model.eval(tng_x, tng_y)
19 |
20 | exp.log({'tng_err': tng_err})
21 |
22 | # training complete!
23 | # all your logs and data are ready to be visualized at testtube.williamfalcon.com
24 | ```
25 |
26 | ------------------------------------------------------------------------
27 |
28 | ## init options
29 |
30 | ### version
31 |
32 | The same Experiment can have multiple versions. Test tube generates
33 | these automatically each time you run your model. To set your own
34 | version use:
35 |
36 | ``` {.python}
37 | exp = Experiment(name='dense_model',version=1)
38 | ```
39 |
40 | ### debug
41 |
42 | If you're debugging and don't want to create a log file, set debug to
43 | True:
44 |
45 | ``` {.python}
46 | exp = Experiment(name='dense_model',debug=True)
47 | ```
48 |
49 | ### autosave
50 |
51 | If you only want to save at the end of training, turn autosave off:
52 |
53 | ``` {.python}
54 | exp = Experiment(name='dense_model', autosave=False)
55 |
56 | # run long training...
57 |
58 | # first time any logs are saved
59 | exp.save()
60 | ```
61 |
62 | ### `create_git_tag`
63 |
64 | Ever wanted a flashback to your code when you ran an experiment?
65 | Snapshot your code for this experiment using git tags:
66 |
67 | ``` {.python}
68 | exp = Experiment(name='dense_model', create_git_tag=True)
69 | ```
70 |
71 | ------------------------------------------------------------------------
72 |
73 | ## Methods
74 |
75 | ### tag
76 |
77 | ``` {.python}
78 | exp.tag({k: v})
79 | ```
80 |
81 | Adds an arbitrary dictionary of tags to the experiment
82 |
83 | **Example**
84 |
85 | ``` {.python}
86 | exp.tag({'dataset_name': 'imagenet_1', 'learning_rate': 0.0002})
87 | ```
88 |
89 | ### log
90 |
91 | ``` {.python}
92 | exp.log({k:v})
93 | ```
94 |
95 | Adds a row of data to the experiment
96 |
97 | **Example**
98 |
99 | ``` {.python}
100 | exp.log({'val_loss': 0.22, 'epoch_nb': 1, 'batch_nb': 12})
101 |
102 | # you can also add other rows that have separate information
103 | exp.log({'tng_loss': 0.01})
104 |
105 | # or even a numpy array image
106 | image = np.imread('image.png')
107 | exp.log({'fake_png': image})
108 | ```
109 |
110 | **Saving images Example**
111 |
112 | ``` {.python}
113 | # name must have either jpg, png or jpeg in it
114 | img = np.imread('a.jpg')
115 | exp.log({'test_jpg': img, 'val_err': 0.2})
116 |
117 | # saves image to ../exp/version/media/test_0.jpg
118 | # csv has file path to that image in that cell
119 | ```
120 |
121 | To save an image, add `jpg`, `png` or `jpeg` to the key corresponding
122 | with the image array. The image must be formatted the same as skimage's
123 | [imsave](http://scikit-image.org/docs/dev/api/skimage.io.html#skimage.io.imsave)
124 | function
125 |
126 | ### argparse
127 |
128 | ``` {.python}
129 | exp.argparse(hparams)
130 | ```
131 |
132 | Transfers hyperparam information from an argparse ArgumentParser or
133 | HyperOptArgumentParser
134 |
135 | **Example**
136 |
137 | ``` {.python}
138 | from test_tube import HyperOptArgumentParser
139 |
140 | # parse args
141 | parser = HyperOptArgumentParser()
142 | parser.add_argument('--learning_rate', default=0.002, type=float, help='the learning rate')
143 | hparams = parser.parse_args()
144 |
145 | # learning_rate is now a meta tag for your experiment
146 | exp.argparse(hparams)
147 | ```
148 |
149 | ### save
150 |
151 | ``` {.python}
152 | exp.save()
153 | ```
154 |
155 | Saves the exp to disk (including images)
156 |
157 | **Example**
158 |
159 | ``` {.python}
160 | exp = Experiment(name='dense_model', autosave=False)
161 |
162 | # run long training...
163 |
164 | # first time any logs are saved
165 | exp.save()
166 | ```
167 |
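168 | After saving, each run lands in its own version folder under `save_dir/<name>/`. As a
169 | rough guide (mirroring the `examples/saved_logs` folder shipped with this repo; exact
170 | filenames may differ between versions):
171 |
172 | ```
173 | test_tube_data/dense_model/
174 | ├── version_0/
175 | │   ├── meta_tags.json   # tags and argparse values
176 | │   ├── media/           # any logged images
177 | │   └── metrics.csv      # one row per exp.log(...) call
178 | └── version_1/
179 |     └── ...
180 | ```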
--------------------------------------------------------------------------------
/site/css/theme_extra.css:
--------------------------------------------------------------------------------
1 | /*
2 | * Sphinx doesn't have support for section dividers like we do in
3 | * MkDocs, this styles the section titles in the nav
4 | *
5 | * https://github.com/mkdocs/mkdocs/issues/175
6 | */
7 | .wy-menu-vertical span {
8 | line-height: 18px;
9 | padding: 0.4045em 1.618em;
10 | display: block;
11 | position: relative;
12 | font-size: 90%;
13 | color: #838383;
14 | }
15 |
16 | .wy-menu-vertical .subnav a {
17 | padding: 0.4045em 2.427em;
18 | }
19 |
20 | /*
21 | * Long navigations run off the bottom of the screen as the nav
22 | * area doesn't scroll.
23 | *
24 | * https://github.com/mkdocs/mkdocs/pull/202
25 | *
26 | * Builds upon pull 202 https://github.com/mkdocs/mkdocs/pull/202
27 | * to make toc scrollbar end before navigations buttons to not be overlapping.
28 | */
29 | .wy-nav-side {
30 | height: calc(100% - 45px);
31 | overflow-y: auto;
32 | min-height: 0;
33 | }
34 |
35 | .rst-versions{
36 | border-top: 0;
37 | height: 45px;
38 | }
39 |
40 | @media screen and (max-width: 768px) {
41 | .wy-nav-side {
42 | height: 100%;
43 | }
44 | }
45 |
46 | /*
47 | * readthedocs theme hides nav items when the window height is
48 | * too small to contain them.
49 | *
50 | * https://github.com/mkdocs/mkdocs/issues/#348
51 | */
52 | .wy-menu-vertical ul {
53 | margin-bottom: 2em;
54 | }
55 |
56 | /*
57 | * Wrap inline code samples otherwise they shoot off the side and
58 | * can't be read at all.
59 | *
60 | * https://github.com/mkdocs/mkdocs/issues/313
61 | * https://github.com/mkdocs/mkdocs/issues/233
62 | * https://github.com/mkdocs/mkdocs/issues/834
63 | */
64 | code {
65 | white-space: pre-wrap;
66 | word-wrap: break-word;
67 | padding: 2px 5px;
68 | }
69 |
70 | /**
71 | * Make code blocks display as blocks and give them the appropriate
72 | * font size and padding.
73 | *
74 | * https://github.com/mkdocs/mkdocs/issues/855
75 | * https://github.com/mkdocs/mkdocs/issues/834
76 | * https://github.com/mkdocs/mkdocs/issues/233
77 | */
78 | pre code {
79 | white-space: pre;
80 | word-wrap: normal;
81 | display: block;
82 | padding: 12px;
83 | font-size: 12px;
84 | }
85 |
86 | /*
87 | * Fix link colors when the link text is inline code.
88 | *
89 | * https://github.com/mkdocs/mkdocs/issues/718
90 | */
91 | a code {
92 | color: #2980B9;
93 | }
94 | a:hover code {
95 | color: #3091d1;
96 | }
97 | a:visited code {
98 | color: #9B59B6;
99 | }
100 |
101 | /*
102 | * The CSS classes from highlight.js seem to clash with the
103 | * ReadTheDocs theme causing some code to be incorrectly made
104 | * bold and italic.
105 | *
106 | * https://github.com/mkdocs/mkdocs/issues/411
107 | */
108 | pre .cs, pre .c {
109 | font-weight: inherit;
110 | font-style: inherit;
111 | }
112 |
113 | /*
114 | * Fix some issues with the theme and non-highlighted code
115 | * samples. Without any highlighting styles attached the
116 | * formatting is broken.
117 | *
118 | * https://github.com/mkdocs/mkdocs/issues/319
119 | */
120 | .no-highlight {
121 | display: block;
122 | padding: 0.5em;
123 | color: #333;
124 | }
125 |
126 |
127 | /*
128 | * Additions specific to the search functionality provided by MkDocs
129 | */
130 |
131 | .search-results {
132 | margin-top: 23px;
133 | }
134 |
135 | .search-results article {
136 | border-top: 1px solid #E1E4E5;
137 | padding-top: 24px;
138 | }
139 |
140 | .search-results article:first-child {
141 | border-top: none;
142 | }
143 |
144 | form .search-query {
145 | width: 100%;
146 | border-radius: 50px;
147 | padding: 6px 12px; /* csslint allow: box-model */
148 | border-color: #D1D4D5;
149 | }
150 |
151 | .wy-menu-vertical li ul {
152 | display: inherit;
153 | }
154 |
155 | .wy-menu-vertical li ul.subnav ul.subnav{
156 | padding-left: 1em;
157 | }
158 |
159 | .wy-menu-vertical .subnav li.current > a {
160 | padding-left: 2.42em;
161 | }
162 | .wy-menu-vertical .subnav li.current > ul li a {
163 | padding-left: 3.23em;
164 | }
165 |
166 | /*
167 | * Improve inline code blocks within admonitions.
168 | *
169 | * https://github.com/mkdocs/mkdocs/issues/656
170 | */
171 | .admonition code {
172 | color: #404040;
173 | border: 1px solid #c7c9cb;
174 | border: 1px solid rgba(0, 0, 0, 0.2);
175 | background: #f8fbfd;
176 | background: rgba(255, 255, 255, 0.7);
177 | }
178 |
179 | /*
180 | * Account for wide tables which go off the side.
181 | * Override borders to avoid weirdness on narrow tables.
182 | *
183 | * https://github.com/mkdocs/mkdocs/issues/834
184 | * https://github.com/mkdocs/mkdocs/pull/1034
185 | */
186 | .rst-content .section .docutils {
187 | width: 100%;
188 | overflow: auto;
189 | display: block;
190 | border: none;
191 | }
192 |
193 | td, th {
194 | border: 1px solid #e1e4e5 !important; /* csslint allow: important */
195 | border-collapse: collapse;
196 | }
197 |
198 |
--------------------------------------------------------------------------------
/site/404.html:
--------------------------------------------------------------------------------
[Generated MkDocs 404 page (Read the Docs theme); the HTML markup was stripped in this dump. Surviving text: "Test tube Documentation", "Docs »", "404 / Page not found", "Built with MkDocs using a theme provided by Read the Docs", "GitHub".]
--------------------------------------------------------------------------------
/site/search.html:
--------------------------------------------------------------------------------
[Generated MkDocs search page (Read the Docs theme); the HTML markup was stripped in this dump. Surviving text: "Test tube Documentation", "Docs »", "Search Results", "Searching...", "Built with MkDocs using a theme provided by Read the Docs", "GitHub".]
--------------------------------------------------------------------------------
/docs/hyperparameter_optimization/HyperOptArgumentParser.md:
--------------------------------------------------------------------------------
1 | # HyperOptArgumentParser class API
2 |
3 | [[Github Code](https://github.com/williamFalcon/test-tube/blob/master/test_tube/argparse_hopt.py)]
4 |
5 | The HyperOptArgumentParser is a subclass of python's
6 | [argparse](https://docs.python.org/3/library/argparse.html), with added
7 | functionality to change parameters on the fly as determined by a
8 | sampling strategy.
9 |
10 | You can instantiate a `HyperOptArgumentParser` via:
11 |
12 | ``` {.python}
13 | from test_tube import HyperOptArgumentParser
14 |
15 | # subclass of argparse
16 | parser = HyperOptArgumentParser(strategy='random_search')
17 | parser.add_argument('--learning_rate', default=0.002, type=float, help='the learning rate')
18 |
19 | # let's enable optimizing over the number of layers in the network
20 | parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8])
21 |
22 | # and tune the number of units in each layer
23 | parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=10)
24 |
25 | # compile (because it's argparse underneath)
26 | hparams = parser.parse_args()
27 |
28 | # run 20 trials of random search over the hyperparams
29 | for hparam_trial in hparams.trials(20):
30 | train_network(hparam_trial)
31 | ```
32 |
33 | ------------------------------------------------------------------------
34 |
35 | ## init options
36 |
37 | ### `strategy`
38 |
39 | Use either [random
40 | search](http://www.jmlr.org/papers/volume13/bergstra12a/bergstra12a.pdf)
41 | or [grid
42 | search](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)
43 | for tuning:
44 |
45 | ``` {.python}
46 | parser = HyperOptArgumentParser(strategy='grid_search')
47 | ```
48 |
49 | ------------------------------------------------------------------------
50 |
51 | ## Methods
52 |
53 | All the functionality from argparse works but we've added the following
54 | functionality:
55 |
56 | ### `opt_list`
57 |
58 | ``` {.python}
59 | parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8])
60 | ```
61 |
62 | Enables searching over a list of values for this parameter. The tunable
63 | values ONLY replace the argparse values when running a hyperparameter
64 | optimization search. This is on purpose so your code doesn't have to
65 | change when you want to tune it.
66 |
67 | **Example**
68 |
69 | ``` {.python}
70 | parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8])
71 | hparams = parser.parse_args()
72 | # hparams.nb_layers = 2
73 |
74 | for trial in hparams.trials(2):
75 | # trial.nb_layers is now a value in [2, 4, 8]
76 | # but hparams.nb_layers is still 2
77 | ```
78 |
79 | ### `opt_range`
80 |
81 | ``` {.python}
82 | parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=8, log_base=None)
83 | ```
84 |
85 | Enables searching over a range of values chosen randomly using the
86 | `nb_samples` given. The tunable values *only* replace the argparse
87 | values when running a hyperparameter optimization search. This is on
88 | purpose so your code doesn't have to change when you want to tune it.
89 |
90 | If `log_base` is set to a positive number, values are sampled randomly on
91 | a log scale with base `log_base`. This makes it much more efficient to
92 | search across several orders of magnitude.
93 |
94 | **Example**
95 |
96 | ``` {.python}
97 | parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=8)
98 | hparams = parser.parse_args()
99 | # hparams.neurons = 50
100 |
101 | for trial in hparams.trials(2):
102 |     # trial.neurons is now a randomly sampled value in the range [100, 800]
103 | # but hparams.neurons is still 50
104 | ```
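
The example above samples on a linear scale. With `log_base` set, the same call samples on a log scale instead; here is a minimal sketch (the `--learning_rate` parameter is only an illustration, not part of the example above):

``` {.python}
# sample 8 learning rates uniformly in log10-space between 1e-5 and 1e-1
parser.opt_range('--learning_rate', default=0.001, type=float, tunable=True,
                 low=1e-5, high=1e-1, nb_samples=8, log_base=10)
```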
105 |
106 | ### `json_config`
107 |
108 | ``` {.python}
109 | parser.json_config('--config', default='example.json')
110 | ```
111 |
112 | Replaces default values in the parser with those read from the json file
113 |
114 | **Example**
115 |
116 | *example.json*
117 |
118 | ``` {.json}
119 | {
120 | "learning_rate": 200
121 | }
122 | ```
123 |
124 | ``` {.python}
125 | parser.add_argument('--learning_rate', default=0.002, type=float, help='the learning rate')
126 | parser.json_config('--config', default='example.json')
127 | hparams = parser.parse_args()
128 |
129 | # hparams.learning_rate = 200
130 | ```
131 |
132 | ### `trials`
133 |
134 | ``` {.python}
135 | trial_generator = hparams.trials(2)
136 | ```
137 |
138 | Computes the trials needed for these experiments and serves them via a
139 | generator
140 |
141 | **Example**
142 |
143 | ``` {.python}
144 | hparams = parser.parse_args()
145 | for trial_hparams in hparams.trials(2):
146 |     # trial_hparams now holds the hyperparameter values sampled for this trial
147 | ```
148 |
149 | ### `optimize_parallel_gpu`
150 |
151 | ``` {.python}
152 | hparams = parser.parse_args()
153 | hparams.optimize_parallel_gpu(function_to_optimize, gpu_ids=['1', '0, 2'])
154 | ```
155 |
156 | Parallelizes the trials across one worker process per entry in `gpu_ids` and
157 | auto-assigns the correct gpus. `function_to_optimize` is called with the
158 | `trial_params` for that trial and the gpu ids assigned to it.
159 |
160 | **Example**
161 |
162 | ``` {.python}
163 | # parallelize tuning on 2 gpus
164 | # each trial will be placed on one of the given gpus
165 | def train_main(trial_params, gpu_ids):
166 | # train your model, etc here...
167 |
168 | hparams = parser.parse_args()
169 | hparams.optimize_parallel_gpu(train_main, gpu_ids=['1', '0, 2'])
170 |
171 | # when optimize_parallel_gpu returns, all trials will have completed,
172 | # run in parallel across the two gpu groups ('1' and '0, 2')
173 | ```
174 |
175 | ### `optimize_parallel_cpu`
176 |
177 | ``` {.python}
178 | hparams = parser.parse_args()
179 | hparams.optimize_parallel_cpu(function_to_optimize, nb_trials=20, nb_workers=2)
180 | ```
181 |
182 | Parallelizes the trials across `nb_workers` cpu processes. `function_to_optimize`
183 | is called with the `trial_params` for that trial.
184 |
185 | **Example**
186 |
187 | ``` {.python}
188 | # parallelize tuning on 2 cpus
189 | # each trial will run in one of the cpu worker processes
190 | def train_main(trial_params):
191 | # train your model, etc here...
192 |
193 | hparams = parser.parse_args()
194 | hparams.optimize_parallel_cpu(train_main, nb_trials=20, nb_workers=2)
195 |
196 | # at the end of the optimize_parallel function, all 20 trials will be completed
197 | # in this case by running 10 sets of 2 trials in parallel
198 | ```
199 |
--------------------------------------------------------------------------------
/test_tube/hyperopt.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | import json
3 | import random
4 |
5 |
6 | class HyperParamOptimizer(object):
7 |
8 | def __init__(self, method='grid_search', enabled=True, experiment=None):
9 | """
10 |         :param method: 'grid_search' or 'random_search'
11 |         :param enabled: when False, the tune_* calls simply return their defaults
12 | """
13 | self.method = method
14 | self.enabled = enabled
15 | self.experiment = experiment
16 | self.seen_params = {}
17 | self.current_iteration = 0
18 |
19 | # the params to use at each trial
20 | self.trials = None
21 |
22 | # total iterations we're doing
23 | self.nb_iterations = None
24 |
25 | # details about each param
26 | self.params = []
27 |
28 | # -----------------------------
29 | # PARAMETER CHOICES
30 | # -----------------------------
31 | def tune_uniform(self, low, high, samples, default, name):
32 | # how this fx samples for the data
33 | def gen_samples():
34 | vals = [random.uniform(low, high) for i in range(samples)]
35 | return vals
36 |
37 | return self.__resolve_param(gen_samples, default, name)
38 |
39 | def tune_odds(self, low, high, default, name):
40 | start = low if low %2 != 0 else low + 1
41 | def gen_samples():
42 | return range(start, high+1, 2)
43 |
44 | return self.__resolve_param(gen_samples, default, name)
45 |
46 | def tune_evens(self, low, high, default, name):
47 | start = low if low %2 == 0 else low + 1
48 | def gen_samples():
49 | return range(start, high+1, 2)
50 |
51 | return self.__resolve_param(gen_samples, default, name)
52 |
53 | def tune_choice(self, options, default, name):
54 | def gen_samples():
55 | return options
56 |
57 | return self.__resolve_param(gen_samples, default, name)
58 |
59 | def __resolve_param(self, gen_fx, default, name):
60 | # case when no action was requested
61 | if not self.enabled:
62 | return default
63 |
64 | # create the param when it's new
65 | # return the first value in this case
66 | if name not in self.seen_params:
67 | vals = gen_fx()
68 | param = {'vals': vals, 'name': name}
69 | self.seen_params[name] = {'idx': len(self.params)}
70 | self.params.append(param)
71 | return vals[0]
72 |
73 | # not the first iteration so return the ith element
74 | # in the possible values
75 | iteration_params = self.trials[self.current_iteration]
76 | param_i = self.seen_params[name]['idx']
77 | param = iteration_params[param_i]
78 | return param['val']
79 |
80 | # -----------------------------
81 | # OPTIMIZATION
82 | # -----------------------------
83 | def optimize(self, fx, nb_iterations=None):
84 | """
85 | Primary entry point into the optimization
86 | :param fx:
87 | :param nb_iterations:
88 | :return:
89 | """
90 | self.nb_iterations = nb_iterations
91 |
92 | # run first iteration
93 | result = fx(self)
94 |
95 | # log if requested
96 | if self.experiment is not None:
97 | result['hypo_iter_nb'] = self.current_iteration
98 | self.experiment.log(result)
99 |
100 | self.current_iteration += 1
101 |
102 | # generate the rest of the training seq
103 | # we couldn't do this before because we don't know
104 | # how many params the user needed
105 | self.__generate_trials()
106 |
107 | # run trials for the rest of the iterations
108 | # we either know the iterations or they're
109 | # calculated from the strategy used
110 | for i in range(1, len(self.trials)):
111 | result = fx(self)
112 | result['hypo_iter_nb'] = self.current_iteration
113 |
114 | # log if requested
115 | if self.experiment is not None:
116 | self.experiment.log(result)
117 |
118 | self.current_iteration += 1
119 |
120 | # -----------------------------
121 | # INTERFACE WITH LOGGER
122 | # -----------------------------
123 | def get_current_trial_meta(self):
124 | meta_results = []
125 |
126 | # when we have trials, means we've already done 1 run
127 | # we can just get the params that are about to be run
128 | # otherwise we need to infer params from the current param list
129 | # this assumes the user feeds the opt into the experiment after
130 | # they're done setting up the params
131 |         has_trials = self.trials is not None and len(self.trials) > 0
132 |         if has_trials:
133 | trial_params = self.trials[self.current_iteration]
134 | for trial_param in trial_params:
135 | root_param = self.params[trial_param['idx']]
136 | meta_results.append({'hypo_' + root_param['name']: trial_param['val']})
137 |
138 | # if we haven't done a pass through the data yet,
139 | # we need to infer from the params in the list
140 | else:
141 | for param in self.params:
142 | meta_results.append({'hypo_' + param['name']: param['vals'][0]})
143 |
144 | # add shared meta
145 | meta_results.append({'hypo_iter_nb': self.current_iteration})
146 | return meta_results
147 |
148 | # -----------------------------
149 | # TRIALS HELPER
150 | # -----------------------------
151 | def __generate_trials(self):
152 | """
153 | Generates the parameter combinations for each requested trial
154 | :return:
155 | """
156 | flat_params = self.__flatten_params(self.params)
157 |
158 | # permute for grid search
159 | if self.method == 'grid_search':
160 | self.trials = list(itertools.product(*flat_params))
161 |
162 | if self.nb_iterations is not None:
163 | self.trials = self.trials[0: self.nb_iterations]
164 |
165 | if self.method == 'random_search':
166 | self.trials = self.__generate_random_search_trials(flat_params)
167 |
168 | def __flatten_params(self, params):
169 | """
170 | Turns a list of parameters with values into a flat tuple list of lists
171 | so we can permute
172 | :param params:
173 | :return:
174 | """
175 | flat_params = []
176 | for i, param in enumerate(params):
177 | param_groups = []
178 | for val in param['vals']:
179 | param_groups.append({'idx': i, 'val': val})
180 | flat_params.append(param_groups)
181 | return flat_params
182 |
183 | def __generate_random_search_trials(self, params):
184 | results = []
185 |
186 | # ensures we have unique results
187 | seen_trials = set()
188 |
189 | # shuffle each param list
190 | potential_trials = 1
191 | for p in params:
192 | random.shuffle(p)
193 | potential_trials *= len(p)
194 |
195 |         # we can't sample more trials than are possible (nb_iterations=None means no cap)
196 |         max_iters = potential_trials if self.nb_iterations is None else min(potential_trials, self.nb_iterations)
197 |
198 | # then for the nb of trials requested, create a new param tuple
199 | # by picking a random integer at each param level
200 | while len(results) < max_iters:
201 | trial = []
202 | for param in params:
203 | p = random.sample(param, 1)[0]
204 | trial.append(p)
205 |
206 | # verify this is a unique trial so we
207 | # don't duplicate work
208 | trial_str = json.dumps(trial)
209 | if trial_str not in seen_trials:
210 | seen_trials.add(trial_str)
211 | results.append(trial)
212 |
213 | return results
214 |
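# ----------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the original module).
# The optimizer is handed to your training function, which asks it for
# parameter values; on the first call the initial samples are returned,
# afterwards values come from the generated trials.
# ----------------------------------------------------------------------
if __name__ == '__main__':
    def train_fn(opt):
        # ask the optimizer for this trial's hyperparameter values
        lr = opt.tune_uniform(low=0.0001, high=0.01, samples=5, default=0.001, name='lr')
        nb_layers = opt.tune_choice(options=[2, 4, 8], default=2, name='nb_layers')

        # ... train a model here and return a dict of results to log
        return {'lr': lr, 'nb_layers': nb_layers, 'loss': 0.0}

    optimizer = HyperParamOptimizer(method='grid_search')
    optimizer.optimize(train_fn, nb_iterations=6)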
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Test Tube
8 |
9 |
10 | Log, organize and parallelize hyperparameter search for Deep Learning experiments
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 | ## Docs
20 |
21 | **[View the docs here](https://williamfalcon.github.io/test-tube/)**
22 |
23 | ---
24 |
25 | Test tube is a python library to track and parallelize hyperparameter
26 | search for Deep Learning and ML experiments. It's framework agnostic and
27 | built on top of the python argparse API for ease of use.
28 |
29 | ``` {.bash}
30 | pip install test_tube
31 | ```
32 |
33 | ---
34 |
35 | ### Main test-tube uses
36 |
37 | - [Parallelize hyperparameter
38 |   optimization](https://williamfalcon.github.io/test-tube/hyperparameter_optimization/HyperOptArgumentParser/)
39 |   across multiple gpus or cpus.
40 | - [Parallelize hyperparameter
41 |   optimization](https://williamfalcon.github.io/test-tube/hyperparameter_optimization/HyperOptArgumentParser/)
42 |   across an HPC cluster using SLURM.
43 | - Log hyperparameters and experiment data with
44 |   [Experiments](https://williamfalcon.github.io/test-tube/experiment_tracking/experiment/),
45 |   and track them across models.
46 | - Visualize with [tensorboard](https://www.tensorflow.org/guide/summaries_and_tensorboard).
47 |
48 | Compatible with any Python ML library: TensorFlow, Keras, PyTorch, Caffe, Caffe2, Chainer, MXNet, Theano, scikit-learn.
49 |
50 | ---
51 | ### Examples
52 | The Experiment object is a subclass of PyTorch's SummaryWriter.
53 |
54 | **Log and visualize with Tensorboard**
55 |
56 | ```{.python}
57 | from test_tube import Experiment
58 | import numpy as np
59 | import torch
60 |
61 | exp = Experiment('/some/path')
62 | exp.tag({'learning_rate': 0.02, 'layers': 4})
63 |
64 | # exp is a subclass of SummaryWriter, so the TensorBoard API is available
65 | features = torch.Tensor(100, 784)
66 | exp.add_embedding(features, metadata=label, label_img=images.unsqueeze(1))
67 |
68 | # simulate training
69 | for n_iter in range(2000):
70 |     exp.log({'testtt': n_iter * np.sin(n_iter)})
71 | # save and close
72 | exp.save()
73 | exp.close()
74 | ```
75 |
76 | ```{.bash}
77 | pip install tensorflow
78 |
79 | tensorboard --logdir /some/path
80 | ```
81 |
82 | **Run grid search on SLURM GPU cluster**
83 |
84 | ``` {.python}
85 | from test_tube.hpc import SlurmCluster
86 |
87 | # hyperparameters is a test-tube hyper params object
88 | hyperparams = args.parse()
89 |
90 | # init cluster
91 | cluster = SlurmCluster(
92 | hyperparam_optimizer=hyperparams,
93 | log_path='/path/to/log/results/to',
94 | python_cmd='python3'
95 | )
96 |
97 | # let the cluster know where to email for a change in job status (ie: complete, fail, etc...)
98 | cluster.notify_job_status(email='some@email.com', on_done=True, on_fail=True)
99 |
100 | # set the job options. In this instance, we'll run 20 different models
101 | # each with its own set of hyperparameters giving each one 1 GPU (ie: taking up 20 GPUs)
102 | cluster.per_experiment_nb_gpus = 1
103 | cluster.per_experiment_nb_nodes = 1
104 |
105 | # run the models on the cluster
106 | cluster.optimize_parallel_cluster_gpu(train, nb_trials=20, job_name='first_tt_batch', job_display_name='my_batch')
107 |
108 | # we just ran 20 different hyperparameters on 20 GPUs in the HPC cluster!!
109 | ```
110 |
111 | **Optimize hyperparameters across GPUs**
112 |
113 | ``` {.python}
114 | from test_tube import HyperOptArgumentParser
115 |
116 | # subclass of argparse
117 | parser = HyperOptArgumentParser(strategy='random_search')
118 | parser.add_argument('--learning_rate', default=0.002, type=float, help='the learning rate')
119 |
120 | # let's enable optimizing over the number of layers in the network
121 | parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8])
122 |
123 | # and tune the number of units in each layer
124 | parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=10)
125 |
126 | # compile (because it's argparse underneath)
127 | hparams = parser.parse_args()
128 |
129 | # optimize across 4 gpus
130 | # use 2 gpus together and the other two separately
131 | hparams.optimize_parallel_gpu(MyModel.fit, gpu_ids=['1', '2,3', '0'], nb_trials=192, nb_workers=4)
132 | ```
133 |
134 | Or... across CPUs
135 |
136 | ``` {.python}
137 | hparams.optimize_parallel_cpu(MyModel.fit, nb_trials=192, nb_workers=12)
138 | ```
139 |
140 | You can also optimize on a *log* scale to allow better search over
141 | magnitudes of hyperparameter values, with a chosen base (disabled by
142 | default). Keep in mind that the range you search over must be strictly
143 | positive.
144 |
145 | ``` {.python}
146 | from test_tube import HyperOptArgumentParser
147 |
148 | # subclass of argparse
149 | parser = HyperOptArgumentParser(strategy='random_search')
150 |
151 | # Randomly searches over the (log-transformed) range [100,800).
152 |
153 | parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=10, log_base=10)
154 |
155 |
156 | # compile (because it's argparse underneath)
157 | hparams = parser.parse_args()
158 |
159 | # run 20 trials of random search over the hyperparams
160 | for hparam_trial in hparams.trials(20):
161 | train_network(hparam_trial)
162 | ```
163 |
164 | ### Convert your argparse params into searchable params by changing 1 line
165 |
166 | ``` {.python}
167 | import argparse
168 | from test_tube import HyperOptArgumentParser
169 |
170 | # these lines are equivalent
171 | parser = argparse.ArgumentParser(description='Process some integers.')
172 | parser = HyperOptArgumentParser(description='Process some integers.', strategy='grid_search')
173 |
174 | # do normal argparse stuff
175 | ...
176 | ```
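
After the swap, everything else stays plain argparse; any parameter can then be opted into the search with `opt_list`/`opt_range` (the `--batch_size` argument below is just a placeholder):

``` {.python}
# existing argparse arguments keep working as before
parser.add_argument('--batch_size', default=32, type=int)

# and any parameter can be made searchable
parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8])
```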
177 |
178 | ### Log images inline with metrics
179 |
180 | ``` {.python}
181 | # name must have either jpg, png or jpeg in it
182 | img = np.imread('a.jpg')
183 | exp.log({'test_jpg': img, 'val_err': 0.2})
184 |
185 | # saves image to ../exp/version/media/test_0.jpg
186 | # csv has file path to that image in that cell
187 | ```
188 |
189 | ## Demos
190 |
191 | - [Hyperparameter optimization for PyTorch across 20 cluster GPUs](https://github.com/williamFalcon/test-tube/blob/master/examples/pytorch_hpc_example.py)
192 | - [Hyperparameter optimization across 20 cluster CPUs](https://github.com/williamFalcon/test-tube/blob/master/examples/hpc_cpu_example.py)
193 | - [Experiments and hyperparameter optimization for tensorflow across 4 GPUs simultaneously](https://github.com/williamFalcon/test-tube/blob/master/examples/tensorflow_example.py)
194 |
195 | ## How to contribute
196 |
197 | Feel free to fix bugs and make improvements!
198 |
199 | 1. Check out the [current bugs here](https://github.com/williamFalcon/test-tube/issues) or
200 |    [feature requests](https://github.com/williamFalcon/test-tube/projects/1).
201 | 2. To work on a bug or feature, head over to our [project
202 |    page](https://github.com/williamFalcon/test-tube/projects/1) and assign yourself the bug.
203 | 3. We'll add contributor names periodically as people improve the library!
204 |
205 |
206 | ## Bibtex
207 |
208 | To cite the framework use:
209 |
210 | @misc{Falcon2017,
211 | author = {Falcon, W.A.},
212 | title = {Test Tube},
213 | year = {2017},
214 | publisher = {GitHub},
215 | journal = {GitHub repository},
216 | howpublished = {\url{https://github.com/williamfalcon/test-tube}}
217 | }
218 |
219 | ## License
220 | In addition to the terms outlined in the license, this software is U.S. Patent Pending.
221 |
--------------------------------------------------------------------------------
/site/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 | Test Tube: Easily log and tune Deep Learning experiments - Test tube Documentation
12 |
13 |
14 |
15 |
16 |
17 |
18 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
46 |
47 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 | Test tube Documentation
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 | Docs »
120 |
121 |
122 |
123 | Test Tube: Easily log and tune Deep Learning experiments
124 |
125 |
126 | Edit on GitHub
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
Test Tube: Easily log and tune Deep Learning experiments
137 |
Test Tube allows you to easily log metadata and track your machine
138 | learning experiments.
139 |
Use Test Tube if you need to:
140 |
141 | Track many Experiments across
142 | models.
143 | Visualize and compare different
144 | experiments without uploading anywhere.
145 | Optimize your
146 | hyperparameters
147 | using grid search or random search.
148 | Automatically track ALL parameters for a particular training run.
149 |
150 |
Test Tube is compatible with: Python 2 and 3
151 |
Getting started
152 |
153 |
154 |
from test_tube import Experiment
155 |
156 | exp = Experiment(name='dense_model',
157 | debug=False,
158 | save_dir='/Desktop/test_tube')
159 |
160 | exp.tag({'learning_rate': 0.002, 'nb_layers': 2})
161 |
162 | for step in training_steps:
163 | tng_err = model.eval(tng_x, tng_y)
164 |
165 | exp.log({'tng_err': tng_err})
166 |
167 | # training complete!
168 | # all your logs and data are ready to be visualized at testtube.williamfalcon.com
169 |
170 |
171 |
172 |
173 |
from test_tube import HyperOptArgumentParser
174 |
175 | # subclass of argparse
176 | parser = HyperOptArgumentParser(strategy='random_search')
177 | parser.add_argument('--learning_rate', default=0.002, type=float, help='the learning rate')
178 |
179 | # let's enable optimizing over the number of layers in the network
180 | parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8])
181 |
182 | # and tune the number of units in each layer
183 | parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=10)
184 |
185 | # compile (because it's argparse underneath)
186 | hparams = parser.parse_args()
187 |
188 | # run 20 trials of random search over the hyperparams
189 | for hparam_trial in hparams.trials(20):
190 | train_network(hparam_trial)
191 |
192 |
193 |
194 |
Visualize
195 |
import pandas as pd
196 | import matplotlib
197 |
198 | # each experiment is saved to a metrics.csv file which can be imported anywhere
199 | # images save to exp/version/images
200 | df = pd.read_csv('../some/dir/test_tube_data/dense_model/version_0/metrics.csv')
201 | df.tng_err.plot()
202 |
203 |
204 |
205 |
206 |
207 |
208 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
222 |
223 | Built with MkDocs using a theme provided by Read the Docs .
224 |
225 |
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
235 |
236 | GitHub
237 |
238 |
239 |
240 | Next »
241 |
242 |
243 |
244 |
245 |
246 |
247 |
248 |
249 |
250 |
251 |
255 |
--------------------------------------------------------------------------------
/site/js/modernizr-2.8.3.min.js:
--------------------------------------------------------------------------------
1 | window.Modernizr=function(e,t,n){function r(e){b.cssText=e}function o(e,t){return r(S.join(e+";")+(t||""))}function a(e,t){return typeof e===t}function i(e,t){return!!~(""+e).indexOf(t)}function c(e,t){for(var r in e){var o=e[r];if(!i(o,"-")&&b[o]!==n)return"pfx"==t?o:!0}return!1}function s(e,t,r){for(var o in e){var i=t[e[o]];if(i!==n)return r===!1?e[o]:a(i,"function")?i.bind(r||t):i}return!1}function u(e,t,n){var r=e.charAt(0).toUpperCase()+e.slice(1),o=(e+" "+k.join(r+" ")+r).split(" ");return a(t,"string")||a(t,"undefined")?c(o,t):(o=(e+" "+T.join(r+" ")+r).split(" "),s(o,t,n))}function l(){p.input=function(n){for(var r=0,o=n.length;o>r;r++)j[n[r]]=!!(n[r]in E);return j.list&&(j.list=!(!t.createElement("datalist")||!e.HTMLDataListElement)),j}("autocomplete autofocus list placeholder max min multiple pattern required step".split(" ")),p.inputtypes=function(e){for(var r,o,a,i=0,c=e.length;c>i;i++)E.setAttribute("type",o=e[i]),r="text"!==E.type,r&&(E.value=x,E.style.cssText="position:absolute;visibility:hidden;",/^range$/.test(o)&&E.style.WebkitAppearance!==n?(g.appendChild(E),a=t.defaultView,r=a.getComputedStyle&&"textfield"!==a.getComputedStyle(E,null).WebkitAppearance&&0!==E.offsetHeight,g.removeChild(E)):/^(search|tel)$/.test(o)||(r=/^(url|email)$/.test(o)?E.checkValidity&&E.checkValidity()===!1:E.value!=x)),P[e[i]]=!!r;return P}("search tel url email datetime date month week time datetime-local number range color".split(" "))}var d,f,m="2.8.3",p={},h=!0,g=t.documentElement,v="modernizr",y=t.createElement(v),b=y.style,E=t.createElement("input"),x=":)",w={}.toString,S=" -webkit- -moz- -o- -ms- ".split(" "),C="Webkit Moz O ms",k=C.split(" "),T=C.toLowerCase().split(" "),N={svg:"http://www.w3.org/2000/svg"},M={},P={},j={},$=[],D=$.slice,F=function(e,n,r,o){var a,i,c,s,u=t.createElement("div"),l=t.body,d=l||t.createElement("body");if(parseInt(r,10))for(;r--;)c=t.createElement("div"),c.id=o?o[r]:v+(r+1),u.appendChild(c);return a=["",'"].join(""),u.id=v,(l?u:d).innerHTML+=a,d.appendChild(u),l||(d.style.background="",d.style.overflow="hidden",s=g.style.overflow,g.style.overflow="hidden",g.appendChild(d)),i=n(u,e),l?u.parentNode.removeChild(u):(d.parentNode.removeChild(d),g.style.overflow=s),!!i},z=function(t){var n=e.matchMedia||e.msMatchMedia;if(n)return n(t)&&n(t).matches||!1;var r;return F("@media "+t+" { #"+v+" { position: absolute; } }",function(t){r="absolute"==(e.getComputedStyle?getComputedStyle(t,null):t.currentStyle).position}),r},A=function(){function e(e,o){o=o||t.createElement(r[e]||"div"),e="on"+e;var i=e in o;return i||(o.setAttribute||(o=t.createElement("div")),o.setAttribute&&o.removeAttribute&&(o.setAttribute(e,""),i=a(o[e],"function"),a(o[e],"undefined")||(o[e]=n),o.removeAttribute(e))),o=null,i}var r={select:"input",change:"input",submit:"form",reset:"form",error:"img",load:"img",abort:"img"};return e}(),L={}.hasOwnProperty;f=a(L,"undefined")||a(L.call,"undefined")?function(e,t){return t in e&&a(e.constructor.prototype[t],"undefined")}:function(e,t){return L.call(e,t)},Function.prototype.bind||(Function.prototype.bind=function(e){var t=this;if("function"!=typeof t)throw new TypeError;var n=D.call(arguments,1),r=function(){if(this instanceof r){var o=function(){};o.prototype=t.prototype;var a=new o,i=t.apply(a,n.concat(D.call(arguments)));return Object(i)===i?i:a}return t.apply(e,n.concat(D.call(arguments)))};return r}),M.flexbox=function(){return u("flexWrap")},M.flexboxlegacy=function(){return u("boxDirection")},M.canvas=function(){var 
e=t.createElement("canvas");return!(!e.getContext||!e.getContext("2d"))},M.canvastext=function(){return!(!p.canvas||!a(t.createElement("canvas").getContext("2d").fillText,"function"))},M.webgl=function(){return!!e.WebGLRenderingContext},M.touch=function(){var n;return"ontouchstart"in e||e.DocumentTouch&&t instanceof DocumentTouch?n=!0:F(["@media (",S.join("touch-enabled),("),v,")","{#modernizr{top:9px;position:absolute}}"].join(""),function(e){n=9===e.offsetTop}),n},M.geolocation=function(){return"geolocation"in navigator},M.postmessage=function(){return!!e.postMessage},M.websqldatabase=function(){return!!e.openDatabase},M.indexedDB=function(){return!!u("indexedDB",e)},M.hashchange=function(){return A("hashchange",e)&&(t.documentMode===n||t.documentMode>7)},M.history=function(){return!(!e.history||!history.pushState)},M.draganddrop=function(){var e=t.createElement("div");return"draggable"in e||"ondragstart"in e&&"ondrop"in e},M.websockets=function(){return"WebSocket"in e||"MozWebSocket"in e},M.rgba=function(){return r("background-color:rgba(150,255,150,.5)"),i(b.backgroundColor,"rgba")},M.hsla=function(){return r("background-color:hsla(120,40%,100%,.5)"),i(b.backgroundColor,"rgba")||i(b.backgroundColor,"hsla")},M.multiplebgs=function(){return r("background:url(https://),url(https://),red url(https://)"),/(url\s*\(.*?){3}/.test(b.background)},M.backgroundsize=function(){return u("backgroundSize")},M.borderimage=function(){return u("borderImage")},M.borderradius=function(){return u("borderRadius")},M.boxshadow=function(){return u("boxShadow")},M.textshadow=function(){return""===t.createElement("div").style.textShadow},M.opacity=function(){return o("opacity:.55"),/^0.55$/.test(b.opacity)},M.cssanimations=function(){return u("animationName")},M.csscolumns=function(){return u("columnCount")},M.cssgradients=function(){var e="background-image:",t="gradient(linear,left top,right bottom,from(#9f9),to(white));",n="linear-gradient(left top,#9f9, white);";return r((e+"-webkit- ".split(" ").join(t+e)+S.join(n+e)).slice(0,-e.length)),i(b.backgroundImage,"gradient")},M.cssreflections=function(){return u("boxReflect")},M.csstransforms=function(){return!!u("transform")},M.csstransforms3d=function(){var e=!!u("perspective");return e&&"webkitPerspective"in g.style&&F("@media (transform-3d),(-webkit-transform-3d){#modernizr{left:9px;position:absolute;height:3px;}}",function(t){e=9===t.offsetLeft&&3===t.offsetHeight}),e},M.csstransitions=function(){return u("transition")},M.fontface=function(){var e;return F('@font-face {font-family:"font";src:url("https://")}',function(n,r){var o=t.getElementById("smodernizr"),a=o.sheet||o.styleSheet,i=a?a.cssRules&&a.cssRules[0]?a.cssRules[0].cssText:a.cssText||"":"";e=/src/i.test(i)&&0===i.indexOf(r.split(" ")[0])}),e},M.generatedcontent=function(){var e;return F(["#",v,"{font:0/0 a}#",v,':after{content:"',x,'";visibility:hidden;font:3px/1 a}'].join(""),function(t){e=t.offsetHeight>=3}),e},M.video=function(){var e=t.createElement("video"),n=!1;try{(n=!!e.canPlayType)&&(n=new Boolean(n),n.ogg=e.canPlayType('video/ogg; codecs="theora"').replace(/^no$/,""),n.h264=e.canPlayType('video/mp4; codecs="avc1.42E01E"').replace(/^no$/,""),n.webm=e.canPlayType('video/webm; codecs="vp8, vorbis"').replace(/^no$/,""))}catch(r){}return n},M.audio=function(){var e=t.createElement("audio"),n=!1;try{(n=!!e.canPlayType)&&(n=new Boolean(n),n.ogg=e.canPlayType('audio/ogg; codecs="vorbis"').replace(/^no$/,""),n.mp3=e.canPlayType("audio/mpeg;").replace(/^no$/,""),n.wav=e.canPlayType('audio/wav; 
codecs="1"').replace(/^no$/,""),n.m4a=(e.canPlayType("audio/x-m4a;")||e.canPlayType("audio/aac;")).replace(/^no$/,""))}catch(r){}return n},M.localstorage=function(){try{return localStorage.setItem(v,v),localStorage.removeItem(v),!0}catch(e){return!1}},M.sessionstorage=function(){try{return sessionStorage.setItem(v,v),sessionStorage.removeItem(v),!0}catch(e){return!1}},M.webworkers=function(){return!!e.Worker},M.applicationcache=function(){return!!e.applicationCache},M.svg=function(){return!!t.createElementNS&&!!t.createElementNS(N.svg,"svg").createSVGRect},M.inlinesvg=function(){var e=t.createElement("div");return e.innerHTML=" ",(e.firstChild&&e.firstChild.namespaceURI)==N.svg},M.smil=function(){return!!t.createElementNS&&/SVGAnimate/.test(w.call(t.createElementNS(N.svg,"animate")))},M.svgclippaths=function(){return!!t.createElementNS&&/SVGClipPath/.test(w.call(t.createElementNS(N.svg,"clipPath")))};for(var H in M)f(M,H)&&(d=H.toLowerCase(),p[d]=M[H](),$.push((p[d]?"":"no-")+d));return p.input||l(),p.addTest=function(e,t){if("object"==typeof e)for(var r in e)f(e,r)&&p.addTest(r,e[r]);else{if(e=e.toLowerCase(),p[e]!==n)return p;t="function"==typeof t?t():t,"undefined"!=typeof h&&h&&(g.className+=" "+(t?"":"no-")+e),p[e]=t}return p},r(""),y=E=null,function(e,t){function n(e,t){var n=e.createElement("p"),r=e.getElementsByTagName("head")[0]||e.documentElement;return n.innerHTML="x",r.insertBefore(n.lastChild,r.firstChild)}function r(){var e=y.elements;return"string"==typeof e?e.split(" "):e}function o(e){var t=v[e[h]];return t||(t={},g++,e[h]=g,v[g]=t),t}function a(e,n,r){if(n||(n=t),l)return n.createElement(e);r||(r=o(n));var a;return a=r.cache[e]?r.cache[e].cloneNode():p.test(e)?(r.cache[e]=r.createElem(e)).cloneNode():r.createElem(e),!a.canHaveChildren||m.test(e)||a.tagUrn?a:r.frag.appendChild(a)}function i(e,n){if(e||(e=t),l)return e.createDocumentFragment();n=n||o(e);for(var a=n.frag.cloneNode(),i=0,c=r(),s=c.length;s>i;i++)a.createElement(c[i]);return a}function c(e,t){t.cache||(t.cache={},t.createElem=e.createElement,t.createFrag=e.createDocumentFragment,t.frag=t.createFrag()),e.createElement=function(n){return y.shivMethods?a(n,e,t):t.createElem(n)},e.createDocumentFragment=Function("h,f","return function(){var n=f.cloneNode(),c=n.createElement;h.shivMethods&&("+r().join().replace(/[\w\-]+/g,function(e){return t.createElem(e),t.frag.createElement(e),'c("'+e+'")'})+");return n}")(y,t.frag)}function s(e){e||(e=t);var r=o(e);return!y.shivCSS||u||r.hasCSS||(r.hasCSS=!!n(e,"article,aside,dialog,figcaption,figure,footer,header,hgroup,main,nav,section{display:block}mark{background:#FF0;color:#000}template{display:none}")),l||c(e,r),e}var u,l,d="3.7.0",f=e.html5||{},m=/^<|^(?:button|map|select|textarea|object|iframe|option|optgroup)$/i,p=/^(?:a|b|code|div|fieldset|h1|h2|h3|h4|h5|h6|i|label|li|ol|p|q|span|strong|style|table|tbody|td|th|tr|ul)$/i,h="_html5shiv",g=0,v={};!function(){try{var e=t.createElement("a");e.innerHTML=" ",u="hidden"in e,l=1==e.childNodes.length||function(){t.createElement("a");var e=t.createDocumentFragment();return"undefined"==typeof e.cloneNode||"undefined"==typeof e.createDocumentFragment||"undefined"==typeof e.createElement}()}catch(n){u=!0,l=!0}}();var y={elements:f.elements||"abbr article aside audio bdi canvas data datalist details dialog figcaption figure footer header hgroup main mark meter nav output progress section summary template time 
video",version:d,shivCSS:f.shivCSS!==!1,supportsUnknownElements:l,shivMethods:f.shivMethods!==!1,type:"default",shivDocument:s,createElement:a,createDocumentFragment:i};e.html5=y,s(t)}(this,t),p._version=m,p._prefixes=S,p._domPrefixes=T,p._cssomPrefixes=k,p.mq=z,p.hasEvent=A,p.testProp=function(e){return c([e])},p.testAllProps=u,p.testStyles=F,p.prefixed=function(e,t,n){return t?u(e,t,n):u(e,"pfx")},g.className=g.className.replace(/(^|\s)no-js(\s|$)/,"$1$2")+(h?" js "+$.join(" "):""),p}(this,this.document);
--------------------------------------------------------------------------------
/site/experiment_tracking/experiment/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 | Experiment class API - Test tube Documentation
12 |
13 |
14 |
15 |
16 |
17 |
18 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
46 |
47 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 | Test tube Documentation
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 | Docs »
122 |
123 |
124 |
125 | Experiment tracking »
126 |
127 |
128 |
129 | Experiment class API
130 |
131 |
132 | Edit on GitHub
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
Experiment class API
143 |
[Github Code ]
144 |
An Experiment holds metadata and the results of the training run, you
145 | can instantiate an Experiment via:
146 |
from test_tube import Experiment
147 |
148 | exp = Experiment(name='dense_model',
149 | debug=False,
150 | save_dir='/Desktop/test_tube')
151 |
152 | exp.tag({'learning_rate': 0.002, 'nb_layers': 2})
153 |
154 | for step in training_steps:
155 | tng_err = model.eval(tng_x, tng_y)
156 |
157 | exp.log({'tng_err': tng_err})
158 |
159 | # training complete!
160 | # all your logs and data are ready to be visualized at testtube.williamfalcon.com
161 |
162 |
163 |
164 |
init options
165 |
version
166 |
The same Experiment can have multiple versions. Test tube generates
167 | these automatically each time you run your model. To set your own
168 | version use:
169 |
exp = Experiment(name='dense_model',version=1)
170 |
171 |
172 |
debug
173 |
If you're debugging and don't want to create a log file turn debug to
174 | True
175 |
exp = Experiment(name='dense_model',debug=True)
176 |
177 |
178 |
autosave
179 |
If you only want to save at the end of training, turn autosave off:
180 |
exp = Experiment(name='dense_model', autosave=False)
181 |
182 | # run long training...
183 |
184 | # first time any logs are saved
185 | exp.save()
186 |
187 |
188 |
create_git_tag
189 |
Ever wanted a flashback to your code when you ran an experiment?
190 | Snapshot your code for this experiment using git tags:
191 |
exp = Experiment(name='dense_model', create_git_tag=True)
192 |
193 |
194 |
195 |
Methods
196 |
tag
197 |
exp.tag({k: v})
198 |
199 |
200 |
Adds an arbitrary dictionary of tags to the experiment
201 |
Example
202 |
exp.tag({'dataset_name': 'imagenet_1', 'learning_rate': 0.0002})
203 |
204 |
205 |
log
206 |
exp.log({k:v})
207 |
208 |
209 |
Adds a row of data to the experiments
210 |
Example
211 |
exp.log({'val_loss': 0.22, 'epoch_nb': 1, 'batch_nb': 12})
212 |
213 | # you can also add other rows that have separate information
214 | exp.log({'tng_loss': 0.01})
215 |
216 | # or even a numpy array image
217 | image = np.imread('image.png')
218 | exp.log({'fake_png': image})
219 |
220 |
221 |
Saving images Example
222 |
# name must have either jpg, png or jpeg in it
223 | img = np.imread('a.jpg')
224 | exp.log({'test_jpg': img, 'val_err': 0.2})
225 |
226 | # saves image to ../exp/version/media/test_0.jpg
227 | # csv has file path to that image in that cell
228 |
229 |
230 |
To save an image, add jpg, png or jpeg to the key corresponding
231 | with the image array. The image must be formatted the same as skimage's
232 | imsave
233 | function
234 |
argparse
235 |
exp.argparse(hparams)
236 |
237 |
238 |
Transfers hyperparam information from Argparser or
239 | HyperOptArgumentParser
240 |
Example
241 |
from test_tube import HyperOptArgumentParser
242 |
243 | # parse args
244 | parser = HyperOptArgumentParser()
245 | parser.add_argument('--learning_rate', default=0.002, type=float, help='the learning rate')
246 | hparams = parser.parse_args()
247 |
248 | # learning_rate is now a meta tag for your experiment
249 | exp.argparse(hparams)
250 |
251 |
252 |
save
253 |
exp.save()
254 |
255 |
256 |
Saves the exp to disk (including images)
257 |
Example
258 |
exp = Experiment(name='dense_model', autosave=False)
259 |
260 | # run long training...
261 |
262 | # first time any logs are saved
263 | exp.save()
264 |
265 |
266 |
267 |
268 |
269 |
270 |
278 |
279 |
280 |
281 |
282 |
283 |
284 |
285 |
286 |
287 | Built with MkDocs using a theme provided by Read the Docs .
288 |
289 |
290 |
291 |
292 |
293 |
294 |
295 |
296 |
297 |
310 |
311 |
312 |
313 |
314 |
315 |
316 |
--------------------------------------------------------------------------------
/docs/hpc/SlurmCluster.md:
--------------------------------------------------------------------------------
1 | # SlurmCluster class API
2 |
3 | [[Github Code](https://github.com/williamFalcon/test-tube/blob/master/test_tube/hpc.py)]
4 |
5 | The SlurmCluster class enables hyperparameter search parallelization on a cluster managed via [Slurm workload manager](https://slurm.schedmd.com/).
6 |
7 | At a high level, the SlurmCluster creates a submit script for each permutation of hyperparameters requested. If the job hits the walltime but has not completed, the SlurmCluster will checkpoint the model and submit a new job to continue training using the saved weights.
8 |
9 | - Here's a [full GPU PyTorch example](https://github.com/williamFalcon/test-tube/blob/master/examples/pytorch_hpc_example.py).
10 | - Here's a [full CPU example](https://github.com/williamFalcon/test-tube/blob/master/examples/hpc_cpu_example.py).
11 |
12 | You can instantiate a `SlurmCluster` via:
13 |
14 | ``` {.python}
15 | from test_tube.hpc import SlurmCluster
16 |
17 | # hyperparameters is a test-tube hyper params object
18 | # see https://williamfalcon.github.io/test-tube/hyperparameter_optimization/HyperOptArgumentParser/
19 | hyperparams = args.parse()
20 |
21 | # init cluster
22 | cluster = SlurmCluster(
23 | hyperparam_optimizer=hyperparams,
24 | log_path='/path/to/log/results/to',
25 | python_cmd='python3'
26 | )
27 |
28 | # let the cluster know where to email for a change in job status (ie: complete, fail, etc...)
29 | cluster.notify_job_status(email='some@email.com', on_done=True, on_fail=True)
30 |
31 | # set the job options. In this instance, we'll run 20 different models
32 | # each with its own set of hyperparameters giving each one 1 GPU (ie: taking up 20 GPUs)
33 | cluster.per_experiment_nb_gpus = 1
34 | cluster.per_experiment_nb_nodes = 1
35 |
36 | # we'll request 10GB of memory per node
37 | cluster.memory_mb_per_node = 10000
38 |
39 | # set a walltime of 10 minutes
40 | cluster.job_time = '10:00'
41 |
42 | # 1 minute before walltime is up, SlurmCluster will launch a continuation job and kill this job.
43 | # you must provide your own loading and saving function which the cluster object will call
44 | cluster.minutes_to_checkpoint_before_walltime = 1
45 |
46 | # run the models on the cluster
47 | cluster.optimize_parallel_cluster_gpu(train, nb_trials=20, job_name='first_tt_batch', job_display_name='my_batch')
48 | ```
49 |
50 | ------------------------------------------------------------------------
51 |
52 | ## init options
53 |
54 | ### `hyperparam_optimizer`
55 |
56 | A [HyperOptArgumentParser](https://williamfalcon.github.io/test-tube/hyperparameter_optimization/HyperOptArgumentParser/) object
57 | which contains all permutations of model hyperparameters to run.
58 |
59 | ### `log_path`
60 |
61 | Path to save the slurm scripts, error logs and out logs created. Usually this would be the experiments folder path where test tube saves [Experiment](https://williamfalcon.github.io/test-tube/experiment_tracking/experiment/) information.
62 |
63 | ### `python_cmd`
64 |
65 | This is the command that starts the python program. Normally it is:
66 |
67 | ``` {.bash}
68 | # python 2
69 | python main.py
70 |
71 | # python 3
72 | python3 main.py
73 | ```
74 |
75 | ### `enable_log_err`
76 |
77 | If true, saves slurm error logs to the path at *log_path*. If anything goes wrong in your job, you'll find the error here.
78 |
79 | ### `enable_log_out`
80 |
81 | If true, saves slurm output logs to the path at *log_path*. This file contains all outputs that would show up on the console normally.
82 |
83 | ### `test_tube_exp_name`
84 |
85 | When this is given, it structures the files in a nice format to fit with the folder structure of the [Experiment](https://williamfalcon.github.io/test-tube/experiment_tracking/experiment/) object's output.
86 |
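Putting the init options together, a minimal sketch (it reuses the `hyperparams` object from the intro example; `'my_exp'` is a placeholder name):

``` {.python}
cluster = SlurmCluster(
    hyperparam_optimizer=hyperparams,
    log_path='/path/to/log/results/to',
    python_cmd='python3',
    enable_log_err=True,
    enable_log_out=True,
    test_tube_exp_name='my_exp'  # match your test tube Experiment name
)
```
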
87 | ## Properties
88 |
89 | `job_time`
90 | String. Walltime requested. Examples:
91 | ```{.python}
92 | # 1 hour and 10 minutes
93 | cluster.job_time = '1:10:00'
94 |
95 | # 1 day and 1 hour and 10 minutes
96 | cluster.job_time = '1-1:10:00'
97 |
98 | # the same walltime expressed in hours (25:10:00 == 1-1:10:00)
99 | cluster.job_time = '25:10:00'
100 |
101 | # 10 minutes
102 | cluster.job_time = '10:00'
103 |
104 | # 10 seconds
105 | cluster.job_time = '10'
106 | ```
107 |
108 | `minutes_to_checkpoint_before_walltime`
109 | Int. Minutes before walltime when a continuation job will be auto-submitted.
110 | ```{.python}
111 | cluster.job_time = '10:00'
112 | cluster.minutes_to_checkpoint_before_walltime = 2
113 |
114 | # A new job will be submitted to continue training after the job has been running for 8 minutes.
115 | ```
116 |
117 | `per_experiment_nb_gpus`
118 | Int. Number of GPUs each job will get.
119 | ```{.python}
120 | # EACH job will get 2 GPUs (ie: if a model runs over two GPUs at the same time).
121 | cluster.per_experiment_nb_gpus = 2
122 | ```
123 |
124 | `per_experiment_nb_cpus`
125 | Int. Number of CPUs each job will get.
126 | ```{.python}
127 | cluster.per_experiment_nb_cpus = 1
128 | ```
129 |
130 | `per_experiment_nb_nodes`
131 | Int. Number of nodes each job will get.
132 | ```{.python}
133 | cluster.per_experiment_nb_nodes = 1
134 | ```
135 |
136 | `gpu_type`
137 | String. Gpu type requested. Example:
138 | ```{.python}
139 | cluster.gpu_type = '1080ti'
140 | ```
141 |
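`memory_mb_per_node`
Int. Memory (in MB) requested per node (shown in the intro example above; listed here for completeness).
```{.python}
cluster.memory_mb_per_node = 10000
```
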
142 | ------------------------------------------------------------------------
143 |
144 | ## Methods
145 |
146 | ### `set_checkpoint_save_function`
147 |
148 | ``` {.python}
149 | cluster.set_checkpoint_save_function(fx, kwargs)
150 | ```
151 |
152 | Called if the model isn't finished training *minutes_to_checkpoint_before_walltime* minutes before the walltime. If walltime = '15:00' and minutes_to_checkpoint_before_walltime = 1, the SlurmCluster will call your save function after 14 minutes of training.
153 |
154 | - ```fx``` A python function.
155 | - ```kwargs``` Dictionary where keys are the literal argument names to the function. Dictionary values are the values of the arguments.
156 |
157 | **Example**
158 |
159 | ``` {.python}
160 | def my_save_function(arg_1, arg_k):
161 | # ... save my model here
162 |
163 | cluster.set_checkpoint_save_function(my_save_function, kwargs={'arg_1': 'whatever', 'arg_k': 'you_want'})
164 |
165 | ```
166 |
167 | ### `set_checkpoint_load_function`
168 |
169 | ``` {.python}
170 | cluster.set_checkpoint_load_function(fx, kwargs)
171 | ```
172 |
173 | Registers the function called when a job is auto-submitted by the SlurmCluster, giving your program a chance to load the model weights or whatever else you need to continue training.
174 | If the current run is itself a continuation job, your load function is called immediately when you register it with this method.
175 |
176 | - ```fx``` A python function.
177 | - ```kwargs``` Dictionary where keys are the literal argument names to the function. Dictionary values are the values of the arguments.
178 |
179 | **Example**
180 |
181 | ``` {.python}
182 | def my_load_function(arg_1, arg_k):
183 | # ... restore my model here
184 |
185 | cluster.set_checkpoint_load_function(my_load_function, kwargs={'arg_1': 'whatever', 'arg_k': 'you_want'})
186 |
187 | ```
188 |
189 | ### `add_slurm_cmd`
190 |
191 | ``` {.python}
192 | cluster.add_slurm_cmd(cmd, value, comment)
193 | ```
194 |
195 | Adds whatever Slurm command you need manually to the generated script. All possible commands are listed [here](https://slurm.schedmd.com/pdfs/summary.pdf).
196 |
197 | - ```cmd``` String with the bash command.
198 | - ```value``` String value for the command. Numericals need to be in single quotes ```'1'```
199 | - ```comment``` String with the command comment.
200 |
201 | **Example**
202 |
203 | ``` {.python}
204 | cluster.add_slurm_cmd(cmd='cpus-per-task', value='1', comment='nb cpus per task')
205 |
206 | # the above command will add an entry like this to the slurm script
207 |
208 | # #nb cpus per task
209 | # #SBATCH --cpus-per-task=1
210 | # ############
211 |
212 | ```
213 |
214 | ### `add_command`
215 |
216 | ``` {.python}
217 | cluster.add_command(cmd)
218 | ```
219 |
220 | Adds arbitrary bash commands to the script. Use this to activate conda environments, install packages, or run anything else you would normally call from bash.
221 |
222 | - ```cmd``` String with your bash command.
223 |
224 | **Example**
225 |
226 |
227 | ``` {.python}
228 | # load the anaconda package on the launch node
229 | cluster.add_command('module load anaconda')
230 |
231 | # activate the environment on the launch node
232 | cluster.add_command('source activate myCondaEnv')
233 | ```
234 |
235 | ### `load_modules`
236 |
237 | ``` {.python}
238 | cluster.load_modules(modules)
239 | ```
240 |
241 | Loads modules needed to run the job. Your Slurm documentation should have a list of available modules. You can also get those by running ```module avail```.
242 | - ```modules``` Array of module names.
243 |
244 | **Example**
245 |
246 |
247 | ``` {.python}
248 | cluster.load_modules([
249 | 'python-3',
250 | 'anaconda3'
251 | ])
252 | ```
253 |
254 | ### `notify_job_status`
255 |
256 | ``` {.python}
257 | cluster.notify_job_status(email, on_done, on_fail)
258 | ```
259 |
260 | Sets up email notifications for changes in job status (completion and/or failure).
261 |
262 | - ```email``` String. Email address to get notifications.
263 | - ```on_done``` Boolean. If true, you'll get an email when the job completes.
264 | - ```on_fail``` Boolean. If true, you'll get an email if the job fails.
265 |
266 | **Example**
267 |
268 |
269 | ``` {.python}
270 | cluster.notify_job_status(email='some@email.com', on_done=True, on_fail=True)
271 | ```
272 |
273 | ### `optimize_parallel_cluster_gpu`
274 |
275 | ``` {.python}
276 | cluster.optimize_parallel_cluster_gpu(train_function, nb_trials, job_name, job_display_name=None)
277 | ```
278 |
279 | Launches the hyperparameter search across the cluster nodes.
280 | - ```train_function``` The entry point to start your training routine.
281 | - ```nb_trials``` Number of trials to launch. This is the number of hyperparameter configurations to train over.
282 | - ```job_name``` Folder name where the slurm scripts will save to. This should be the same as your [Experiment](https://williamfalcon.github.io/test-tube/experiment_tracking/experiment/) name.
283 | - ```job_display_name``` Visible name when slurm lists running jobs (ie: through ```squeue -u user_name```). This should be really short (if using a test tube Experiment, it'll put the experiment version at the end).
284 |
285 | **Example**
286 |
287 |
288 | ``` {.python}
289 | def main(hparams, cluster, return_dict):
290 | # do your own generic training code here...
291 | # init model
292 | model = model_build(hparams)
293 |
294 | # set the load and save fxs
295 | cluster.set_checkpoint_save_function(fx, {})
296 | cluster.set_checkpoint_load_function(fx, {})
297 |
298 | # train ...
299 |
300 |
301 | cluster.optimize_parallel_cluster_gpu(main, nb_trials=20, job_name='my_job', job_display_name='mj')
302 | ```
303 |
304 | Now if you get the job information, you'll see this:
305 | ``` {.bash}
306 | (conda_env) [user@node dir]$ squeue -u my_name
307 | JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
308 | 104040 all mjv0 my_name R 58:22 1 nodeName
309 | 104041 all mjv1 my_name R 58:22 1 nodeName
310 | 104042 all mjv2 my_name R 58:22 1 nodeName
311 | 104043 all mjv3 my_name R 58:22 1 nodeName
312 | ```
313 |
314 | ### `optimize_parallel_cluster_cpu`
315 |
316 | ``` {.python}
317 | cluster.optimize_parallel_cluster_cpu(train_function, nb_trials, job_name, job_display_name=None)
318 | ```
319 |
320 | Launches the hyperparameter search across the cluster nodes using cpus.
321 | - ```train_function``` The entry point to start your training routine.
322 | - ```nb_trials``` Number of trials to launch. This is the number of hyperparameter configurations to train over.
323 | - ```job_name``` Folder name where the slurm scripts will save to. This should be the same as your [Experiment](https://williamfalcon.github.io/test-tube/experiment_tracking/experiment/) name.
324 | - ```job_display_name``` Visible name when slurm lists running jobs (ie: through ```squeue -u user_name```). This should be really short (if using a test tube Experiment, it'll put the experiment version at the end).
325 |
326 | **Example**
327 |
328 |
329 | ``` {.python}
330 | def main(hparams, cluster, return_dict):
331 | # do your own generic training code here...
332 | # init model
333 | model = model_build(hparams)
334 |
335 | # set the load and save fxs
336 | cluster.set_checkpoint_save_function(fx, {})
337 | cluster.set_checkpoint_load_function(fx, {})
338 |
339 | # train ...
340 |
341 |
342 | cluster.optimize_parallel_cluster_cpu(main, nb_trials=20, job_name='my_job', job_display_name='mj')
343 | ```
344 |
345 | Now if you get the job information, you'll see this:
346 | ``` {.bash}
347 | (conda_env) [user@node dir]$ squeue -u my_name
348 | JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
349 | 104040 all mjv0 my_name R 58:22 1 nodeName
350 | 104041 all mjv1 my_name R 58:22 1 nodeName
351 | 104042 all mjv2 my_name R 58:22 1 nodeName
352 | 104043 all mjv3 my_name R 58:22 1 nodeName
353 | ```
354 |
--------------------------------------------------------------------------------
/site/hyperparameter_optimization/HyperOptArgumentParser/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 | HyperOptArgumentParser class API - Test tube Documentation
12 |
13 |
14 |
15 |
16 |
17 |
18 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
46 |
47 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 | Test tube Documentation
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 | Docs »
122 |
123 |
124 |
125 | Hyperparameter optimization »
126 |
127 |
128 |
129 | HyperOptArgumentParser class API
130 |
131 |
132 | Edit on GitHub
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
HyperOptArgumentParser class API
143 |
[Github Code ]
144 |
The HyperOptArgumentParser is a subclass of python's
145 | argparse , with added
146 | functionality to change parameters on the fly as determined by a
147 | sampling strategy.
148 |
You can instantiate an HyperOptArgumentParser via:
149 |
from test_tube import HyperOptArgumentParser
150 |
151 | # subclass of argparse
152 | parser = HyperOptArgumentParser(strategy='random_search')
153 | parser.add_argument('--learning_rate', default=0.002, type=float, help='the learning rate')
154 |
155 | # let's enable optimizing over the number of layers in the network
156 | parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8])
157 |
158 | # and tune the number of units in each layer
159 | parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=10)
160 |
161 | # compile (because it's argparse underneath)
162 | hparams = parser.parse_args()
163 |
164 | # run 20 trials of random search over the hyperparams
165 | for hparam_trial in hparams.trials(20):
166 | train_network(hparam_trial)
167 |
168 |
169 |
170 |
init options
171 |
strategy
172 |
Use either random
173 | search
174 | or grid
175 | search
176 | for tuning:
177 |
parser = HyperOptArgumentParser(strategy='grid_search')
178 |
179 |
180 |
181 |
Methods
182 |
All the functionality from argparse works but we've added the following
183 | functionality:
184 |
opt_list
185 |
parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8])
186 |
187 |
188 |
Enables searching over a list of values for this parameter. The tunable
189 | values ONLY replace the argparse values when running a hyperparameter
190 | optimization search. This is on purpose so your code doesn't have to
191 | change when you want to tune it.
192 |
Example
193 |
parser.opt_list('--nb_layers', default=2, type=int, tunable=True, options=[2, 4, 8])
194 | hparams = parser.parse_args()
195 | # hparams.nb_layers = 2
196 |
197 | for trial in hparams.trials(2):
198 | # trial.nb_layers is now a value in [2, 4, 8]
199 | # but hparams.nb_layers is still 2
200 |
201 |
202 |
opt_range
203 |
parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=8, log_base=None)
204 |
205 |
206 |
Enables searching over a range of values chosen randomly using the
207 | nb_samples given. The tunable values only replace the argparse
208 | values when running a hyperparameter optimization search. This is on
209 | purpose so your code doesn't have to change when you want to tune it.
210 |
If log_base is set to a positive number, it will randomly search over
211 | a log scale, where the log base is log_base. This makes it efficient to search
212 | over several orders of magnitude.
213 |
Example
214 |
parser.opt_range('--neurons', default=50, type=int, tunable=True, low=100, high=800, nb_samples=8)
215 | hparams = parser.parse_args()
216 | # hparams.neurons = 50
217 |
218 | for trial in hparams.trials(2):
219 |     # trial.neurons is now one of the 8 values sampled between 100 and 800
220 | # but hparams.neurons is still 50
221 |
222 |
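For parameters that span several orders of magnitude (a learning rate, for example), the log_base option described above is usually the better fit. A minimal sketch; the parameter name and bounds here are illustrative, not taken from the original docs:

# sample 8 values between 1e-5 and 1e-1, uniformly on a log10 scale
parser.opt_range('--learning_rate', default=0.001, type=float, tunable=True,
                 low=1e-5, high=1e-1, nb_samples=8, log_base=10)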
223 |
json_config
224 |
parser.json_config('--config', default='example.json')
225 |
226 |
227 |
Replaces default values in the parser with those read from the json file
228 |
Example
229 |
example.json
230 |
{
231 | "learning_rate": 200
232 | }
233 |
234 |
235 |
parser.add_argument('--learning_rate', default=0.002, type=float, help='the learning rate')
236 | parser.json_config('--config', default='example.json')
237 | hparams = parser.parse_args()
238 |
239 | # hparams.learning_rate = 200
240 |
241 |
242 |
trials
243 |
trial_generator = hparams.trials(2)
244 |
245 |
246 |
Computes the trials needed for these experiments and serves them via a
247 | generator
248 |
Example
249 |
hparams = parser.parse_args()
250 | for trial_hparams in hparams.trials(2):
251 |     # trial_hparams now has values sampled from the search space
252 |
253 |
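If you would rather have the trials as a concrete list than as a generator (for example, to inspect them before training), the parsed namespace also exposes generate_trials (see test_tube/argparse_hopt.py further down in this repo). A minimal sketch reusing the train_network function from the example at the top of this page:

hparams = parser.parse_args()
trials = hparams.generate_trials(20)  # a list of 20 trial namespaces

for trial_hparams in trials:
    train_network(trial_hparams)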
254 |
optimize_parallel_gpu
255 |
hparams = parser.parse_args()
256 | hparams.optimize_parallel_gpu(function_to_optimize, gpu_ids=['1', '0, 2'])
257 |
258 |
259 |
Parallelizes the trials across one process per entry in gpu_ids and auto-assigns the
260 | correct gpus. The function_to_optimize is called with the
261 | trial_params and the gpu id set it was assigned.
262 |
Example
263 |
# parallelize tuning across 2 gpu sets
264 | # this will place each of the n trials onto one of the gpu sets
265 | def train_main(trial_params, gpu_ids):
266 | # train your model, etc here...
267 |
268 | hparams = parser.parse_args()
269 | hparams.optimize_parallel_gpu(train_main, gpu_ids=['1', '0, 2'])
270 |
271 | # at the end of the optimize_parallel function, all 20 trials will be completed
272 | # in this case by running 10 sets of 2 trials in parallel
273 |
274 |
275 |
optimize_parallel_cpu
276 |
hparams = parser.parse_args()
277 | hparams.optimize_parallel_cpu(function_to_optimize, nb_trials=20, nb_workers=2)
278 |
279 |
280 |
Parallelize the trials across nb_workers cpus. Argument passed into
281 | the function_to_optimize is the trial_params argument.
282 |
Example
283 |
# parallelize tuning on 2 cpus
284 | # this will place each of the n trials onto one of the cpu workers
285 | def train_main(trial_params):
286 | # train your model, etc here...
287 |
288 | hparams = parser.parse_args()
289 | hparams.optimize_parallel_cpu(train_main, nb_trials=20, nb_workers=2)
290 |
291 | # at the end of the optimize_parallel function, all 20 trials will be completed
292 | # in this case by running 10 sets of 2 trials in parallel
293 |
294 |
--------------------------------------------------------------------------------
/test_tube/argparse_hopt.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import math
4 | import os
5 | import random
6 | import re
7 | import traceback
8 | from argparse import ArgumentParser
9 | from copy import deepcopy
10 | from gettext import gettext as _
11 | from multiprocessing import Pool, Queue
12 | from time import sleep
13 |
14 | import numpy as np
15 |
16 | from .hyper_opt_utils import strategies
17 |
18 | # needed to work with pytorch multiprocess
19 | try:
20 | import torch
21 | import multiprocessing
22 | # multiprocessing.set_start_method('spawn', force=True)
23 | except ModuleNotFoundError:
24 | pass
25 |
26 |
27 | def optimize_parallel_gpu_private(args):
28 | trial_params, train_function = args[0], args[1]
29 |
30 | # get set of gpu ids
31 | gpu_id_set = g_gpu_id_q.get(block=True)
32 |
33 | try:
34 |
35 | # enable the proper gpus
36 | os.environ["CUDA_VISIBLE_DEVICES"] = gpu_id_set
37 |
38 | # run training fx on the specific gpus
39 | results = train_function(trial_params, gpu_id_set)
40 |
41 | return [trial_params, results]
42 |
43 | except Exception as e:
44 | print('Caught exception in worker thread', e)
45 |
46 | # This prints the type, value, and stack trace of the
47 | # current exception being handled.
48 | traceback.print_exc()
49 | return [trial_params, None]
50 |
51 | finally:
52 | g_gpu_id_q.put(gpu_id_set)
53 |
54 |
55 | def optimize_parallel_cpu_private(args):
56 | trial_params, train_function = args[0], args[1]
57 |
58 | sleep(random.randint(0, 4))
59 |
60 |     # run the training fx for this cpu worker
61 | results = train_function(trial_params)
62 |
63 | # True = completed
64 | return [trial_params, results]
65 |
66 |
67 | class HyperOptArgumentParser(ArgumentParser):
68 | """
69 | Subclass of argparse ArgumentParser which adds optional calls to sample from lists or ranges
70 | Also enables running optimizations across parallel processes
71 | """
72 |
73 | # these are commands injected by test tube from cluster operations
74 | TRIGGER_CMD = 'test_tube_from_cluster_hopt'
75 | SLURM_CMD_PATH = 'test_tube_slurm_cmd_path'
76 | SLURM_EXP_CMD = 'hpc_exp_number'
77 | SLURM_LOAD_CMD = 'test_tube_do_checkpoint_load'
78 | CMD_MAP = {
79 | TRIGGER_CMD: bool,
80 | SLURM_CMD_PATH: str,
81 | SLURM_EXP_CMD: int,
82 | SLURM_LOAD_CMD: bool
83 | }
84 |
85 | def __init__(self, strategy='grid_search', **kwargs):
86 | """
87 |
88 | :param strategy: 'grid_search', 'random_search'
89 |         :param kwargs: passed through to argparse.ArgumentParser
92 | """
93 | ArgumentParser.__init__(self, **kwargs)
94 |
95 | self.strategy = strategy
96 | self.trials = []
97 | self.parsed_args = None
98 | self.opt_args = {}
99 | self.json_config_arg_name = None
100 | self.pool = None
101 |
102 | def __getstate__(self):
103 | # capture what is normally pickled
104 | state = self.__dict__.copy()
105 |
106 | # remove all functions from the namespace
107 | clean_state = {}
108 | for k, v in state.items():
109 | if not hasattr(v, '__call__'):
110 | clean_state[k] = v
111 |
112 | # what we return here will be stored in the pickle
113 | return clean_state
114 |
115 | def __setstate__(self, newstate):
116 | # re-instate our __dict__ state from the pickled state
117 | self.__dict__.update(newstate)
118 |
119 | def add_argument(self, *args, **kwargs):
120 | super(HyperOptArgumentParser, self).add_argument(*args, **kwargs)
121 |
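    # like add_argument, but also registers the argument as tunable over the discrete list passed via `options`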
122 | def opt_list(self, *args, **kwargs):
123 | options = kwargs.pop("options", None)
124 | tunable = kwargs.pop("tunable", False)
125 | self.add_argument(*args, **kwargs)
126 | for i in range(len(args)):
127 | arg_name = args[i]
128 | self.opt_args[arg_name] = OptArg(obj_id=arg_name, opt_values=options, tunable=tunable)
129 |
130 | def opt_range(
131 | self,
132 | *args,
133 | **kwargs
134 | ):
135 | low = kwargs.pop("low", None)
136 | high = kwargs.pop("high", None)
137 | arg_type = kwargs["type"]
138 | nb_samples = kwargs.pop("nb_samples", 10)
139 | tunable = kwargs.pop("tunable", False)
140 | log_base = kwargs.pop("log_base", None)
141 |
142 | self.add_argument(*args, **kwargs)
143 | arg_name = args[-1]
144 | self.opt_args[arg_name] = OptArg(
145 | obj_id=arg_name,
146 | opt_values=[low, high],
147 | arg_type=arg_type,
148 | nb_samples=nb_samples,
149 | tunable=tunable,
150 | log_base=log_base,
151 | )
152 |
153 | def json_config(self, *args, **kwargs):
154 | self.add_argument(*args, **kwargs)
155 | self.json_config_arg_name = re.sub('-', '', args[-1])
156 |
157 | def __parse_args(self, args=None, namespace=None):
158 | # allow bypassing certain missing params which other parts of test tube may introduce
159 | args, argv = self.parse_known_args(args, namespace)
160 | args, argv = self.__whitelist_cluster_commands(args, argv)
161 | if argv:
162 | msg = _('unrecognized arguments: %s')
163 | self.error(msg % ' '.join(argv))
164 | return args
165 |
166 | def __whitelist_cluster_commands(self, args, argv):
167 | parsed = {}
168 |
169 | # build a dict where key = arg, value = value of the arg or None if just a flag
170 | for i, arg_candidate in enumerate(argv):
171 | arg = None
172 | value = None
173 |
174 | # only look at --keys
175 | if '--' not in arg_candidate:
176 | continue
177 |
178 | # skip items not on the white list
179 | if arg_candidate[2:] not in HyperOptArgumentParser.CMD_MAP:
180 | continue
181 |
182 | arg = arg_candidate[2:]
183 | # pull out the value of the argument if given
184 | if i + 1 <= len(argv) - 1:
185 | if '--' not in argv[i + 1]:
186 | value = argv[i + 1]
187 |
188 |             if arg is not None:
189 |                 parsed[arg] = value
193 |
194 | # add the whitelist cmds to the args
195 | all_values = set()
196 | for k, v in args.__dict__.items():
197 | all_values.add(k)
198 | all_values.add(v)
199 |
200 | for arg, v in parsed.items():
201 | v_parsed = self.__parse_primitive_arg_val(v)
202 | all_values.add(v)
203 | all_values.add(arg)
204 | args.__setattr__(arg, v_parsed)
205 |
206 | # make list with only the unknown args
207 | unk_args = []
208 | for arg in argv:
209 | arg_candidate = re.sub('--', '', arg)
210 | is_bool = arg_candidate == 'True' or arg_candidate == 'False'
211 | if is_bool: continue
212 |
213 | if arg_candidate not in all_values:
214 | unk_args.append(arg)
215 |
216 | # when no bad args are left, return none to be consistent with super api
217 | if len(unk_args) == 0:
218 | unk_args = None
219 |
220 | # add hpc_exp_number if not passed in so we can never get None
221 | if HyperOptArgumentParser.SLURM_EXP_CMD not in args:
222 | args.__setattr__(HyperOptArgumentParser.SLURM_EXP_CMD, None)
223 |
224 | return args, unk_args
225 |
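    # coerce a raw CLI string to a primitive: bare flags (None) become True, then try int, then float, else keep the string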
226 | def __parse_primitive_arg_val(self, val):
227 | if val is None:
228 | return True
229 | try:
230 | return int(val)
231 | except ValueError:
232 | try:
233 | return float(val)
234 | except ValueError:
235 | return val
236 |
237 | def parse_args(self, args=None, namespace=None):
238 | # call superclass arg first
239 | results = self.__parse_args(args, namespace)
240 |
241 | # extract vals
242 | old_args = vars(results)
243 |
244 | # override with json args if given
245 | if self.json_config_arg_name and old_args[self.json_config_arg_name]:
246 | for arg, v in self.__read_json_config(old_args[self.json_config_arg_name]).items():
247 | old_args[arg] = v
248 |
249 | # track args
250 | self.parsed_args = deepcopy(old_args)
251 | # attach optimization fx
252 | old_args['trials'] = self.opt_trials
253 | old_args['optimize_parallel'] = self.optimize_parallel
254 | old_args['optimize_parallel_gpu'] = self.optimize_parallel_gpu
255 | old_args['optimize_parallel_cpu'] = self.optimize_parallel_cpu
256 | old_args['generate_trials'] = self.generate_trials
257 | old_args['optimize_trials_parallel_gpu'] = self.optimize_trials_parallel_gpu
258 |
259 | return TTNamespace(**old_args)
260 |
261 | def __read_json_config(self, file_path):
262 | with open(file_path) as json_data:
263 | json_args = json.load(json_data)
264 | return json_args
265 |
266 | def opt_trials(self, num):
267 | self.trials = strategies.generate_trials(
268 | strategy=self.strategy,
269 | flat_params=self.__flatten_params(self.opt_args),
270 | nb_trials=num,
271 | )
272 |
273 | for trial in self.trials:
274 | ns = self.__namespace_from_trial(trial)
275 | yield ns
276 |
277 | def generate_trials(self, nb_trials):
278 | trials = strategies.generate_trials(
279 | strategy=self.strategy,
280 | flat_params=self.__flatten_params(self.opt_args),
281 | nb_trials=nb_trials,
282 | )
283 |
284 | trials = [self.__namespace_from_trial(x) for x in trials]
285 | return trials
286 |
287 | def optimize_parallel_gpu(
288 | self,
289 | train_function,
290 | gpu_ids,
291 | max_nb_trials=None,
292 | ):
293 | """
294 | Runs optimization across gpus with cuda drivers
295 | :param train_function:
296 | :param max_nb_trials:
297 | :param gpu_ids: List of strings like: ['0', '1, 3']
298 | :return:
299 | """
300 | self.trials = strategies.generate_trials(
301 | strategy=self.strategy,
302 | flat_params=self.__flatten_params(self.opt_args),
303 | nb_trials=max_nb_trials,
304 | )
305 |
306 | self.trials = [(self.__namespace_from_trial(x), train_function) for x in self.trials]
307 |
308 | # build q of gpu ids so we can use them in each process
309 | # this is thread safe so each process can pull out a gpu id, run its task and put it back when done
310 | if self.pool is None:
311 | gpu_q = Queue()
312 | for gpu_id in gpu_ids:
313 | gpu_q.put(gpu_id)
314 |
315 | # called by the Pool when a process starts
316 | def init(local_gpu_q):
317 | global g_gpu_id_q
318 | g_gpu_id_q = local_gpu_q
319 |
320 | # init a pool with the nb of worker threads we want
321 | nb_workers = len(gpu_ids)
322 | self.pool = Pool(processes=nb_workers, initializer=init, initargs=(gpu_q,))
323 |
324 | # apply parallelization
325 | results = self.pool.map(optimize_parallel_gpu_private, self.trials)
326 | return results
327 |
328 | def optimize_trials_parallel_gpu(
329 | self,
330 | train_function,
331 | nb_trials,
332 | trials,
333 | gpu_ids,
334 | nb_workers=4,
335 | ):
336 | """
337 | Runs optimization across gpus with cuda drivers
338 | :param train_function:
339 | :param nb_trials:
340 | :param gpu_ids: List of strings like: ['0', '1, 3']
341 | :param nb_workers:
342 | :return:
343 | """
344 | self.trials = trials
345 | self.trials = [(x, train_function) for x in self.trials]
346 |
347 | # build q of gpu ids so we can use them in each process
348 | # this is thread safe so each process can pull out a gpu id, run its task and put it back when done
349 | if self.pool is None:
350 | gpu_q = Queue()
351 | for gpu_id in gpu_ids:
352 | gpu_q.put(gpu_id)
353 |
354 | # called by the Pool when a process starts
355 | def init(local_gpu_q):
356 | global g_gpu_id_q
357 | g_gpu_id_q = local_gpu_q
358 |
359 | # init a pool with the nb of worker threads we want
360 | self.pool = Pool(processes=nb_workers, initializer=init, initargs=(gpu_q,))
361 |
362 | # apply parallelization
363 | results = self.pool.map(optimize_parallel_gpu_private, self.trials)
364 | return results
365 |
366 | def optimize_parallel_cpu(
367 | self,
368 | train_function,
369 | nb_trials,
370 | nb_workers=4,
371 | ):
372 | """
373 | Runs optimization across n cpus
374 | :param train_function:
375 | :param nb_trials:
376 | :param nb_workers:
377 | :return:
378 | """
379 | self.trials = strategies.generate_trials(
380 | strategy=self.strategy,
381 | flat_params=self.__flatten_params(self.opt_args),
382 | nb_trials=nb_trials
383 | )
384 |
385 | self.trials = [(self.__namespace_from_trial(x), train_function) for x in self.trials]
386 |
387 | # init a pool with the nb of worker threads we want
388 | if self.pool is None:
389 | self.pool = Pool(processes=nb_workers)
390 |
391 | # apply parallelization
392 | results = self.pool.map(optimize_parallel_cpu_private, self.trials)
393 | return results
394 |
395 | def optimize_parallel(
396 | self,
397 | train_function,
398 | nb_trials,
399 | nb_parallel=4,
400 | ):
401 | self.trials = strategies.generate_trials(
402 | strategy=self.strategy,
403 | flat_params=self.__flatten_params(self.opt_args),
404 | nb_trials=nb_trials
405 | )
406 |
407 | # nb of runs through all parallel systems
408 | fork_batches = [
409 | self.trials[i:i + nb_parallel] for i in range(0, len(self.trials), nb_parallel)
410 | ]
411 |
412 | for fork_batch in fork_batches:
413 | children = []
414 |
415 | # run n parallel forks
416 | for parallel_nb, trial in enumerate(fork_batch):
417 |
418 | # q up the trial and convert to a namespace
419 | ns = self.__namespace_from_trial(trial)
420 |
421 | # split new fork
422 | pid = os.fork()
423 |
424 | # when the process is a parent
425 | if pid:
426 | children.append(pid)
427 |
428 | # when process is a child
429 | else:
430 | # slight delay to make sure we don't overwrite over test tube log versions
431 | sleep(parallel_nb * 0.5)
432 | train_function(ns, parallel_nb)
433 | os._exit(0)
434 |
435 | for i, child in enumerate(children):
436 | os.waitpid(child, 0)
437 |
438 | def __namespace_from_trial(self, trial):
439 | trial_dict = {d['name']: d['val'] for d in trial}
440 | for k, v in self.parsed_args.items():
441 | if k not in trial_dict:
442 | trial_dict[k] = v
443 |
444 | return TTNamespace(**trial_dict)
445 |
446 | def __flatten_params(self, params):
447 | """
448 | Turns a list of parameters with values into a flat tuple list of lists
449 | so we can permute
450 | :param params:
451 | :return:
452 | """
453 | flat_params = []
454 | for i, (opt_name, opt_arg) in enumerate(params.items()):
455 | if opt_arg.tunable:
456 | clean_name = opt_name.strip('-')
457 | clean_name = re.sub('-', '_', clean_name)
458 | param_groups = []
459 | for val in opt_arg.opt_values:
460 | param_groups.append({'idx': i, 'val': val, 'name': clean_name})
461 | flat_params.append(param_groups)
462 | return flat_params
463 |
464 |
465 | class TTNamespace(argparse.Namespace):
466 |
467 | def __str__(self):
468 | result = '-' * 100 + '\nHyperparameters:\n'
469 | for k, v in self.__dict__.items():
470 | result += '{0:20}: {1}\n'.format(k, v)
471 | return result
472 |
473 | def __getstate__(self):
474 | # capture what is normally pickled
475 | state = self.__dict__.copy()
476 |
477 | # remove all functions from the namespace
478 | clean_state = {}
479 | for k, v in state.items():
480 | if not hasattr(v, '__call__'):
481 | clean_state[k] = v
482 |
483 | # what we return here will be stored in the pickle
484 | return clean_state
485 |
486 | def __setstate__(self, newstate):
487 | # re-instate our __dict__ state from the pickled state
488 | self.__dict__.update(newstate)
489 |
490 |
491 | class OptArg(object):
492 | def __init__(
493 | self,
494 | obj_id,
495 | opt_values,
496 | arg_type=None,
497 | nb_samples=None,
498 | tunable=False,
499 | log_base=None,
500 | ):
501 | self.opt_values = opt_values
502 | self.obj_id = obj_id
503 | self.tunable = tunable
504 |
505 | # convert range to list of values
506 | if nb_samples:
507 | low, high = opt_values
508 |
509 | if log_base is None:
510 | # random search on uniform scale
511 | if arg_type is int:
512 | self.opt_values = [int(_) for _ in np.random.choice(np.arange(low, high), nb_samples, replace=False)]
513 | elif arg_type is float:
514 | self.opt_values = np.random.uniform(low, high, nb_samples)
515 | else:
516 | # random search on log scale with specified base
517 | assert high >= low > 0, "`opt_values` must be positive to do log-scale search."
518 |
519 | log_low, log_high = math.log(low, log_base), math.log(high, log_base)
520 |
521 | self.opt_values = log_base ** np.random.uniform(log_low, log_high, nb_samples)
522 |
523 |
--------------------------------------------------------------------------------
/test_tube/hpc.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import os
3 | import signal
4 | import sys
5 | import time
6 | import traceback
7 | from subprocess import call
8 |
9 | from .argparse_hopt import HyperOptArgumentParser
10 |
11 |
12 | def exit():
13 | time.sleep(1)
14 | os._exit(1)
15 |
16 |
17 | class AbstractCluster(object):
18 |
19 | RUN_CMD = 'sbatch'
20 | def __init__(
21 | self,
22 | hyperparam_optimizer=None,
23 | log_path=None,
24 | python_cmd='python3',
25 | enable_log_err=True,
26 | enable_log_out=True,
27 | ):
28 | self.hyperparam_optimizer = hyperparam_optimizer
29 | self.log_path = log_path
30 |
31 | self.enable_log_err = enable_log_err
32 | self.enable_log_out = enable_log_out
33 | self.slurm_files_log_path = None
34 | self.err_log_path = None
35 | self.out_log_path = None
36 | self.modules = []
37 | self.script_name = os.path.realpath(sys.argv[0])
38 | self.job_time = '15:00'
39 | self.minutes_to_checkpoint_before_walltime = 5
40 | self.per_experiment_nb_gpus = 1
41 | self.per_experiment_nb_cpus = 1
42 | self.per_experiment_nb_nodes = 1
43 | self.memory_mb_per_node = 2000
44 | self.email = None
45 | self.notify_on_end = False
46 | self.notify_on_fail = False
47 | self.job_name = None
48 | self.python_cmd = python_cmd
49 | self.gpu_type = None
50 | self.on_gpu = False
51 | self.call_load_checkpoint = False
52 | self.commands = []
53 | self.slurm_commands = []
54 | self.hpc_exp_number = 0
55 |
56 | # these are set via getters and setters so we can use a BaseManager which can be shared across processes
57 | self.checkpoint_save_function = None
58 | self.checkpoint_load_function = None
59 |
60 | # detect when this was called because a slurm object started a hopt.
61 | # if true, remove the flag so tt logs don't show it
62 | if hyperparam_optimizer is not None:
63 |
64 | self.is_from_slurm_object = HyperOptArgumentParser.TRIGGER_CMD in vars(self.hyperparam_optimizer) and vars(self.hyperparam_optimizer)[HyperOptArgumentParser.TRIGGER_CMD] == True
65 | if self.is_from_slurm_object:
66 | self.hyperparam_optimizer.__delattr__(HyperOptArgumentParser.TRIGGER_CMD)
67 |
68 | self.call_load_checkpoint = HyperOptArgumentParser.SLURM_LOAD_CMD in vars(self.hyperparam_optimizer)
69 | if self.call_load_checkpoint:
70 | self.hyperparam_optimizer.__delattr__(HyperOptArgumentParser.SLURM_LOAD_CMD)
71 |
72 | self.hpc_exp_number = self.hyperparam_optimizer.hpc_exp_number
73 |
74 | def set_checkpoint_save_function(self, fx, kwargs):
75 | self.checkpoint_save_function = [fx, kwargs]
76 |
77 | def get_checkpoint_save_function(self):
78 | return self.checkpoint_save_function
79 |
80 | def set_checkpoint_load_function(self, fx, kwargs):
81 | # if we were passed in the load flag, then we call the load function as soon as it's added
82 | if self.call_load_checkpoint:
83 | fx(**kwargs)
84 |
85 | self.checkpoint_load_function = [fx, kwargs]
86 |
87 | def get_checkpoint_load_function(self):
88 | return self.checkpoint_load_function
89 |
90 | def add_slurm_cmd(self, cmd, value, comment):
91 | self.slurm_commands.append((cmd, value, comment))
92 |
93 | def add_command(self, cmd):
94 | self.commands.append(cmd)
95 |
96 | def load_modules(self, modules):
97 | self.modules = modules
98 |
99 | def notify_job_status(self, email, on_done, on_fail):
100 | self.email = email
101 | self.notify_on_end = on_done
102 | self.notify_on_fail = on_fail
103 |
104 | def optimize_parallel_cluster(self, train_function, nb_trials, job_name):
105 | raise NotImplementedError
106 |
107 | def optimize_parallel_slurm(self, job_name, output_file, error_file, job_time, nb_gpus, nb_nodes, memory, notifications_email, gpu_types):
108 | pass
109 |
110 |
111 | class SlurmCluster(AbstractCluster):
112 | def __init__(self, *args, **kwargs):
113 | super(SlurmCluster, self).__init__(*args, **kwargs)
114 |
115 | def optimize_parallel_cluster_gpu(
116 | self,
117 | train_function,
118 | nb_trials,
119 | job_name,
120 | enable_auto_resubmit=False,
121 | job_display_name=None
122 | ):
123 | if job_display_name is None:
124 | job_display_name = job_name
125 |
126 | self.__optimize_parallel_cluster_internal(train_function, nb_trials, job_name, job_display_name,
127 | enable_auto_resubmit, on_gpu=True)
128 |
129 | def optimize_parallel_cluster_cpu(
130 | self,
131 | train_function,
132 | nb_trials,
133 | job_name,
134 | enable_auto_resubmit=False,
135 | job_display_name=None
136 | ):
137 | if job_display_name is None:
138 | job_display_name = job_name
139 |
140 | self.__optimize_parallel_cluster_internal(train_function, nb_trials, job_name, job_display_name,
141 | enable_auto_resubmit, on_gpu=False)
142 |
143 | def __optimize_parallel_cluster_internal(
144 | self,
145 | train_function,
146 | nb_trials,
147 | job_name,
148 | job_display_name,
149 | enable_auto_resubmit,
150 | on_gpu
151 | ):
152 | """
153 | Runs optimization on the attached cluster
154 | :param train_function:
155 | :param nb_trials:
156 | :param job_name:
157 | :return:
158 | """
159 | self.job_name = job_name
160 | self.job_display_name = job_display_name
161 | self.on_gpu = on_gpu
162 | self.enable_auto_resubmit = enable_auto_resubmit
163 |
164 | # layout logging structure
165 | self.__layout_logging_dir()
166 |
167 | if self.is_from_slurm_object:
168 | # Script is called by slurm: it's an actual experiment.
169 | self.__run_experiment(train_function)
170 | else:
171 | # Launcher script. Generate trials and launch jobs.
172 |
173 | # generate hopt trials
174 | trials = self.hyperparam_optimizer.generate_trials(nb_trials)
175 |
176 | # get the max test tube exp version so far if it's there
177 | scripts_path = os.path.join(self.log_path, 'slurm_out_logs')
178 | next_trial_version = self.__get_max_trial_version(scripts_path)
179 |
180 | # for each trial, generate a slurm command
181 | for i, trial_params in enumerate(trials):
182 | exp_i = i + next_trial_version
183 | self.schedule_experiment(trial_params, exp_i)
184 |
185 | def schedule_experiment(self, trial_params, exp_i):
186 | timestamp = datetime.datetime.now().strftime("%Y-%m-%d__%H-%M-%S")
187 | timestamp = 'trial_{}_{}'.format(exp_i, timestamp)
188 |
189 | # generate command
190 | slurm_cmd_script_path = os.path.join(self.slurm_files_log_path, '{}_slurm_cmd.sh'.format(timestamp))
191 | slurm_cmd = self.__build_slurm_command(trial_params, slurm_cmd_script_path, timestamp, exp_i, self.on_gpu)
192 | self.__save_slurm_cmd(slurm_cmd, slurm_cmd_script_path)
193 |
194 | # run script to launch job
195 | print('\nlaunching exp...')
196 | result = call('{} {}'.format(AbstractCluster.RUN_CMD, slurm_cmd_script_path), shell=True)
197 | if result == 0:
198 | print('launched exp ', slurm_cmd_script_path)
199 | else:
200 | print('launch failed...')
201 |
202 | def slurm_time_to_seconds(self, job_time):
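        """Convert a Slurm walltime string ('D-HH:MM:SS', 'HH:MM:SS', 'MM:SS' or 'SS') to seconds."""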
203 | seconds = 0
204 | time_component = job_time
205 | if '-' in job_time:
206 | days, time_component = job_time.split('-')
207 | seconds += int(days) * 24 * 60 * 60
208 |
209 | time_components = time_component.split(':')
210 | if len(time_components) == 3:
211 | hours, minutes, secs = time_components
212 | time_seconds = int(secs) + (int(minutes) * 60) + (int(hours) * 60 * 60)
213 | seconds += time_seconds
214 |
215 | elif len(time_components) == 2:
216 | minutes, secs = time_components
217 | time_seconds = int(secs) + (int(minutes) * 60)
218 | seconds += time_seconds
219 |
220 | elif len(time_components) == 1:
221 | secs = time_components[0]
222 | seconds += int(secs)
223 |
224 | return seconds
225 |
226 | def call_save(self):
227 | print('calling save')
228 |
229 | # if save function was passed, call it
230 | if self.get_checkpoint_save_function() is not None:
231 | save_fx, kwargs = self.get_checkpoint_save_function()
232 | save_fx(**kwargs)
233 |
234 | # if we're here, the job didn't finish and we were given a save function
235 | # if we were given a load function, then schedule the program again and pass in the load function
236 | if self.get_checkpoint_load_function() is not None:
237 | job_id = os.environ['SLURM_JOB_ID']
238 | cmd = 'scontrol requeue {}'.format(job_id)
239 |
240 | print('\nrequeing job {}...'.format(job_id))
241 | result = call(cmd, shell=True)
242 | if result == 0:
243 | print('requeued exp ', job_id)
244 | else:
245 | print('requeue failed...')
246 |
247 | # stop program
248 | os._exit(0)
249 |
250 | def sig_handler(self, signum, frame):
251 | print("caught signal", signum)
252 | self.call_save()
253 | # sys.exit(-1)
254 |
255 | # ------------------------
256 | # HANDLE SLURM SIGNALS
257 | # ------------------------
258 | def term_handler(self, signum, frame):
259 | print("bypassing sigterm")
260 |
261 | def __run_experiment(self, train_function):
262 | if self.enable_auto_resubmit:
263 | print('setting signal')
264 | signal.signal(signal.SIGUSR1, self.sig_handler)
265 | signal.signal(signal.SIGTERM, self.term_handler)
266 |
267 | try:
268 | # run training
269 | train_function(self.hyperparam_optimizer, self)
270 |
271 | except Exception as e:
272 | print('Caught exception in worker thread', e)
273 |
274 | # This prints the type, value, and stack trace of the
275 | # current exception being handled.
276 | traceback.print_exc()
277 | raise SystemExit
278 |
279 | def __save_slurm_cmd(self, slurm_cmd, slurm_cmd_script_path):
280 | with open(slurm_cmd_script_path, mode='w') as file:
281 | file.write(slurm_cmd)
282 |
283 | def __get_max_trial_version(self, path):
284 | files = os.listdir(path)
285 | version_files = [f for f in files if 'trial_' in f]
286 | if len(version_files) > 0:
287 |             # parse the trial version number out of the file names
288 | versions = [int(f_name.split('_')[1]) for f_name in version_files]
289 | max_version = max(versions)
290 | return max_version + 1
291 | else:
292 | return 0
293 |
294 | def __layout_logging_dir(self):
295 | """
296 | Generates dir structure for logging errors and outputs
297 | :return:
298 | """
299 |
300 | # format the logging folder path
301 | slurm_out_path = os.path.join(self.log_path, self.job_name)
302 |
303 | self.log_path = slurm_out_path
304 |
305 | # if we have a test tube name, make the folder and set as the logging destination
306 | if not os.path.exists(slurm_out_path):
307 | os.makedirs(slurm_out_path)
308 |
309 |         # when err logging is enabled, add the err logging folder
310 | if self.enable_log_err:
311 | err_path = os.path.join(slurm_out_path, 'slurm_err_logs')
312 | if not os.path.exists(err_path):
313 | os.makedirs(err_path)
314 | self.err_log_path = err_path
315 |
316 |         # when out logging is enabled, add the out logging folder
317 | if self.enable_log_out:
318 | out_path = os.path.join(slurm_out_path, 'slurm_out_logs')
319 | if not os.path.exists(out_path):
320 | os.makedirs(out_path)
321 | self.out_log_path = out_path
322 |
323 | # place where slurm files log to
324 | self.slurm_files_log_path = os.path.join(slurm_out_path, 'slurm_scripts')
325 | if not os.path.exists(self.slurm_files_log_path):
326 | os.makedirs(self.slurm_files_log_path)
327 |
328 | def __get_hopt_params(self, trial):
329 | """
330 | Turns hopt trial into script params
331 | :param trial:
332 | :return:
333 | """
334 |
335 | params = []
336 | for k in trial.__dict__:
337 | v = trial.__dict__[k]
338 |
339 |             # don't add None or False params
340 | if v is None or v is False:
341 | continue
342 |
343 | # put everything in quotes except bools
344 | if self.__should_escape(v):
345 | cmd = '--{} \"{}\"'.format(k, v)
346 | else:
347 | cmd = '--{} {}'.format(k, v)
348 | params.append(cmd)
349 |
350 | # this arg lets the hyperparameter optimizer do its thing
351 | params.append('--{}'.format(HyperOptArgumentParser.TRIGGER_CMD))
352 |
353 | full_cmd = ' '.join(params)
354 | return full_cmd
355 |
356 | def __should_escape(self, v):
357 | v = str(v)
358 | return '[' in v or ';' in v or ' ' in v
359 |
360 | def __build_slurm_command(self, trial, slurm_cmd_script_path, timestamp, exp_i, on_gpu):
361 | sub_commands = []
362 |
363 |         command = [
364 | '#!/bin/bash',
365 | '#',
366 | '# Auto-generated by test-tube (https://github.com/williamFalcon/test-tube)',
367 | '#################\n'
368 | ]
369 | sub_commands.extend(command)
370 |
371 | # add job name
372 | job_with_version = '{}v{}'.format(self.job_display_name, exp_i)
373 | command = [
374 | '# set a job name',
375 | '#SBATCH --job-name={}'.format(job_with_version),
376 | '#################\n',
377 | ]
378 | sub_commands.extend(command)
379 |
380 | # add out output
381 | if self.enable_log_out:
382 | out_path = os.path.join(self.out_log_path, '{}_slurm_output_%j.out'.format(timestamp))
383 | command = [
384 | '# a file for job output, you can check job progress',
385 | '#SBATCH --output={}'.format(out_path),
386 | '#################\n',
387 | ]
388 | sub_commands.extend(command)
389 |
390 | # add err output
391 | if self.enable_log_err:
392 | err_path = os.path.join(self.err_log_path, '{}_slurm_output_%j.err'.format(timestamp))
393 | command = [
394 | '# a file for errors',
395 | '#SBATCH --error={}'.format(err_path),
396 | '#################\n',
397 | ]
398 | sub_commands.extend(command)
399 |
400 | # add job time
401 | command = [
402 | '# time needed for job',
403 | '#SBATCH --time={}'.format(self.job_time),
404 | '#################\n'
405 | ]
406 | sub_commands.extend(command)
407 |
408 | # add nb of gpus
409 | if self.per_experiment_nb_gpus > 0 and on_gpu:
410 | command = [
411 | '# gpus per node',
412 | '#SBATCH --gres=gpu:{}'.format(self.per_experiment_nb_gpus),
413 | '#################\n'
414 | ]
415 | if self.gpu_type is not None:
416 | command = [
417 | '# gpus per node',
418 | '#SBATCH --gres=gpu:{}:{}'.format(self.gpu_type, self.per_experiment_nb_gpus),
419 | '#################\n'
420 | ]
421 | sub_commands.extend(command)
422 |
423 |         # add nb of cpus per task
424 | if self.per_experiment_nb_cpus > 0:
425 | command = [
426 | '# cpus per job',
427 | '#SBATCH --cpus-per-task={}'.format(self.per_experiment_nb_cpus),
428 | '#################\n'
429 | ]
430 | sub_commands.extend(command)
431 |
432 | # pick nb nodes
433 | command = [
434 | '# number of requested nodes',
435 | '#SBATCH --nodes={}'.format(self.per_experiment_nb_nodes),
436 | '#################\n'
437 | ]
438 | sub_commands.extend(command)
439 |
440 | # pick memory per node
441 | command = [
442 | '# memory per node',
443 | '#SBATCH --mem={}'.format(self.memory_mb_per_node),
444 | '#################\n'
445 | ]
446 | sub_commands.extend(command)
447 |
448 | # add signal command to catch job termination
449 | command = [
450 | '# slurm will send a signal this far out before it kills the job',
451 | f'#SBATCH --signal=USR1@{self.minutes_to_checkpoint_before_walltime * 60}',
452 | '#################\n'
453 | ]
454 |
455 | sub_commands.extend(command)
456 |
457 | # Subscribe to email if requested
458 | mail_type = []
459 | if self.notify_on_end:
460 | mail_type.append('END')
461 | if self.notify_on_fail:
462 | mail_type.append('FAIL')
463 | if len(mail_type) > 0:
464 | mail_type_query = [
465 | '# Have SLURM send you an email when the job ends or fails',
466 | '#SBATCH --mail-type={}'.format(','.join(mail_type))
467 | ]
468 | sub_commands.extend(mail_type_query)
469 |
470 | email_query = [
471 | '#SBATCH --mail-user={}'.format(self.email),
472 | ]
473 | sub_commands.extend(email_query)
474 |
475 | # add custom sbatch commands
476 | sub_commands.append('\n')
477 | for (cmd, value, comment) in self.slurm_commands:
478 | comment = '# {}'.format(comment)
479 | cmd = '#SBATCH --{}={}'.format(cmd, value)
480 | spaces = '#################\n'
481 | sub_commands.extend([comment, cmd, spaces])
482 |
483 | # load modules
484 | sub_commands.append('\n')
485 | for module in self.modules:
486 | cmd = 'module load {}'.format(module)
487 | sub_commands.append(cmd)
488 |
489 | # remove spaces before the hash
490 | sub_commands = [x.lstrip() for x in sub_commands]
491 |
492 | # add additional commands
493 | for cmd in self.commands:
494 | sub_commands.append(cmd)
495 | sub_commands.append('\n')
496 |
497 | # add run command
498 | trial_args = self.__get_hopt_params(trial)
499 | trial_args = '{} --{} {} --{} {}'.format(trial_args,
500 | HyperOptArgumentParser.SLURM_CMD_PATH,
501 | slurm_cmd_script_path,
502 | HyperOptArgumentParser.SLURM_EXP_CMD,
503 | exp_i)
504 |
505 | cmd = 'srun {} {} {}'.format(self.python_cmd, self.script_name, trial_args)
506 | sub_commands.append(cmd)
507 |
508 | # build full command with empty lines in between
509 | full_command = '\n'.join(sub_commands)
510 | return full_command
511 |
--------------------------------------------------------------------------------
/site/hpc/SlurmCluster/index.html:
--------------------------------------------------------------------------------
SlurmCluster class API
145 |
[Github Code ]
146 |
The SlurmCluster class enables hyperparameter search parallelization on a cluster managed via the Slurm workload manager.
147 | 
At a high level, the SlurmCluster creates a submit script for each permutation of hyperparameters requested. If a job hits the walltime but has not completed, the SlurmCluster will checkpoint the model and submit a new job to continue training from the saved weights.
148 |
152 |
You can instantiate a SlurmCluster via:
153 |
from test_tube.hpc import SlurmCluster
154 |
155 | # hyperparameters is a test-tube hyper params object
156 | # see https://williamfalcon.github.io/test-tube/hyperparameter_optimization/HyperOptArgumentParser/
157 | hyperparams = parser.parse_args()
158 |
159 | # init cluster
160 | cluster = SlurmCluster(
161 | hyperparam_optimizer=hyperparams,
162 | log_path='/path/to/log/results/to',
163 | python_cmd='python3'
164 | )
165 |
166 | # let the cluster know where to email for a change in job status (ie: complete, fail, etc...)
167 | cluster.notify_job_status(email='some@email.com', on_done=True, on_fail=True)
168 |
169 | # set the job options. In this instance, we'll run 20 different models
170 | # each with its own set of hyperparameters giving each one 1 GPU (ie: taking up 20 GPUs)
171 | cluster.per_experiment_nb_gpus = 1
172 | cluster.per_experiment_nb_nodes = 1
173 |
174 | # we'll request 10GB of memory per node
175 | cluster.memory_mb_per_node = 10000
176 |
177 | # set a walltime of 10 minutes
178 | cluster.job_time = '10:00'
179 |
180 | # 1 minute before walltime is up, SlurmCluster will launch a continuation job and kill this job.
181 | # you must provide your own loading and saving function which the cluster object will call
182 | cluster.minutes_to_checkpoint_before_walltime = 1
183 |
184 | # run the models on the cluster
185 | cluster.optimize_parallel_cluster_gpu(train, nb_trials=20, job_name='first_tt_batch', job_display_name='my_batch')
186 |
187 |
188 |
189 |
init options
190 |
hyperparam_optimizer
191 |
A HyperOptArgumentParser object
192 | which contains all permutations of model hyperparameters to run.
193 |
log_path
194 |
Path to save the slurm scripts, error logs and out logs created. Usually this would be the experiments folder path where test tube saves Experiment information.
195 |
python_cmd
196 |
This is the command that starts the python program. Normally it is:
197 |
# python 2
198 | python main.py
199 |
200 | # python 3
201 | python3 main.py
202 |
203 |
204 |
enable_log_err
205 |
If true, saves slurm error logs to the path at log_path. If anything goes wrong in your job, you'll find the error here.
206 |
enable_log_out
207 |
If true, saves slurm output logs to the path at log_path. This file contains all outputs that would show up on the console normally.
208 |
test_tube_exp_name
209 |
When this is given, it structures the files in a nice format to fit with the folder structure of the Experiment object's output.
210 |
Properties
211 |
job_time
212 | String. Walltime requested. Examples:
213 |
# 1 hour and 10 minutes
214 | cluster.job_time = '1:10:00'
215 |
216 | # 1 day and 1 hour and 10 minutes
217 | cluster.job_time = '1-1:10:00'
218 |
219 | # 1 day, 1 hour and 10 minutes (expressed in hours)
220 | cluster.job_time = '25:10:00'
221 |
222 | # 10 minutes
223 | cluster.job_time = '10:00'
224 |
225 | # 10 seconds
226 | cluster.job_time = '10'
227 |
228 |
229 |
minutes_to_checkpoint_before_walltime
230 | Int. Minutes before walltime when a continuation job will be auto-submitted.
231 |
cluster.job_time = '10:00'
232 | cluster.minutes_to_checkpoint_before_walltime = 2
233 |
234 | # A new job will be submitted to continue training after the job has run for 8 minutes.
235 |
236 |
237 |
per_experiment_nb_gpus
238 | Int. Number of GPUs each job will get.
239 |
# EACH job will get 2 GPUs (ie: if a model runs over two GPUs at the same time).
240 | cluster.per_experiment_nb_gpus = 2
241 |
242 |
243 |
per_experiment_nb_cpus
244 | Int. Number of CPUs each job will get.
245 |
cluster.per_experiment_nb_cpus = 1
246 |
247 |
248 |
per_experiment_nb_nodes
249 | Int. Number of nodes each job will get.
250 |
cluster.per_experiment_nb_nodes = 1
251 |
252 |
253 |
gpu_type
254 | String. Gpu type requested. Example:
255 |
cluster.gpu_type = '1080ti'
256 |
257 |
258 |
259 |
Methods
260 |
set_checkpoint_save_function
261 |
cluster.set_checkpoint_save_function(fx, kwargs)
262 |
263 |
264 |
Called if the model isn't finished training minutes_to_checkpoint_before_walltime minutes before the walltime. If walltime = '15:00' and minutes_to_checkpoint_before_walltime = 1, the SlurmCluster will call your save function after 14 minutes of training.
265 |
266 | fx A python function.
267 | kwargs Dictionary where keys are the literal argument names to the function. Dictionary values are the values of the arguments.
268 |
269 |
Example
270 |
def my_save_function(arg_1, arg_k):
271 | # ... save my model here
272 |
273 | cluster.set_checkpoint_save_function(my_save_function, kwargs={'arg_1': 'whatever', 'arg_k': 'you_want'})
274 |
275 |
276 |
277 |
set_checkpoint_load_function
278 |
cluster.set_checkpoint_load_function(fx, kwargs)
279 |
280 |
281 |
Called internally when a job is auto-submitted by the SlurmCluster to give your program a chance to load the model weights or whatever you need to continue training.
282 | If the job was auto-resubmitted (i.e. training is continuing), the load function is called immediately when you register it with this method.
283 |
284 | fx A python function.
285 | kwargs Dictionary where keys are the literal argument names to the function. Dictionary values are the values of the arguments.
286 |
287 |
Example
288 |
def my_load_function(arg_1, arg_k):
289 | # ... restore my model here
290 |
291 | cluster.set_checkpoint_load_function(my_load_function, kwargs={'arg_1': 'whatever', 'arg_k': 'you_want'})
292 |
293 |
294 |
295 |
add_slurm_cmd
296 |
cluster.add_slurm_cmd(cmd, value, comment)
297 |
298 |
299 |
Adds whatever Slurm command you need manually to the generated script. All possible commands are listed in the Slurm sbatch documentation.
300 |
301 | cmd String with the bash command.
302 | value String value for the command. Numerical values need to be passed as strings, e.g. '1'.
303 | comment String with the command comment.
304 |
305 |
Example
306 |
cluster.add_slurm_cmd(cmd='cpus-per-task', value='1', comment='nb cpus per task')
307 |
308 | # the above command will add an entry like this to the slurm script
309 |
310 | # #nb cpus per task
311 | # #SBATCH --cpus-per-task=1
312 | # ############
313 |
314 |
315 |
316 |
add_command
317 |
cluster.add_command(cmd)
318 |
319 |
320 |
Adds arbitrary bash commands to the script. Use this to activate conda environments, install packages, or run anything else you would normally call in bash.
321 |
322 | cmd String with your bash command.
323 |
324 |
Example
325 |
# load the anaconda package on the launch node
326 | cluster.add_command('module load anaconda')
327 |
328 | # activate the environment on the launch node
329 | cluster.add_command('source activate myCondaEnv')
330 |
331 |
332 |
load_modules
333 |
cluster.load_modules(modules)
334 |
335 |
336 |
Loads modules needed to run the job. Your Slurm documentation should have a list of available modules. You can also get those by running module avail.
337 | - modules Array of module names.
338 |
Example
339 |
cluster.load_modules([
340 | 'python-3',
341 | 'anaconda3'
342 | ])
343 |
344 |
345 |
notify_job_status
346 |
cluster.notify_job_status(email, on_done, on_fail)
347 |
348 |
349 |
Sets up email notifications about the status of your jobs (on completion and/or failure).
350 |
351 | email String. Email address to get notifications.
352 | on_done Boolean. If true, you'll get an email when the job completes.
353 | on_fail Boolean. If true, you'll get an email if the job fails.
354 |
355 |
Example
356 |
cluster.notify_job_status(email='some@email.com', on_done=True, on_fail=True)
357 |
358 |
359 |
optimize_parallel_cluster_gpu
360 |
cluster.optimize_parallel_cluster_gpu(train_function, nb_trials, job_name, job_display_name=None)
361 |
362 |
363 |
Launches the hyperparameter search across the cluster nodes.
364 | - train_function The entry point to start your training routine.
365 | - nb_trials Number of trials to launch. This is the number of hyperparameter configurations to train over.
366 | - job_name Folder name where the slurm scripts will save to. This should be the same as your Experiment name.
367 | - job_display_name Visible name when slurm lists running jobs (ie: through squeue -u user_name). This should be really short (if using a test tube Experiment, it'll put the experiment version at the end).
368 |
Example
369 |
def main(hparams, cluster):
370 | # do your own generic training code here...
371 | # init model
372 | model = model_build(hparams)
373 |
374 | # set the load and save fxs
375 | cluster.set_checkpoint_save_function(fx, {})
376 | cluster.set_checkpoint_load_function(fx, {})
377 |
378 | # train ...
379 |
380 |
381 | cluster.optimize_parallel_cluster_gpu(main, nb_trials=20, job_name='my_job', job_display_name='mj')
382 |
383 |
384 |
Now if you get the job information, you'll see this:
385 |
(conda_env) [user@node dir]$ squeue -u my_name
386 | JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
387 | 104040 all mjv0 my_name R 58:22 1 nodeName
388 | 104041 all mjv1 my_name R 58:22 1 nodeName
389 | 104042 all mjv2 my_name R 58:22 1 nodeName
390 | 104043 all mjv3 my_name R 58:22 1 nodeName
391 |
392 |
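Note: the version of test_tube/hpc.py in this repo also accepts an enable_auto_resubmit flag on this call (it defaults to False). Setting it registers the SIGUSR1 handler that triggers the checkpoint-and-requeue behavior described above. A sketch of the call:

cluster.optimize_parallel_cluster_gpu(main, nb_trials=20, job_name='my_job',
                                      job_display_name='mj', enable_auto_resubmit=True)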
393 |
optimize_parallel_cluster_cpu
394 |
cluster.optimize_parallel_cluster_cpu(train_function, nb_trials, job_name, job_display_name=None)
395 |
396 |
397 |
Launches the hyperparameter search across the cluster nodes using cpus.
398 | - train_function The entry point to start your training routine.
399 | - nb_trials Number of trials to launch. This is the number of hyperparameter configurations to train over.
400 | - job_name Folder name where the slurm scripts will save to. This should be the same as your Experiment name.
401 | - job_display_name Visible name when slurm lists running jobs (ie: through squeue -u user_name). This should be really short (if using a test tube Experiment, it'll put the experiment version at the end).
402 |
Example
403 |
def main(hparams, cluster):
404 | # do your own generic training code here...
405 | # init model
406 | model = model_build(hparams)
407 |
408 | # set the load and save fxs
409 | cluster.set_checkpoint_save_function(fx, {})
410 | cluster.set_checkpoint_load_function(fx, {})
411 |
412 | # train ...
413 |
414 |
415 | cluster.optimize_parallel_cluster_cpu(main, nb_trials=20, job_name='my_job', job_display_name='mj')
416 |
417 |
418 |
Now if you get the job information, you'll see this:
419 |
(conda_env) [user@node dir]$ squeue -u my_name
420 | JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
421 | 104040 all mjv0 my_name R 58:22 1 nodeName
422 | 104041 all mjv1 my_name R 58:22 1 nodeName
423 | 104042 all mjv2 my_name R 58:22 1 nodeName
424 | 104043 all mjv3 my_name R 58:22 1 nodeName
425 |
--------------------------------------------------------------------------------
/test_tube/log.py:
--------------------------------------------------------------------------------
1 | import contextlib
2 | import json
3 | import os
4 | import shutil
5 | from datetime import datetime
6 |
7 | import numpy as np
8 | import pandas as pd
9 | from imageio import imwrite
10 | from tensorboard.compat.proto.event_pb2 import Event
11 | from tensorboard.compat.proto.event_pb2 import SessionLog
12 | from torch.utils.tensorboard import SummaryWriter, FileWriter
13 |
14 | # constants
15 | _ROOT = os.path.abspath(os.path.dirname(__file__))
16 |
17 | # -----------------------------
18 | # Experiment object
19 | # -----------------------------
20 |
21 |
22 | class DDPExperiment(object):
23 | def __init__(
24 | self,
25 | exp
26 | ):
27 | """
28 | Used as meta_data storage if the experiment needs to be pickled
29 |         :param exp: the Experiment instance to copy metadata from
38 | """
39 |
40 | self.tag_markdown_saved = exp.tag_markdown_saved
41 | self.no_save_dir = exp.no_save_dir
42 | self.metrics = exp.metrics
43 | self.tags = exp.tags
44 | self.name = exp.name
45 | self.debug = exp.debug
46 | self.version = exp.version
47 | self.autosave = exp.autosave
48 | self.description = exp.description
49 | self.create_git_tag = exp.create_git_tag
50 | self.exp_hash = exp.exp_hash
51 | self.created_at = exp.created_at
52 | self.save_dir = exp.save_dir
53 |
54 |
55 | def get_non_ddp_exp(self):
56 | return Experiment(
57 | name=self.name,
58 | debug=self.debug,
59 | version=self.version,
60 | save_dir=self.save_dir,
61 | autosave=self.autosave,
62 | description=self.description,
63 | create_git_tag=self.create_git_tag
64 | )
65 |
66 | class Experiment(SummaryWriter):
67 |
68 | def __init__(
69 | self,
70 | save_dir=None,
71 | name='default',
72 | debug=False,
73 | version=None,
74 | autosave=False,
75 | description=None,
76 | create_git_tag=False,
77 | rank=0,
78 | *args, **kwargs
79 | ):
80 | """
81 | A new Experiment object defaults to 'default' unless a specific name is provided
82 | If a known name is already provided, then the file version is changed
83 | :param name:
84 | :param debug:
85 | """
86 |
87 | # change where the save dir is if requested
88 |
89 | if save_dir is not None:
90 | global _ROOT
91 | _ROOT = save_dir
92 |
93 | self.save_dir = save_dir
94 | self.tag_markdown_saved = False
95 | self.no_save_dir = save_dir is None
96 | self.metrics = []
97 | self.tags = {}
98 | self.name = name
99 | self.debug = debug
100 | self.version = version
101 | self.autosave = autosave
102 | self.description = description
103 | self.create_git_tag = create_git_tag
104 | self.exp_hash = '{}_v{}'.format(self.name, version)
105 | self.created_at = str(datetime.utcnow())
106 | self.rank = rank
107 | self.process = os.getpid()
108 |
109 | # when debugging don't do anything else
110 | if debug:
111 | return
112 |
113 | # update version hash if we need to increase version on our own
114 | # we will increase the previous version, so do it now so the hash
115 | # is accurate
116 | if version is None:
117 | old_version = self.__get_last_experiment_version()
118 | self.exp_hash = '{}_v{}'.format(self.name, old_version + 1)
119 | self.version = old_version + 1
120 |
121 | # create a new log file
122 | self.__init_cache_file_if_needed()
123 |
124 | # when we have a version, load it
125 | if self.version is not None:
126 |
127 | # when no version and no file, create it
128 | if not os.path.exists(self.__get_log_name()):
129 | self.__create_exp_file(self.version)
130 | else:
131 | # otherwise load it
132 | try:
133 | self.__load()
134 | except Exception as e:
135 | self.debug = True
136 | else:
137 | # if no version given, increase the version to a new exp
138 | # create the file if not exists
139 | old_version = self.__get_last_experiment_version()
140 | self.version = old_version
141 | self.__create_exp_file(self.version + 1)
142 |
143 | # create a git tag if requested
144 | if self.create_git_tag:
145 | desc = description if description is not None else 'no description'
146 | tag_msg = 'Test tube exp: {} - {}'.format(self.name, desc)
147 | cmd = 'git tag -a tt_{} -m "{}"'.format(self.exp_hash, tag_msg)
148 | os.system(cmd)
149 | print('Test tube created git tag:', 'tt_{}'.format(self.exp_hash))
150 |
151 | # set the tensorboardx log path to the /tf folder in the exp folder
152 | log_dir = self.get_tensorboardx_path(self.name, self.version)
153 | # this is a fix for pytorch 1.1 since it does not have this attribute
154 | for attr, val in [('purge_step', None),
155 | ('max_queue', 10),
156 | ('flush_secs', 120),
157 | ('filename_suffix', '')]:
158 | if not hasattr(self, attr):
159 | setattr(self, attr, val)
160 | super().__init__(log_dir=log_dir, *args, **kwargs)
161 |
162 | # register on exit fx so we always close the writer
163 | # atexit.register(self.on_exit)
164 |
165 | def get_meta_copy(self):
166 | """
167 | Gets a meta-version only copy of this module
168 | :return:
169 | """
170 | return DDPExperiment(self)
171 |
172 | def on_exit(self):
173 | pass
174 |
175 |
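    # in DDP runs, non-zero ranks remove the files they created (identified by their PID) from the save dir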
176 | def __clean_dir(self):
177 | files = os.listdir(self.save_dir)
178 |
179 | if self.rank == 0:
180 | return
181 |
182 | for f in files:
183 | if str(self.process) in f:
184 | os.remove(os.path.join(self.save_dir, f))
185 |
186 | def argparse(self, argparser):
187 | parsed = vars(argparser)
188 | to_add = {}
189 |
190 | # don't store methods
191 | for k, v in parsed.items():
192 | if not callable(v):
193 | to_add[k] = v
194 |
195 | self.tag(to_add)
196 |
197 | def add_meta_from_hyperopt(self, hypo):
198 | """
199 | Transfers meta data about all the params from the
200 | hyperoptimizer to the log
201 | :param hypo:
202 | :return:
203 | """
204 | meta = hypo.get_current_trial_meta()
205 | for tag in meta:
206 | self.tag(tag)
207 |
208 | # --------------------------------
209 | # FILE IO UTILS
210 | # --------------------------------
211 | def __init_cache_file_if_needed(self):
212 | """
213 | Inits a file that we log historical experiments
214 | :return:
215 | """
216 | try:
217 | exp_cache_file = self.get_data_path(self.name, self.version)
218 | if not os.path.isdir(exp_cache_file):
219 | os.makedirs(exp_cache_file, exist_ok=True)
220 | except Exception as e:
221 |             # file already exists (likely written by another exp). In this case, disable the experiment
222 | self.debug = True
223 |
224 | def __create_exp_file(self, version):
225 | """
226 | Recreates the old file with this exp and version
227 | :param version:
228 | :return:
229 | """
230 |
231 | try:
232 | exp_cache_file = self.get_data_path(self.name, self.version)
233 | # if no exp, then make it
234 | path = '{}/meta.experiment'.format(exp_cache_file)
235 | open(path, 'w').close()
236 | self.version = version
237 |
238 | # make the directory for the experiment media assets name
239 | os.makedirs(self.get_media_path(self.name, self.version), exist_ok=True)
240 |
241 | # make the directory for tensorboardx stuff
242 | os.makedirs(self.get_tensorboardx_path(self.name, self.version), exist_ok=True)
243 | except Exception as e:
244 |         # file already exists (likely written by another exp). In this case, disable the experiment
245 | self.debug = True
246 |
247 |
248 | def __get_last_experiment_version(self):
249 | try:
250 | exp_cache_file = os.sep.join(self.get_data_path(self.name, self.version).split(os.sep)[:-1])
251 | return find_last_experiment_version(exp_cache_file)
252 | except Exception as e:
253 | return -1
254 |
255 | def __get_log_name(self):
256 | exp_cache_file = self.get_data_path(self.name, self.version)
257 | return '{}/meta.experiment'.format(exp_cache_file)
258 |
259 | def tag(self, tag_dict):
260 | """
261 | Adds a tag to the experiment.
262 | Tags are metadata for the exp.
263 |
264 | >> e.tag({"model": "Convnet A"})
265 |
266 |         :param tag_dict: dict mapping tag names to values
268 | :return:
269 | """
270 | if self.debug or self.rank > 0: return
271 |
272 | # parse tags
273 | for k, v in tag_dict.items():
274 | self.tags[k] = v
275 |
276 | # save if needed
277 | if self.autosave == True:
278 | self.save()
279 |
280 | def log(self, metrics_dict, global_step=None, walltime=None):
281 | """
282 | Adds a json dict of metrics.
283 |
284 | >> e.log({"loss": 23, "coeff_a": 0.2})
285 |
286 | :param metrics_dict:
287 |         :param global_step: optional global step for tensorboard
288 | :return:
289 | """
290 | if self.debug or self.rank > 0: return
291 |
292 | # handle tfx metrics
293 | if global_step is None:
294 | global_step = len(self.metrics)
295 |
296 | new_metrics_dict = metrics_dict.copy()
297 | for k, v in metrics_dict.items():
298 | if isinstance(v, dict):
299 | self.add_scalars(main_tag=k, tag_scalar_dict=v, global_step=global_step, walltime=walltime)
300 | tmp_metrics_dict = new_metrics_dict.pop(k)
301 | new_metrics_dict.update(tmp_metrics_dict)
302 | else:
303 | self.add_scalar(tag=k, scalar_value=v, global_step=global_step, walltime=walltime)
304 |
305 | metrics_dict = new_metrics_dict
306 |
307 | # timestamp
308 | if 'created_at' not in metrics_dict:
309 | metrics_dict['created_at'] = str(datetime.utcnow())
310 |
311 | self.__convert_numpy_types(metrics_dict)
312 |
313 | self.metrics.append(metrics_dict)
314 |
315 | if self.autosave:
316 | self.save()
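
    # Editorial usage sketch: given an Experiment instance `e`, a flat value is logged as a
    # single scalar, while a nested dict is forwarded to `add_scalars` under its key and then
    # flattened into the csv row. Metric names are illustrative only.
    #   >>> e.log({'train_loss': 0.71})
    #   >>> e.log({'loss': {'train': 0.71, 'val': 0.83}})  # grouped in tensorboard, flattened on disk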
317 |
318 | def __convert_numpy_types(self, metrics_dict):
319 | for k, v in metrics_dict.items():
320 | if v.__class__.__name__ == 'float32':
321 | metrics_dict[k] = float(v)
322 |
323 | if v.__class__.__name__ == 'float64':
324 | metrics_dict[k] = float(v)
325 |
326 | def save(self):
327 | """
328 | Saves current experiment progress
329 | :return:
330 | """
331 | if self.debug or self.rank > 0: return
332 |
333 | # save images and replace the image array with the
334 | # file name
335 | self.__save_images(self.metrics)
336 | metrics_file_path = self.get_data_path(self.name, self.version) + '/metrics.csv'
337 | meta_tags_path = self.get_data_path(self.name, self.version) + '/meta_tags.csv'
338 |
339 | obj = {
340 | 'name': self.name,
341 | 'version': self.version,
342 | 'tags_path': meta_tags_path,
343 | 'metrics_path': metrics_file_path,
344 | 'autosave': self.autosave,
345 | 'description': self.description,
346 | 'created_at': self.created_at,
347 | 'exp_hash': self.exp_hash
348 | }
349 |
350 | # save the experiment meta file
351 | with atomic_write(self.__get_log_name()) as tmp_path:
352 | with open(tmp_path, 'w') as file:
353 | json.dump(obj, file, ensure_ascii=False)
354 |
355 | # save the metatags file
356 | df = pd.DataFrame({'key': list(self.tags.keys()), 'value': list(self.tags.values())})
357 | with atomic_write(meta_tags_path) as tmp_path:
358 | df.to_csv(tmp_path, index=False)
359 |
360 | # save the metrics data
361 | df = pd.DataFrame(self.metrics)
362 | with atomic_write(metrics_file_path) as tmp_path:
363 | df.to_csv(tmp_path, index=False)
364 |
365 | # write new vals to disk
366 | self.flush()
367 |
368 | # until hparam plugin is fixed, generate hparams as text
369 | if not self.tag_markdown_saved and len(self.tags) > 0:
370 | self.tag_markdown_saved = True
371 | self.add_text('hparams', self.__generate_tfx_meta_log())
372 |
373 | def __generate_tfx_meta_log(self):
374 | header = f'''###### {self.name}, version {self.version}\n---\n'''
375 | desc = ''
376 | if self.description is not None:
377 | desc = f'''#####*{self.description}*\n'''
378 | params = f'''##### Hyperparameters\n'''
379 |
380 | row_header = '''parameter|value\n-|-\n'''
381 | rows = [row_header]
382 | for k, v in self.tags.items():
383 | row = f'''{k}|{v}\n'''
384 | rows.append(row)
385 |
386 | all_rows = [
387 | header,
388 | desc,
389 | params
390 | ]
391 | all_rows.extend(rows)
392 | mkdown_log = ''.join(all_rows)
393 | return mkdown_log
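
    # Editorial sketch of the generated markdown for tags {'lr': 0.02} when no description
    # is set (the name and version vary per experiment):
    #   ###### my_exp, version 0
    #   ---
    #   ##### Hyperparameters
    #   parameter|value
    #   -|-
    #   lr|0.02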
394 |
395 | def __save_images(self, metrics):
396 | """
397 |         Saves metric values whose keys carry a png_/jpg_/jpeg_ prefix as images
398 |         and replaces each value with the saved file path
399 | :param metrics:
400 | :return:
401 | """
402 | # iterate all metrics and find keys with a specific prefix
403 | for i, metric in enumerate(metrics):
404 | for k, v in metric.items():
405 | # if the prefix is a png, save the image and replace the value with the path
406 | img_extension = None
407 | img_extension = 'png' if 'png_' in k else img_extension
408 | img_extension = 'jpg' if 'jpg' in k else img_extension
409 | img_extension = 'jpeg' if 'jpeg' in k else img_extension
410 |
411 | if img_extension is not None:
412 | # determine the file name
413 | img_name = '_'.join(k.split('_')[1:])
414 | save_path = self.get_media_path(self.name, self.version)
415 | save_path = '{}/{}_{}.{}'.format(save_path, img_name, i, img_extension)
416 |
417 | # save image to disk
418 | if type(metric[k]) is not str:
419 | imwrite(save_path, metric[k])
420 |
421 | # replace the image in the metric with the file path
422 | metric[k] = save_path
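
    # Editorial usage sketch: any metric key carrying a png_/jpg_/jpeg_ marker is treated as
    # an image array, written under the media directory, and replaced by its file path.
    # `sample_img` is a placeholder numpy image array; <row> is the metric's index.
    #   >>> e.log({'jpg_sample': sample_img})   # saved as media/sample_<row>.jpg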
423 |
424 | def __load(self):
425 | # load .experiment file
426 | with open(self.__get_log_name(), 'r') as file:
427 | data = json.load(file)
428 | self.name = data['name']
429 | self.version = data['version']
430 | self.autosave = data['autosave']
431 | self.created_at = data['created_at']
432 | self.description = data['description']
433 | self.exp_hash = data['exp_hash']
434 |
435 | # load .tags file
436 | meta_tags_path = self.get_data_path(self.name, self.version) + '/meta_tags.csv'
437 | df = pd.read_csv(meta_tags_path)
438 | self.tags_list = df.to_dict(orient='records')
439 | self.tags = {}
440 | for d in self.tags_list:
441 | k, v = d['key'], d['value']
442 | self.tags[k] = v
443 |
444 | # load metrics
445 | metrics_file_path = self.get_data_path(self.name, self.version) + '/metrics.csv'
446 | try:
447 | df = pd.read_csv(metrics_file_path)
448 | self.metrics = df.to_dict(orient='records')
449 |
450 | # remove nans
451 | for metric in self.metrics:
452 | to_delete = []
453 | for k, v in metric.items():
454 | try:
455 | if np.isnan(v):
456 | to_delete.append(k)
457 | except Exception as e:
458 | pass
459 |
460 | for k in to_delete:
461 | del metric[k]
462 | except Exception as e:
463 | # metrics was empty...
464 | self.metrics = []
465 |
466 | def get_data_path(self, exp_name, exp_version):
467 | """
468 |         Returns the path to the experiment's data directory
469 |         (metrics.csv, meta_tags.csv, media/, tf/)
470 | :return:
471 | """
472 | if self.no_save_dir:
473 | return os.path.join(_ROOT, 'test_tube_data', exp_name, 'version_{}'.format(exp_version))
474 | else:
475 | return os.path.join(_ROOT, exp_name, 'version_{}'.format(exp_version))
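
    # Editorial path sketch: with the default root (no explicit save_dir) data lands under
    #   <root>/test_tube_data/<exp_name>/version_<n>/
    # otherwise under
    #   <root>/<exp_name>/version_<n>/
    # which is where save() writes metrics.csv, meta_tags.csv, media/ and tf/.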
476 |
477 | def get_media_path(self, exp_name, exp_version):
478 | """
479 |         Returns the path to the experiment's media directory
480 |         (where image metrics are written)
481 | :return:
482 | """
483 | return os.path.join(self.get_data_path(exp_name, exp_version), 'media')
484 |
485 | def get_tensorboardx_path(self, exp_name, exp_version):
486 | """
487 |         Returns the path to the experiment's tensorboardX event directory
488 |         (the 'tf' subfolder of the data path)
489 | :return:
490 | """
491 | return os.path.join(self.get_data_path(exp_name, exp_version), 'tf')
492 |
493 | def get_tensorboardx_scalars_path(self, exp_name, exp_version):
494 | """
495 |         Returns the path to the experiment's tensorboardX scalars.json file
496 |         (inside the tensorboardX event directory)
497 | :return:
498 | """
499 | tfx_path = self.get_tensorboardx_path(exp_name, exp_version)
500 | return os.path.join(tfx_path, 'scalars.json')
501 |
502 |
503 | # ----------------------------
504 | # OVERWRITES
505 | # ----------------------------
506 | def _get_file_writer(self):
507 | """Returns the default FileWriter instance. Recreates it if closed."""
508 | if self.rank > 0:
509 | return TTDummyFileWriter()
510 |
511 | if self.all_writers is None or self.file_writer is None:
512 | if self.purge_step is not None:
513 | most_recent_step = self.purge_step
514 | self.file_writer = FileWriter(self.log_dir, self.max_queue,
515 | self.flush_secs, self.filename_suffix)
516 | self.file_writer.debug = self.debug
517 | self.file_writer.rank = self.rank
518 |
519 | self.file_writer.add_event(
520 | Event(step=most_recent_step, file_version='brain.Event:2'))
521 | self.file_writer.add_event(
522 | Event(step=most_recent_step, session_log=SessionLog(status=SessionLog.START)))
523 | else:
524 | self.file_writer = FileWriter(self.log_dir, self.max_queue,
525 | self.flush_secs, self.filename_suffix)
526 | self.all_writers = {self.file_writer.get_logdir(): self.file_writer}
527 | return self.file_writer
528 |
529 |
530 | def __str__(self):
531 | return 'Exp: {}, v: {}'.format(self.name, self.version)
532 |
533 | def __hash__(self):
534 |         return hash('Exp: {}, v: {}'.format(self.name, self.version))
535 |
536 | def flush(self):
537 | if self.rank > 0:
538 | return
539 |
540 | if self.all_writers is None:
541 | return # ignore double close
542 |
543 | for writer in self.all_writers.values():
544 | writer.flush()
545 |
546 |
547 | class TTDummyFileWriter(object):
548 |
549 | def add_summary(self, summary, global_step=None, walltime=None):
550 | """
551 |         Overrides the tensorboardX add_summary so calls from non-zero-rank processes are ignored.
552 |         Avoids multiple processes overwriting the same logs
553 | :param summary:
554 | :param global_step:
555 | :param walltime:
556 | :return:
557 | """
558 | return
559 |
560 |
561 | @contextlib.contextmanager
562 | def atomic_write(dst_path):
563 | """A context manager to simplify atomic writing.
564 |
565 | Usage:
566 | >>> with atomic_write(dst_path) as tmp_path:
567 | >>> # write to tmp_path
568 | >>> # Here tmp_path renamed to dst_path, if no exception happened.
569 | """
570 | tmp_path = str(dst_path) + '.tmp'
571 | try:
572 | yield tmp_path
573 | except:
574 | if os.path.exists(tmp_path):
575 | os.remove(tmp_path)
576 | raise
577 | else:
578 | # If everything is fine, move tmp file to the destination.
579 | shutil.move(tmp_path, str(dst_path))
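
# Editorial usage sketch: write to the temporary path yielded by the context manager; the
# move onto the real destination only happens if the block exits without an exception.
#   >>> with atomic_write('meta_tags.csv') as tmp_path:   # illustrative destination
#   ...     with open(tmp_path, 'w') as f:
#   ...         f.write('key,value\n')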
580 |
581 |
582 | def find_last_experiment_version(path):
583 | last_version = -1
584 | for f in os.listdir(path):
585 | if 'version_' in f:
586 | file_parts = f.split('_')
587 | version = int(file_parts[-1])
588 | last_version = max(last_version, version)
589 | return last_version
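
# Editorial usage sketch: given a folder containing version_0 and version_1 subfolders,
# the highest suffix is returned; -1 means no version_ entries were found.
#   >>> find_last_experiment_version('saved_logs/my_exp')   # illustrative path
#   1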
590 |
591 |
592 | if __name__ == '__main__':
593 | from time import sleep
594 | e = Experiment(description='my description')
595 | e.tag({'lr': 0.02, 'layers': 4})
596 |
597 | for n_iter in range(20):
598 | sleep(0.3)
599 | e.log({'loss/xsinx': n_iter * np.sin(n_iter)})
600 | if n_iter % 10 == 0:
601 | print('saved')
602 | e.save()
603 |
604 | e.close()
605 | os._exit(1)
606 |
607 |
--------------------------------------------------------------------------------